mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-13 03:00:37 +00:00
Compare commits
69 Commits
test-tenan
...
refactor-r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9b5792b9bf | ||
|
|
be0dfa9d3a | ||
|
|
292c42731e | ||
|
|
867b35ce55 | ||
|
|
14ff793582 | ||
|
|
5aaa5302eb | ||
|
|
6a53b8fac6 | ||
|
|
57a6e931ea | ||
|
|
0cceb14e48 | ||
|
|
1983c4d4ad | ||
|
|
d7c41cbbee | ||
|
|
29a2465276 | ||
|
|
f49e923d87 | ||
|
|
a0ee306c74 | ||
|
|
c1731bc4f0 | ||
|
|
95bf19b85a | ||
|
|
80d4afab0c | ||
|
|
0807522a64 | ||
|
|
8eebd5f039 | ||
|
|
8c07ef413d | ||
|
|
14df37c108 | ||
|
|
d4d0aa6ed6 | ||
|
|
a457256fef | ||
|
|
3a22e1335d | ||
|
|
93c77b0383 | ||
|
|
7920b39a27 | ||
|
|
23d5e2bdaa | ||
|
|
3526323bc4 | ||
|
|
af9425394f | ||
|
|
debd134b15 | ||
|
|
df42213dbb | ||
|
|
b6237474d2 | ||
|
|
8b710b9753 | ||
|
|
c187de1101 | ||
|
|
8712e1899e | ||
|
|
d7f1e30112 | ||
|
|
6a9d1030a6 | ||
|
|
8c6e607327 | ||
|
|
f436fb2dfb | ||
|
|
8932d14d50 | ||
|
|
efad64bc7f | ||
|
|
10dae79c6d | ||
|
|
e9583db73b | ||
|
|
0b428f7c41 | ||
|
|
8b692e131b | ||
|
|
0a0e55c3d0 | ||
|
|
5bc9f8eae0 | ||
|
|
4c4d3dc87a | ||
|
|
182dc785d6 | ||
|
|
a9cca7a0fd | ||
|
|
6fd64cd5f6 | ||
|
|
56a4466d0a | ||
|
|
41b8e67305 | ||
|
|
81afd7011c | ||
|
|
3468db8a2b | ||
|
|
9f94d098aa | ||
|
|
cb61944982 | ||
|
|
c700c7db2e | ||
|
|
7c7d225d98 | ||
|
|
8ff7bc5df1 | ||
|
|
890ff3803e | ||
|
|
fefe19a284 | ||
|
|
434fcac357 | ||
|
|
894ac30734 | ||
|
|
c0290467fa | ||
|
|
0e7c03370e | ||
|
|
f731e9b3de | ||
|
|
bd7a9e6274 | ||
|
|
42c6ddef8e |
10
.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
vendored
Normal file
10
.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
## Describe your changes
|
||||
|
||||
## Issue ticket number and link
|
||||
|
||||
## Checklist before requesting a review
|
||||
- [ ] I have performed a self-review of my code.
|
||||
- [ ] If it is a core feature, I have added thorough tests.
|
||||
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
|
||||
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
|
||||
|
||||
2
.github/ansible/prod.us-west-2.hosts.yaml
vendored
2
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -25,6 +25,8 @@ storage:
|
||||
ansible_host: i-0d9f6dfae0e1c780d
|
||||
pageserver-1.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-0c834be1dddba8b3f
|
||||
pageserver-2.us-west-2.aws.neon.tech:
|
||||
ansible_host: i-051642d372c0a4f32
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
2
.github/ansible/staging.us-east-2.hosts.yaml
vendored
2
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -27,6 +27,8 @@ storage:
|
||||
ansible_host: i-0c3e70929edb5d691
|
||||
pageserver-1.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0565a8b4008aa3f40
|
||||
pageserver-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-01e31cdf7e970586a
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.eu-west-1.aws.neon.build"
|
||||
sentryEnvironment: "development"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "development"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
sentryEnvironment: "development"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -9,6 +9,7 @@ settings:
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
@@ -23,6 +24,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
|
||||
httpsPort: 443
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
|
||||
@@ -3,6 +3,7 @@ settings:
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.cloud.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
@@ -16,6 +17,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
|
||||
httpsPort: 443
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
41
.github/workflows/build_and_test.yml
vendored
41
.github/workflows/build_and_test.yml
vendored
@@ -111,6 +111,7 @@ jobs:
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
@@ -126,6 +127,11 @@ jobs:
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check
|
||||
|
||||
build-neon:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
@@ -177,13 +183,12 @@ jobs:
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FEATURES="--features testing"
|
||||
CARGO_FLAGS="--locked $CARGO_FEATURES"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FEATURES="--features testing,profiling"
|
||||
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
@@ -789,6 +794,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||
environment:
|
||||
name: prod-old
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -834,7 +841,9 @@ jobs:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2 ]
|
||||
target_region: [ eu-west-1, us-east-2 ]
|
||||
environment:
|
||||
name: dev-${{ matrix.target_region }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -906,6 +915,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
|
||||
environment:
|
||||
name: prod-${{ matrix.target_region }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -945,6 +956,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||
environment:
|
||||
name: prod-old
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
@@ -970,8 +983,8 @@ jobs:
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker:
|
||||
name: deploy storage broker on old staging and old prod
|
||||
@@ -988,6 +1001,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||
environment:
|
||||
name: prod-old
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
@@ -1036,6 +1051,8 @@ jobs:
|
||||
target_cluster: dev-eu-west-1-zeta
|
||||
deploy_link_proxy: false
|
||||
deploy_legacy_scram_proxy: false
|
||||
environment:
|
||||
name: dev-${{ matrix.target_region }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -1051,19 +1068,19 @@ jobs:
|
||||
- name: Re-deploy scram proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
- name: Re-deploy link proxy
|
||||
if: matrix.deploy_link_proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
- name: Re-deploy legacy scram proxy
|
||||
if: matrix.deploy_legacy_scram_proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker-dev-new:
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
@@ -1083,6 +1100,8 @@ jobs:
|
||||
target_cluster: dev-us-east-2-beta
|
||||
- target_region: eu-west-1
|
||||
target_cluster: dev-eu-west-1-zeta
|
||||
environment:
|
||||
name: dev-${{ matrix.target_region }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -1121,6 +1140,8 @@ jobs:
|
||||
target_cluster: prod-eu-central-1-gamma
|
||||
- target_region: ap-southeast-1
|
||||
target_cluster: prod-ap-southeast-1-epsilon
|
||||
environment:
|
||||
name: prod-${{ matrix.target_region }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
@@ -1136,7 +1157,7 @@ jobs:
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
|
||||
|
||||
deploy-storage-broker-prod-new:
|
||||
runs-on: prod
|
||||
@@ -1160,6 +1181,8 @@ jobs:
|
||||
target_cluster: prod-eu-central-1-gamma
|
||||
- target_region: ap-southeast-1
|
||||
target_cluster: prod-ap-southeast-1-epsilon
|
||||
environment:
|
||||
name: prod-${{ matrix.target_region }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
431
Cargo.lock
generated
431
Cargo.lock
generated
@@ -66,12 +66,6 @@ dependencies = [
|
||||
"backtrace",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arrayvec"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
|
||||
|
||||
[[package]]
|
||||
name = "asn1-rs"
|
||||
version = "0.5.1"
|
||||
@@ -633,12 +627,6 @@ version = "3.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
@@ -899,7 +887,7 @@ dependencies = [
|
||||
"clap 4.0.29",
|
||||
"comfy-table",
|
||||
"git-version",
|
||||
"nix 0.25.1",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
@@ -934,15 +922,6 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
|
||||
|
||||
[[package]]
|
||||
name = "cpp_demangle"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.5"
|
||||
@@ -1066,7 +1045,7 @@ dependencies = [
|
||||
"crossterm_winapi",
|
||||
"libc",
|
||||
"mio",
|
||||
"parking_lot 0.12.1",
|
||||
"parking_lot",
|
||||
"signal-hook",
|
||||
"signal-hook-mio",
|
||||
"winapi",
|
||||
@@ -1176,15 +1155,6 @@ version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
|
||||
|
||||
[[package]]
|
||||
name = "debugid"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730"
|
||||
dependencies = [
|
||||
"uuid 0.8.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "debugid"
|
||||
version = "0.8.0"
|
||||
@@ -1192,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"uuid 1.2.2",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1318,18 +1288,6 @@ dependencies = [
|
||||
"windows-sys 0.42.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "findshlibs"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fixedbitset"
|
||||
version = "0.4.2"
|
||||
@@ -1342,21 +1300,6 @@ version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.1.0"
|
||||
@@ -1758,16 +1701,16 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.5.0"
|
||||
name = "hyper-tungstenite"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
||||
checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"hyper",
|
||||
"native-tls",
|
||||
"pin-project",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-tungstenite",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1821,24 +1764,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inferno"
|
||||
version = "0.10.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"atty",
|
||||
"indexmap",
|
||||
"itoa",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"num-format",
|
||||
"quick-xml",
|
||||
"rgb",
|
||||
"str_stack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify"
|
||||
version = "0.9.6"
|
||||
@@ -2065,15 +1990,6 @@ version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "memmap2"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.6.5"
|
||||
@@ -2141,37 +2057,6 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.23.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"memoffset 0.6.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.25.1"
|
||||
@@ -2235,16 +2120,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-format"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
|
||||
dependencies = [
|
||||
"arrayvec",
|
||||
"itoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.45"
|
||||
@@ -2305,51 +2180,12 @@ version = "11.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.44"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-macros"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "os_info"
|
||||
version = "3.5.1"
|
||||
@@ -2400,7 +2236,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"nix 0.25.1",
|
||||
"nix",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -2410,7 +2246,6 @@ dependencies = [
|
||||
"postgres-types",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
"regex",
|
||||
@@ -2424,12 +2259,12 @@ dependencies = [
|
||||
"signal-hook",
|
||||
"storage_broker",
|
||||
"svg_fmt",
|
||||
"tar",
|
||||
"tempfile",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
@@ -2454,17 +2289,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
|
||||
dependencies = [
|
||||
"instant",
|
||||
"lock_api",
|
||||
"parking_lot_core 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
@@ -2472,21 +2296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"instant",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"winapi",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2583,12 +2393,6 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
|
||||
|
||||
[[package]]
|
||||
name = "plotters"
|
||||
version = "0.3.4"
|
||||
@@ -2695,25 +2499,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pprof"
|
||||
version = "0.6.1"
|
||||
source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"cfg-if",
|
||||
"findshlibs",
|
||||
"inferno",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"nix 0.23.2",
|
||||
"parking_lot 0.11.2",
|
||||
"symbolic-demangle",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
@@ -2730,6 +2515,7 @@ dependencies = [
|
||||
"postgres-protocol",
|
||||
"rand",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
@@ -2808,7 +2594,7 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"parking_lot",
|
||||
"procfs",
|
||||
"thiserror",
|
||||
]
|
||||
@@ -2885,15 +2671,17 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper",
|
||||
"hyper-tungstenite",
|
||||
"itertools",
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
"rcgen",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"routerify",
|
||||
"rstest",
|
||||
@@ -2905,6 +2693,7 @@ dependencies = [
|
||||
"sha2",
|
||||
"socket2",
|
||||
"thiserror",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
@@ -2913,20 +2702,12 @@ dependencies = [
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid 1.2.2",
|
||||
"uuid",
|
||||
"webpki-roots",
|
||||
"workspace_hack",
|
||||
"x509-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.21"
|
||||
@@ -3095,12 +2876,10 @@ dependencies = [
|
||||
"http-body",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
@@ -3110,7 +2889,6 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls",
|
||||
"tower-service",
|
||||
"url",
|
||||
@@ -3121,15 +2899,6 @@ dependencies = [
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rgb"
|
||||
version = "0.8.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
@@ -3310,9 +3079,9 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"nix 0.25.1",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"parking_lot",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
@@ -3424,14 +3193,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
|
||||
dependencies = [
|
||||
"httpdate",
|
||||
"native-tls",
|
||||
"reqwest",
|
||||
"rustls",
|
||||
"sentry-backtrace",
|
||||
"sentry-contexts",
|
||||
"sentry-core",
|
||||
"sentry-panic",
|
||||
"tokio",
|
||||
"ureq",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3489,7 +3259,7 @@ version = "0.29.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6"
|
||||
dependencies = [
|
||||
"debugid 0.8.0",
|
||||
"debugid",
|
||||
"getrandom",
|
||||
"hex",
|
||||
"serde",
|
||||
@@ -3497,7 +3267,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"time",
|
||||
"url",
|
||||
"uuid 1.2.2",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3571,6 +3341,17 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha-1"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1"
|
||||
version = "0.10.5"
|
||||
@@ -3719,7 +3500,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"parking_lot",
|
||||
"prost",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
@@ -3730,12 +3511,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "str_stack"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.2"
|
||||
@@ -3783,29 +3558,6 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
|
||||
|
||||
[[package]]
|
||||
name = "symbolic-common"
|
||||
version = "8.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540"
|
||||
dependencies = [
|
||||
"debugid 0.7.3",
|
||||
"memmap2",
|
||||
"stable_deref_trait",
|
||||
"uuid 0.8.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "symbolic-demangle"
|
||||
version = "8.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750"
|
||||
dependencies = [
|
||||
"cpp_demangle",
|
||||
"rustc-demangle",
|
||||
"symbolic-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.105"
|
||||
@@ -3964,10 +3716,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.21.1"
|
||||
name = "tls-listener"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95"
|
||||
checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"hyper",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.24.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"bytes",
|
||||
@@ -3975,12 +3741,11 @@ dependencies = [
|
||||
"memchr",
|
||||
"mio",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
"socket2",
|
||||
"tokio-macros",
|
||||
"winapi",
|
||||
"windows-sys 0.42.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4004,16 +3769,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
@@ -4026,7 +3781,7 @@ dependencies = [
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"log",
|
||||
"parking_lot 0.12.1",
|
||||
"parking_lot",
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
@@ -4073,6 +3828,32 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tar"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"futures-core",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.17.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.7.4"
|
||||
@@ -4299,6 +4080,25 @@ version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.17.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0"
|
||||
dependencies = [
|
||||
"base64 0.13.1",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"http",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand",
|
||||
"sha-1",
|
||||
"thiserror",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.16.0"
|
||||
@@ -4362,9 +4162,11 @@ dependencies = [
|
||||
"base64 0.13.1",
|
||||
"chunked_transfer",
|
||||
"log",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
"rustls",
|
||||
"url",
|
||||
"webpki",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4385,6 +4187,12 @@ version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9"
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utils"
|
||||
version = "0.1.0"
|
||||
@@ -4401,7 +4209,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.25.1",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"rand",
|
||||
@@ -4425,12 +4233,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "1.2.2"
|
||||
@@ -4447,12 +4249,6 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
@@ -4751,7 +4547,6 @@ dependencies = [
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -4775,12 +4570,10 @@ dependencies = [
|
||||
"rand",
|
||||
"regex",
|
||||
"regex-syntax",
|
||||
"reqwest",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"socket2",
|
||||
"stable_deref_trait",
|
||||
"syn",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
|
||||
11
Cargo.toml
11
Cargo.toml
@@ -1,14 +1,3 @@
|
||||
# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the
|
||||
# build work with older cargo versions.
|
||||
#
|
||||
# We have this because as of this writing, the latest cargo Debian package
|
||||
# that's available is 1.56. (Confusingly, the Debian package version number
|
||||
# is 0.57, whereas 'cargo --version' says 1.56.)
|
||||
#
|
||||
# See https://tracker.debian.org/pkg/cargo for the current status of the
|
||||
# package. When that gets updated, we can remove this.
|
||||
cargo-features = ["named-profiles"]
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
"compute_tools",
|
||||
|
||||
@@ -29,7 +29,12 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of the contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -55,7 +60,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -29,7 +29,12 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of the contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -55,7 +60,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
14
README.md
14
README.md
@@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
|
||||
* On Fedora, these packages are needed:
|
||||
```bash
|
||||
dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
@@ -117,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/neon_local init
|
||||
Starting pageserver at '127.0.0.1:64000' in '.neon'.
|
||||
pageserver started, pid: 2545906
|
||||
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
|
||||
Stopped pageserver 1 process with pid 2545906
|
||||
|
||||
# start pageserver and safekeeper
|
||||
# start pageserver, safekeeper, and broker for their intercommunication
|
||||
> ./target/debug/neon_local start
|
||||
Starting neon broker at 127.0.0.1:50051
|
||||
storage_broker started, pid: 2918372
|
||||
@@ -130,6 +128,12 @@ pageserver started, pid: 2918386
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
|
||||
safekeeper 1 started, pid: 2918437
|
||||
|
||||
# create initial tenant and use it as a default for every future neon_local invocation
|
||||
> ./target/debug/neon_local tenant create --set-default
|
||||
tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
|
||||
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/neon_local pg start main
|
||||
Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
|
||||
@@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use log::{error, info};
|
||||
use serde_json;
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeStatus};
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in plain text.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/last_activity") => {
|
||||
info!("serving /last_active GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
|
||||
// Use RFC3339 format for consistency.
|
||||
Response::new(Body::from(state.last_active.to_rfc3339()))
|
||||
}
|
||||
|
||||
// Has compute setup process finished? -> true/false.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/ready") => {
|
||||
info!("serving /ready GET request");
|
||||
let status = compute.get_status();
|
||||
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
|
||||
}
|
||||
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
info!("serving /status GET request");
|
||||
@@ -46,16 +28,6 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
}
|
||||
|
||||
// DEPRECATED, use POST instead
|
||||
(&Method::GET, "/check_writability") => {
|
||||
info!("serving /check_writability GET request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
|
||||
@@ -37,58 +37,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/ready:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Check whether compute startup process finished successfully
|
||||
description: ""
|
||||
operationId: computeIsReady
|
||||
responses:
|
||||
"200":
|
||||
description: Compute is ready ('true') or not ('false')
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "true"
|
||||
|
||||
/last_activity:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get timestamp of the last compute activity
|
||||
description: ""
|
||||
operationId: getLastComputeActivityTS
|
||||
responses:
|
||||
"200":
|
||||
description: Timestamp of the last compute activity
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
|
||||
/check_writability:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritabilityDeprecated
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
post:
|
||||
tags:
|
||||
- "check"
|
||||
|
||||
@@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
|
||||
for b in backs.into_iter() {
|
||||
let state: String = b.get("state");
|
||||
let change: String = b.get("state_change");
|
||||
let state: String = match b.try_get("state") {
|
||||
Ok(state) => state,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
if state == "idle" {
|
||||
let change: String = match b.try_get("state_change") {
|
||||
Ok(state_change) => state_change,
|
||||
Err(_) => continue,
|
||||
};
|
||||
let change = DateTime::parse_from_rfc3339(&change);
|
||||
match change {
|
||||
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::Result;
|
||||
use log::{info, log_enabled, warn, Level};
|
||||
@@ -197,22 +198,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
/// Reassign all dependent objects and delete requested roles.
|
||||
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
let spec = &node.spec;
|
||||
|
||||
// First, reassign all dependent objects to db owners.
|
||||
if let Some(ops) = &spec.delta_operations {
|
||||
if let Some(ops) = &node.spec.delta_operations {
|
||||
// First, reassign all dependent objects to db owners.
|
||||
info!("reassigning dependent objects of to-be-deleted roles");
|
||||
for op in ops {
|
||||
if op.action == "delete_role" {
|
||||
reassign_owned_objects(node, &op.name)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Second, proceed with role deletions.
|
||||
let mut xact = client.transaction()?;
|
||||
if let Some(ops) = &spec.delta_operations {
|
||||
// Second, proceed with role deletions.
|
||||
info!("processing role deletions");
|
||||
let mut xact = client.transaction()?;
|
||||
for op in ops {
|
||||
// We do not check whether the role exists or not,
|
||||
// Postgres will take care of it for us
|
||||
@@ -223,6 +220,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
}
|
||||
xact.commit()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -317,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
|
||||
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
|
||||
|
||||
let start_time = Instant::now();
|
||||
if let Some(r) = pg_db {
|
||||
// XXX: db owner name is returned as quoted string from Postgres,
|
||||
// when quoting is needed.
|
||||
@@ -335,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
info_print!(" -> update");
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
let elapsed = start_time.elapsed().as_millis();
|
||||
info_print!(" ({} ms)", elapsed);
|
||||
}
|
||||
} else {
|
||||
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
|
||||
@@ -342,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
query.push_str(&db.to_pg_options());
|
||||
client.execute(query.as_str(), &[])?;
|
||||
|
||||
let elapsed = start_time.elapsed().as_millis();
|
||||
info_print!(" ({} ms)", elapsed);
|
||||
}
|
||||
|
||||
info_print!("\n");
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
|
||||
@@ -136,22 +136,6 @@ where
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
/// Send SIGTERM to child process
|
||||
pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
|
||||
let pid = child.id();
|
||||
match kill(
|
||||
nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
|
||||
Signal::SIGTERM,
|
||||
) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!("child process with pid {pid} does not exist");
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Also returns Ok if the process is already not running.
|
||||
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
|
||||
let pid = match pid_file::read(pid_file)
|
||||
|
||||
@@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
|
||||
} else if let Some(default_id) = env.default_tenant_id {
|
||||
Ok(default_id)
|
||||
} else {
|
||||
bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
|
||||
anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
|
||||
}
|
||||
|
||||
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
|
||||
|
||||
// Create config file
|
||||
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
|
||||
// load and parse the file
|
||||
@@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
||||
env.init(pg_version)
|
||||
.context("Failed to initialize neon repository")?;
|
||||
let initial_tenant_id = env
|
||||
.default_tenant_id
|
||||
.expect("default_tenant_id should be generated by the `env.init()` call above");
|
||||
|
||||
// Initialize pageserver, create initial tenant and timeline.
|
||||
let pageserver = PageServerNode::from_env(&env);
|
||||
let initial_timeline_id = pageserver
|
||||
.initialize(
|
||||
Some(initial_tenant_id),
|
||||
initial_timeline_id_arg,
|
||||
&pageserver_config_overrides(init_match),
|
||||
pg_version,
|
||||
)
|
||||
pageserver
|
||||
.initialize(&pageserver_config_overrides(init_match))
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("pageserver init failed: {e:?}");
|
||||
exit(1);
|
||||
});
|
||||
|
||||
env.register_branch_mapping(
|
||||
DEFAULT_BRANCH_NAME.to_owned(),
|
||||
initial_tenant_id,
|
||||
initial_timeline_id,
|
||||
)?;
|
||||
|
||||
Ok(env)
|
||||
}
|
||||
|
||||
@@ -388,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
println!(
|
||||
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
|
||||
);
|
||||
|
||||
if create_match.get_flag("set-default") {
|
||||
println!("Setting tenant {new_tenant_id} as a default one");
|
||||
env.default_tenant_id = Some(new_tenant_id);
|
||||
}
|
||||
}
|
||||
Some(("set-default", set_default_match)) => {
|
||||
let tenant_id =
|
||||
parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
|
||||
println!("Setting tenant {tenant_id} as a default one");
|
||||
env.default_tenant_id = Some(tenant_id);
|
||||
}
|
||||
Some(("config", create_match)) => {
|
||||
let tenant_id = get_tenant_id(create_match, env)?;
|
||||
@@ -928,9 +923,8 @@ fn cli() -> Command {
|
||||
.version(GIT_VERSION)
|
||||
.subcommand(
|
||||
Command::new("init")
|
||||
.about("Initialize a new Neon repository")
|
||||
.about("Initialize a new Neon repository, preparing configs for services to start with")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(
|
||||
Arg::new("config")
|
||||
.long("config")
|
||||
@@ -992,11 +986,14 @@ fn cli() -> Command {
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
|
||||
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
|
||||
)
|
||||
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
|
||||
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
|
||||
.subcommand(Command::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
|
||||
)
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
|
||||
@@ -201,7 +201,7 @@ impl PostgresNode {
|
||||
.stderr(Stdio::piped());
|
||||
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
cmd.env("NEON_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let sync_handle = cmd
|
||||
@@ -304,17 +304,17 @@ impl PostgresNode {
|
||||
|
||||
// Set up authentication
|
||||
//
|
||||
// $ZENITH_AUTH_TOKEN will be replaced with value from environment
|
||||
// $NEON_AUTH_TOKEN will be replaced with value from environment
|
||||
// variable during compute pg startup. It is done this way because
|
||||
// otherwise user will be able to retrieve the value using SHOW
|
||||
// command or pg_settings
|
||||
let password = if let AuthType::NeonJWT = auth_type {
|
||||
"$ZENITH_AUTH_TOKEN"
|
||||
"$NEON_AUTH_TOKEN"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
// NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
// Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
|
||||
// Also note that not all parameters are supported here: because in compute we substitute $NEON_AUTH_TOKEN,
|
||||
// we parse this string and build it back with the token from the env var, and for simplicity the rebuild
|
||||
// uses only the needed variables, namely host, port, user, password.
|
||||
format!("postgresql://no_user:{password}@{host}:{port}")
|
||||
@@ -323,7 +323,7 @@ impl PostgresNode {
|
||||
conf.append_line("");
|
||||
conf.append("neon.pageserver_connstring", &pageserver_connstr);
|
||||
if let AuthType::NeonJWT = auth_type {
|
||||
conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN");
|
||||
conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
|
||||
}
|
||||
conf.append("neon.tenant_id", &self.tenant_id.to_string());
|
||||
conf.append("neon.timeline_id", &self.timeline_id.to_string());
|
||||
@@ -448,7 +448,7 @@ impl PostgresNode {
|
||||
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
|
||||
);
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
cmd.env("NEON_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let pg_ctl = cmd.output().context("pg_ctl failed")?;
|
||||
|
||||
@@ -296,11 +296,6 @@ impl LocalEnv {
|
||||
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenant_id.is_none() {
|
||||
env.default_tenant_id = Some(TenantId::generate());
|
||||
}
|
||||
|
||||
env.base_data_dir = base_path();
|
||||
|
||||
Ok(env)
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::path::PathBuf;
|
||||
use std::process::{Child, Command};
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{bail, Context};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
@@ -130,83 +130,15 @@ impl PageServerNode {
|
||||
overrides
|
||||
}
|
||||
|
||||
/// Initializes a pageserver node by creating its config with the overrides provided,
|
||||
/// and creating an initial tenant and timeline afterwards.
|
||||
pub fn initialize(
|
||||
&self,
|
||||
create_tenant: Option<TenantId>,
|
||||
initial_timeline_id: Option<TimelineId>,
|
||||
config_overrides: &[&str],
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
/// Initializes a pageserver node by creating its config with the overrides provided.
|
||||
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
|
||||
self.pageserver_init(config_overrides).with_context(|| {
|
||||
format!(
|
||||
"Failed to run init for pageserver node {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
|
||||
// Then, briefly start it fully to run HTTP commands on it,
|
||||
// to create initial tenant and timeline.
|
||||
// We disable the remote storage, since we stop pageserver right after the timeline creation,
|
||||
// hence most of the uploads will either be aborted or not started: there is no point in starting them at all.
|
||||
let disabled_remote_storage_override = "remote_storage={}";
|
||||
let mut pageserver_process = self
|
||||
.start_node(
|
||||
&[disabled_remote_storage_override],
|
||||
// Previous overrides will be taken from the config created before, don't overwrite them.
|
||||
false,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to start a process for pageserver node {}",
|
||||
self.env.pageserver.id,
|
||||
)
|
||||
})?;
|
||||
|
||||
let init_result = self
|
||||
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
|
||||
.context("Failed to create initial tenant and timeline for pageserver");
|
||||
match &init_result {
|
||||
Ok(initial_timeline_id) => {
|
||||
println!("Successfully initialized timeline {initial_timeline_id}")
|
||||
}
|
||||
Err(e) => eprintln!("{e:#}"),
|
||||
}
|
||||
background_process::send_stop_child_process(&pageserver_process)?;
|
||||
|
||||
let exit_code = pageserver_process.wait()?;
|
||||
ensure!(
|
||||
exit_code.success(),
|
||||
format!(
|
||||
"pageserver init failed with exit code {:?}",
|
||||
exit_code.code()
|
||||
)
|
||||
);
|
||||
println!(
|
||||
"Stopped pageserver {} process with pid {}",
|
||||
self.env.pageserver.id,
|
||||
pageserver_process.id(),
|
||||
);
|
||||
init_result
|
||||
}
|
||||
|
||||
fn try_init_timeline(
|
||||
&self,
|
||||
new_tenant_id: Option<TenantId>,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
|
||||
let initial_timeline_info = self.timeline_create(
|
||||
initial_tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
)?;
|
||||
Ok(initial_timeline_info.timeline_id)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
@@ -320,7 +252,7 @@ impl PageServerNode {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
|
||||
vec![("NEON_AUTH_TOKEN".to_owned(), token)]
|
||||
} else {
|
||||
Vec::new()
|
||||
})
|
||||
|
||||
90
deny.toml
Normal file
90
deny.toml
Normal file
@@ -0,0 +1,90 @@
|
||||
# This file was auto-generated using `cargo deny init`.
|
||||
# cargo-deny is a cargo plugin that lets you lint your project's
|
||||
# dependency graph to ensure all your dependencies conform
|
||||
# to your expectations and requirements.
|
||||
|
||||
# Root options
|
||||
targets = []
|
||||
all-features = false
|
||||
no-default-features = false
|
||||
feature-depth = 1
|
||||
|
||||
# This section is considered when running `cargo deny check advisories`
|
||||
# More documentation for the advisories section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
|
||||
[advisories]
|
||||
db-urls = ["https://github.com/rustsec/advisory-db"]
|
||||
vulnerability = "deny"
|
||||
unmaintained = "warn"
|
||||
yanked = "warn"
|
||||
notice = "warn"
|
||||
ignore = []
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
|
||||
[licenses]
|
||||
unlicensed = "deny"
|
||||
allow = [
|
||||
"Apache-2.0",
|
||||
"Artistic-2.0",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"ISC",
|
||||
"MIT",
|
||||
"MPL-2.0",
|
||||
"OpenSSL",
|
||||
"Unicode-DFS-2016",
|
||||
]
|
||||
deny = []
|
||||
copyleft = "warn"
|
||||
allow-osi-fsf-free = "neither"
|
||||
default = "deny"
|
||||
confidence-threshold = 0.8
|
||||
exceptions = [
|
||||
# Zlib license has some restrictions if we decide to change something
|
||||
{ allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
|
||||
{ allow = ["Zlib"], name = "const_format", version = "*" },
|
||||
]
|
||||
|
||||
[[licenses.clarify]]
|
||||
name = "ring"
|
||||
version = "*"
|
||||
expression = "MIT AND ISC AND OpenSSL"
|
||||
license-files = [
|
||||
{ path = "LICENSE", hash = 0xbd0eed23 },
|
||||
]
|
||||
|
||||
[licenses.private]
|
||||
ignore = true
|
||||
registries = []
|
||||
|
||||
# This section is considered when running `cargo deny check bans`.
|
||||
# More documentation about the 'bans' section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
|
||||
[bans]
|
||||
multiple-versions = "warn"
|
||||
wildcards = "allow"
|
||||
highlight = "all"
|
||||
workspace-default-features = "allow"
|
||||
external-default-features = "allow"
|
||||
allow = []
|
||||
deny = []
|
||||
skip = []
|
||||
skip-tree = []
|
||||
|
||||
# This section is considered when running `cargo deny check sources`.
|
||||
# More documentation about the 'sources' section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
|
||||
[sources]
|
||||
unknown-registry = "warn"
|
||||
unknown-git = "warn"
|
||||
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
|
||||
allow-git = []
|
||||
|
||||
[sources.allow-org]
|
||||
github = [
|
||||
"neondatabase",
|
||||
]
|
||||
gitlab = []
|
||||
bitbucket = []
|
||||
@@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL.
|
||||
|
||||
#### Outgoing connections
|
||||
Compute connects to Pageserver for getting pages.
|
||||
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`.
|
||||
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
|
||||
The environment variable inside the connection string is substituted with
|
||||
the JWT token.
|
||||
|
||||
@@ -77,7 +77,7 @@ If the GUC is unset, no token is passed.
|
||||
|
||||
Note that both tokens can be (and typically are) the same;
|
||||
the scope is the tenant and the token is usually passed through the
|
||||
`$ZENITH_AUTH_TOKEN` environment variable.
|
||||
`$NEON_AUTH_TOKEN` environment variable.
|
||||
|
||||
### Pageserver
|
||||
#### Overview
|
||||
@@ -114,7 +114,7 @@ either of three values:
|
||||
Pageserver makes a connection to a Safekeeper for each active timeline.
|
||||
As Pageserver may want to access any timeline it has on the disk,
|
||||
it is given a blanket JWT token to access any data on any Safekeeper.
|
||||
This token is passed through an environment variable called `ZENITH_AUTH_TOKEN`
|
||||
This token is passed through an environment variable called `NEON_AUTH_TOKEN`
|
||||
(non-configurable as of writing this text).
|
||||
|
||||
A better way _may be_ to store JWT token for each timeline next to it,
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "metrics"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "pageserver_api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "postgres_connection"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
name = "wal_craft"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
license = "Apache-2.0"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
@@ -12,5 +13,6 @@ rand = "0.8.3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tracing = "0.1"
|
||||
thiserror = "1.0"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
// Tools for calling certain async methods in sync contexts.
|
||||
pub mod sync;
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -194,6 +194,35 @@ macro_rules! retry_read {
|
||||
};
|
||||
}
|
||||
|
||||
/// An error that occurred while the connection was open.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ConnectionError {
|
||||
/// IO error during writing to or reading from the connection socket.
|
||||
#[error("Socket IO error: {0}")]
|
||||
Socket(std::io::Error),
|
||||
/// Invalid packet was received from client
|
||||
#[error("Protocol error: {0}")]
|
||||
Protocol(String),
|
||||
/// Failed to parse a protocol message
|
||||
#[error("Message parse error: {0}")]
|
||||
MessageParse(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for ConnectionError {
|
||||
fn from(e: anyhow::Error) -> Self {
|
||||
Self::MessageParse(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnectionError {
|
||||
pub fn into_io_error(self) -> io::Error {
|
||||
match self {
|
||||
ConnectionError::Socket(io) => io,
|
||||
other => io::Error::new(io::ErrorKind::Other, other.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
/// Read one message from the stream.
|
||||
/// This function returns `Ok(None)` in case of EOF.
|
||||
@@ -216,7 +245,9 @@ impl FeMessage {
|
||||
/// }
|
||||
/// ```
|
||||
#[inline(never)]
|
||||
pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
|
||||
pub fn read(
|
||||
stream: &mut (impl io::Read + Unpin),
|
||||
) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
Self::read_fut(&mut AsyncishRead(stream)).wait()
|
||||
}
|
||||
|
||||
@@ -224,7 +255,7 @@ impl FeMessage {
|
||||
/// See documentation for `Self::read`.
|
||||
pub fn read_fut<Reader>(
|
||||
stream: &mut Reader,
|
||||
) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
|
||||
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
|
||||
where
|
||||
Reader: tokio::io::AsyncRead + Unpin,
|
||||
{
|
||||
@@ -238,17 +269,21 @@ impl FeMessage {
|
||||
let tag = match retry_read!(stream.read_u8().await) {
|
||||
Ok(b) => b,
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
Err(e) => return Err(ConnectionError::Socket(e)),
|
||||
};
|
||||
|
||||
// The message length includes itself, so it better be at least 4.
|
||||
let len = retry_read!(stream.read_u32().await)?
|
||||
let len = retry_read!(stream.read_u32().await)
|
||||
.map_err(ConnectionError::Socket)?
|
||||
.checked_sub(4)
|
||||
.context("invalid message length")?;
|
||||
.ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
|
||||
|
||||
let body = {
|
||||
let mut buffer = vec![0u8; len as usize];
|
||||
stream.read_exact(&mut buffer).await?;
|
||||
stream
|
||||
.read_exact(&mut buffer)
|
||||
.await
|
||||
.map_err(ConnectionError::Socket)?;
|
||||
Bytes::from(buffer)
|
||||
};
|
||||
|
||||
@@ -265,7 +300,11 @@ impl FeMessage {
|
||||
b'c' => Ok(Some(FeMessage::CopyDone)),
|
||||
b'f' => Ok(Some(FeMessage::CopyFail)),
|
||||
b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
|
||||
tag => bail!("unknown message tag: {},'{:?}'", tag, body),
|
||||
tag => {
|
||||
return Err(ConnectionError::Protocol(format!(
|
||||
"unknown message tag: {tag},'{body:?}'"
|
||||
)))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -275,7 +314,9 @@ impl FeStartupPacket {
|
||||
/// Read startup message from the stream.
|
||||
// XXX: It's tempting yet undesirable to accept `stream` by value,
|
||||
// since such a change will cause user-supplied &mut references to be consumed
|
||||
pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
|
||||
pub fn read(
|
||||
stream: &mut (impl io::Read + Unpin),
|
||||
) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
Self::read_fut(&mut AsyncishRead(stream)).wait()
|
||||
}
|
||||
|
||||
@@ -284,7 +325,7 @@ impl FeStartupPacket {
|
||||
// since such a change will cause user-supplied &mut references to be consumed
|
||||
pub fn read_fut<Reader>(
|
||||
stream: &mut Reader,
|
||||
) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
|
||||
) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
|
||||
where
|
||||
Reader: tokio::io::AsyncRead + Unpin,
|
||||
{
|
||||
@@ -302,31 +343,41 @@ impl FeStartupPacket {
|
||||
let len = match retry_read!(stream.read_u32().await) {
|
||||
Ok(len) => len as usize,
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
Err(e) => return Err(ConnectionError::Socket(e)),
|
||||
};
|
||||
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
bail!("invalid message length");
|
||||
return Err(ConnectionError::Protocol(format!(
|
||||
"invalid message length {len}"
|
||||
)));
|
||||
}
|
||||
|
||||
let request_code = retry_read!(stream.read_u32().await)?;
|
||||
let request_code =
|
||||
retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
|
||||
|
||||
// the rest of startup packet are params
|
||||
let params_len = len - 8;
|
||||
let mut params_bytes = vec![0u8; params_len];
|
||||
stream.read_exact(params_bytes.as_mut()).await?;
|
||||
stream
|
||||
.read_exact(params_bytes.as_mut())
|
||||
.await
|
||||
.map_err(ConnectionError::Socket)?;
|
||||
|
||||
// Parse params depending on request code
|
||||
let req_hi = request_code >> 16;
|
||||
let req_lo = request_code & ((1 << 16) - 1);
|
||||
let message = match (req_hi, req_lo) {
|
||||
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
|
||||
ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
|
||||
if params_len != 8 {
|
||||
return Err(ConnectionError::Protocol(
|
||||
"expected 8 bytes for CancelRequest params".to_string(),
|
||||
));
|
||||
}
|
||||
let mut cursor = Cursor::new(params_bytes);
|
||||
FeStartupPacket::CancelRequest(CancelKeyData {
|
||||
backend_pid: cursor.read_i32().await?,
|
||||
cancel_key: cursor.read_i32().await?,
|
||||
backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
|
||||
cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
|
||||
})
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
|
||||
@@ -338,7 +389,9 @@ impl FeStartupPacket {
|
||||
FeStartupPacket::GssEncRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
|
||||
bail!("Unrecognized request code {}", unrecognized_code)
|
||||
return Err(ConnectionError::Protocol(format!(
|
||||
"Unrecognized request code {unrecognized_code}"
|
||||
)));
|
||||
}
|
||||
// TODO bail if protocol major_version is not 3?
|
||||
(major_version, minor_version) => {
|
||||
@@ -346,15 +399,21 @@ impl FeStartupPacket {
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(¶ms_bytes)
|
||||
.context("StartupMessage params: invalid utf-8")?
|
||||
.strip_suffix('\0') // drop packet's own null terminator
|
||||
.context("StartupMessage params: missing null terminator")?
|
||||
.strip_suffix('\0') // drop packet's own null
|
||||
.ok_or_else(|| {
|
||||
ConnectionError::Protocol(
|
||||
"StartupMessage params: missing null terminator".to_string(),
|
||||
)
|
||||
})?
|
||||
.split_terminator('\0');
|
||||
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens
|
||||
.next()
|
||||
.context("StartupMessage params: key without value")?;
|
||||
let value = tokens.next().ok_or_else(|| {
|
||||
ConnectionError::Protocol(
|
||||
"StartupMessage params: key without value".to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
@@ -458,7 +517,7 @@ pub enum BeMessage<'a> {
|
||||
CloseComplete,
|
||||
// None means column is NULL
|
||||
DataRow(&'a [Option<&'a [u8]>]),
|
||||
ErrorResponse(&'a str),
|
||||
ErrorResponse(&'a str, Option<&'a [u8; 5]>),
|
||||
/// Single byte - used in response to SSLRequest/GSSENCRequest.
|
||||
EncryptionResponse(bool),
|
||||
NoData,
|
||||
@@ -606,7 +665,7 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
|
||||
}
|
||||
|
||||
/// Safe write of s into buf as cstring (String in the protocol).
|
||||
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> {
|
||||
fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
|
||||
let bytes = s.as_ref();
|
||||
if bytes.contains(&0) {
|
||||
return Err(io::Error::new(
|
||||
@@ -626,6 +685,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
|
||||
|
||||
impl<'a> BeMessage<'a> {
|
||||
/// Write message to the given buf.
|
||||
// Unlike the reading side, we use BytesMut
|
||||
@@ -765,10 +826,7 @@ impl<'a> BeMessage<'a> {
|
||||
// First byte of each field represents type of this field. Set just enough fields
|
||||
// to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
|
||||
// message text.
|
||||
BeMessage::ErrorResponse(error_msg) => {
|
||||
// For all the errors set Severity to Error and error code to
|
||||
// 'internal error'.
|
||||
|
||||
BeMessage::ErrorResponse(error_msg, pg_error_code) => {
|
||||
// 'E' signalizes ErrorResponse messages
|
||||
buf.put_u8(b'E');
|
||||
write_body(buf, |buf| {
|
||||
@@ -776,7 +834,9 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_slice(b"ERROR\0");
|
||||
|
||||
buf.put_u8(b'C'); // SQLSTATE error code
|
||||
buf.put_slice(b"CXX000\0");
|
||||
buf.put_slice(&terminate_code(
|
||||
pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR),
|
||||
));
|
||||
|
||||
buf.put_u8(b'M'); // the message
|
||||
write_cstr(error_msg, buf)?;
|
||||
@@ -799,7 +859,7 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_slice(b"NOTICE\0");
|
||||
|
||||
buf.put_u8(b'C'); // SQLSTATE error code
|
||||
buf.put_slice(b"CXX000\0");
|
||||
buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR));
|
||||
|
||||
buf.put_u8(b'M'); // the message
|
||||
write_cstr(error_msg.as_bytes(), buf)?;
|
||||
@@ -1087,3 +1147,12 @@ mod tests {
|
||||
let _ = FeStartupPacket::read_fut(stream).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `code` followed by a single NUL byte.
///
/// The five bytes of `code` are copied into the front of a zero-initialised
/// six-byte array, so the last byte stays `0` and the result can be written
/// out as a NUL-terminated string.
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
    let mut terminated = [0; 6];
    // Bulk copy instead of an element-by-element loop; the lengths match by construction.
    terminated[..code.len()].copy_from_slice(code);
    terminated
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
|
||||
@@ -3,6 +3,7 @@ name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
publish = false
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
name = "utils"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
sentry = "0.29.0"
|
||||
sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
async-trait = "0.1"
|
||||
anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
|
||||
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
@@ -22,25 +22,32 @@ pub trait Handler {
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care).
|
||||
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError>;
|
||||
|
||||
/// Called when the startup packet is received; allows processing its params.
|
||||
///
|
||||
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
|
||||
/// creation in the proxy code. That is quite a hacky and ad-hoc solution; maybe we could allow
|
||||
/// to override whole init logic in implementations.
|
||||
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check auth md5
|
||||
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
|
||||
bail!("MD5 auth failed")
|
||||
}
|
||||
|
||||
/// Check auth jwt
|
||||
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
|
||||
bail!("JWT auth failed")
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_jwt_response: &[u8],
|
||||
) -> Result<(), QueryError> {
|
||||
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
|
||||
}
|
||||
|
||||
fn is_shutdown_requested(&self) -> bool {
|
||||
@@ -61,7 +68,6 @@ pub enum ProtoState {
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum AuthType {
|
||||
Trust,
|
||||
MD5,
|
||||
// This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
|
||||
NeonJWT,
|
||||
}
|
||||
@@ -72,9 +78,8 @@ impl FromStr for AuthType {
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"Trust" => Ok(Self::Trust),
|
||||
"MD5" => Ok(Self::MD5),
|
||||
"NeonJWT" => Ok(Self::NeonJWT),
|
||||
_ => bail!("invalid value \"{s}\" for auth type"),
|
||||
_ => anyhow::bail!("invalid value \"{s}\" for auth type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -83,7 +88,6 @@ impl fmt::Display for AuthType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(match self {
|
||||
AuthType::Trust => "Trust",
|
||||
AuthType::MD5 => "MD5",
|
||||
AuthType::NeonJWT => "NeonJWT",
|
||||
})
|
||||
}
|
||||
@@ -134,7 +138,6 @@ pub struct PostgresBackend {
|
||||
|
||||
pub state: ProtoState,
|
||||
|
||||
md5_salt: [u8; 4],
|
||||
auth_type: AuthType,
|
||||
|
||||
peer_addr: SocketAddr,
|
||||
@@ -164,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
|
||||
}
|
||||
|
||||
// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
|
||||
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
@@ -187,7 +190,6 @@ impl PostgresBackend {
|
||||
stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
state: ProtoState::Initialization,
|
||||
md5_salt: [0u8; 4],
|
||||
auth_type,
|
||||
tls_config,
|
||||
peer_addr,
|
||||
@@ -199,10 +201,10 @@ impl PostgresBackend {
|
||||
}
|
||||
|
||||
/// Get direct reference (into the Option) to the read stream.
|
||||
fn get_stream_in(&mut self) -> Result<&mut BidiStream> {
|
||||
fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
|
||||
match &mut self.stream {
|
||||
Some(Stream::Bidirectional(stream)) => Ok(stream),
|
||||
_ => bail!("reader taken"),
|
||||
_ => anyhow::bail!("reader taken"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,7 +228,7 @@ impl PostgresBackend {
|
||||
}
|
||||
|
||||
/// Read full message or return None if connection is closed.
|
||||
pub fn read_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
|
||||
let (state, stream) = (self.state, self.get_stream_in()?);
|
||||
|
||||
use ProtoState::*;
|
||||
@@ -234,6 +236,7 @@ impl PostgresBackend {
|
||||
Initialization | Encrypted => FeStartupPacket::read(stream),
|
||||
Authentication | Established => FeMessage::read(stream),
|
||||
}
|
||||
.map_err(QueryError::from)
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer.
|
||||
@@ -257,7 +260,7 @@ impl PostgresBackend {
|
||||
}
|
||||
|
||||
// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub fn run(mut self, handler: &mut impl Handler) -> Result<()> {
|
||||
pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
|
||||
let ret = self.run_message_loop(handler);
|
||||
if let Some(stream) = self.stream.as_mut() {
|
||||
let _ = stream.shutdown(Shutdown::Both);
|
||||
@@ -265,7 +268,7 @@ impl PostgresBackend {
|
||||
ret
|
||||
}
|
||||
|
||||
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> {
|
||||
fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
|
||||
trace!("postgres backend to {:?} started", self.peer_addr);
|
||||
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
@@ -274,7 +277,7 @@ impl PostgresBackend {
|
||||
match self.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(msg) = message {
|
||||
trace!("got message {:?}", msg);
|
||||
trace!("got message {msg:?}");
|
||||
|
||||
match self.process_message(handler, msg, &mut unnamed_query_string)? {
|
||||
ProcessMsgResult::Continue => continue,
|
||||
@@ -285,10 +288,12 @@ impl PostgresBackend {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// If it is a timeout error, continue the loop
|
||||
if !is_socket_read_timed_out(&e) {
|
||||
return Err(e);
|
||||
if let QueryError::Other(e) = &e {
|
||||
if is_socket_read_timed_out(e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -306,7 +311,7 @@ impl PostgresBackend {
|
||||
}
|
||||
stream => {
|
||||
self.stream = stream;
|
||||
bail!("can't start TLs without bidi stream");
|
||||
anyhow::bail!("can't start TLs without bidi stream");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -316,17 +321,16 @@ impl PostgresBackend {
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
unnamed_query_string: &mut Bytes,
|
||||
) -> Result<ProcessMsgResult> {
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
|
||||
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
|
||||
if self.state < ProtoState::Established {
|
||||
ensure!(
|
||||
matches!(
|
||||
msg,
|
||||
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
|
||||
),
|
||||
"protocol violation"
|
||||
);
|
||||
if self.state < ProtoState::Established
|
||||
&& !matches!(
|
||||
msg,
|
||||
FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
|
||||
)
|
||||
{
|
||||
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
|
||||
}
|
||||
|
||||
let have_tls = self.tls_config.is_some();
|
||||
@@ -350,8 +354,13 @@ impl PostgresBackend {
|
||||
}
|
||||
FeStartupPacket::StartupMessage { .. } => {
|
||||
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
|
||||
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
|
||||
bail!("client did not connect with TLS");
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS",
|
||||
None,
|
||||
))?;
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"client did not connect with TLS"
|
||||
)));
|
||||
}
|
||||
|
||||
// NB: startup() may change self.auth_type -- we are using that in proxy code
|
||||
@@ -367,13 +376,6 @@ impl PostgresBackend {
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
AuthType::MD5 => {
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
self.write_message(&BeMessage::AuthenticationMD5Password(
|
||||
self.md5_salt,
|
||||
))?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
|
||||
self.state = ProtoState::Authentication;
|
||||
@@ -393,20 +395,15 @@ impl PostgresBackend {
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => unreachable!(),
|
||||
AuthType::MD5 => {
|
||||
let (_, md5_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_md5(self, md5_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
bail!("auth failed: {}", e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -420,33 +417,14 @@ impl PostgresBackend {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string) {
|
||||
// ":?" uses the alternate formatting style, which makes anyhow display the
|
||||
// full cause of the error, not just the top-level context + its trace.
|
||||
// We don't want to send that in the ErrorResponse though,
|
||||
// because it's not relevant to the compute node logs.
|
||||
//
|
||||
// We also don't want to log full stacktrace when the error is primitive,
|
||||
// such as usual connection closed.
|
||||
let short_error = format!("{:#}", e);
|
||||
let root_cause = e.root_cause().to_string();
|
||||
if root_cause.contains("connection closed unexpectedly")
|
||||
|| root_cause.contains("Broken pipe (os error 32)")
|
||||
{
|
||||
error!(
|
||||
"query handler for '{}' failed: {}",
|
||||
query_string, short_error
|
||||
);
|
||||
} else {
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
}
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
|
||||
// TODO: untangle convoluted control flow
|
||||
if e.to_string().contains("failed to run") {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
@@ -471,11 +449,13 @@ impl PostgresBackend {
|
||||
|
||||
FeMessage::Execute(_) => {
|
||||
let query_string = cstr_to_str(unnamed_query_string)?;
|
||||
trace!("got execute {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
trace!("got execute {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string) {
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
log_query_error(query_string, &e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
// NOTE there is no ReadyForQuery message. This handler is used
|
||||
// for basebackup and it uses CopyOut which doesn't require
|
||||
@@ -494,7 +474,9 @@ impl PostgresBackend {
|
||||
// We prefer explicit pattern matching to wildcards, because
|
||||
// this helps us spot the places where new variants are missing
|
||||
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
|
||||
bail!("unexpected message type: {:?}", msg);
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message type: {msg:?}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,45 +4,87 @@
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
|
||||
use crate::postgres_backend::AuthType;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
|
||||
use rand::Rng;
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
|
||||
use std::future::Future;
|
||||
use std::io;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use tracing::{debug, error, trace};
|
||||
use tracing::{debug, error, info, trace};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
|
||||
/// Returns `true` if `e` is the kind of I/O error that is routinely produced by the peer
/// going away (or never being there), rather than by a problem on our side.
///
/// `log_query_error` uses this to log such failures at `info` level instead of `error`.
///
/// `BrokenPipe` is included because writing to a socket that the client has already closed
/// reports exactly that; it is as ordinary a disconnect as a connection reset.
pub fn is_expected_io_error(e: &io::Error) -> bool {
    matches!(
        e.kind(),
        io::ErrorKind::BrokenPipe
            | io::ErrorKind::ConnectionRefused
            | io::ErrorKind::ConnectionAborted
            | io::ErrorKind::ConnectionReset
    )
}
|
||||
|
||||
/// An error, occurred during query processing:
|
||||
/// either during the connection ([`ConnectionError`]) or before/after it.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum QueryError {
|
||||
/// The connection was lost while processing the query.
|
||||
#[error(transparent)]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<io::Error> for QueryError {
|
||||
fn from(e: io::Error) -> Self {
|
||||
Self::Disconnected(ConnectionError::Socket(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Handler {
|
||||
/// Handle single query.
|
||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||
/// might be not what we want after CopyData streaming, but currently we don't
|
||||
/// care).
|
||||
async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError>;
|
||||
|
||||
/// Called when the startup packet is received; allows processing its params.
|
||||
///
|
||||
/// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
|
||||
/// creation in the proxy code. That is quite a hacky and ad-hoc solution; maybe we could allow
|
||||
/// to override whole init logic in implementations.
|
||||
fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check auth md5
|
||||
fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
|
||||
bail!("MD5 auth failed")
|
||||
}
|
||||
|
||||
/// Check auth jwt
|
||||
fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
|
||||
bail!("JWT auth failed")
|
||||
fn check_auth_jwt(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_jwt_response: &[u8],
|
||||
) -> Result<(), QueryError> {
|
||||
Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,17 +118,14 @@ impl AsyncWrite for Stream {
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, std::io::Error>> {
|
||||
) -> Poll<io::Result<usize>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
|
||||
Self::Broken => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn poll_flush(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
|
||||
@@ -96,7 +135,7 @@ impl AsyncWrite for Stream {
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
|
||||
@@ -109,7 +148,7 @@ impl AsyncRead for Stream {
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut tokio::io::ReadBuf<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
) -> Poll<io::Result<()>> {
|
||||
match self.get_mut() {
|
||||
Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
|
||||
@@ -120,12 +159,14 @@ impl AsyncRead for Stream {
|
||||
|
||||
pub struct PostgresBackend {
|
||||
stream: Stream,
|
||||
|
||||
// Output buffer. c.f. BeMessage::write why we are using BytesMut here.
|
||||
// The data between 0 and "current position" as tracked by the bytes::Buf
|
||||
// implementation of BytesMut, have already been written.
|
||||
buf_out: BytesMut,
|
||||
|
||||
pub state: ProtoState,
|
||||
|
||||
md5_salt: [u8; 4],
|
||||
auth_type: AuthType,
|
||||
|
||||
peer_addr: SocketAddr,
|
||||
@@ -143,7 +184,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
|
||||
}
|
||||
|
||||
// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
|
||||
fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
@@ -153,14 +194,13 @@ impl PostgresBackend {
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> std::io::Result<Self> {
|
||||
) -> io::Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
Ok(Self {
|
||||
stream: Stream::Unencrypted(BufReader::new(socket)),
|
||||
buf_out: BytesMut::with_capacity(10 * 1024),
|
||||
state: ProtoState::Initialization,
|
||||
md5_salt: [0u8; 4],
|
||||
auth_type,
|
||||
tls_config,
|
||||
peer_addr,
|
||||
@@ -172,30 +212,68 @@ impl PostgresBackend {
|
||||
}
|
||||
|
||||
/// Read full message or return None if connection is closed.
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
|
||||
use ProtoState::*;
|
||||
match self.state {
|
||||
Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
|
||||
Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
|
||||
Closed => Ok(None),
|
||||
}
|
||||
.map_err(QueryError::from)
|
||||
}
|
||||
|
||||
/// Flush output buffer into the socket.
|
||||
pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
|
||||
self.stream.write_all(&self.buf_out).await?;
|
||||
pub async fn flush(&mut self) -> io::Result<()> {
|
||||
while self.buf_out.has_remaining() {
|
||||
let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
|
||||
self.buf_out.advance(bytes_written);
|
||||
}
|
||||
self.buf_out.clear();
|
||||
Ok(self)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write message into internal output buffer.
|
||||
pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
|
||||
pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
|
||||
BeMessage::write(&mut self.buf_out, message)?;
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Returns an AsyncWrite implementation that wraps all the data written
|
||||
/// to it in CopyData messages, and writes them to the connection
|
||||
///
|
||||
/// The caller is responsible for sending CopyOutResponse and CopyDone messages.
|
||||
pub fn copyout_writer(&mut self) -> CopyDataWriter {
|
||||
CopyDataWriter { pgb: self }
|
||||
}
|
||||
|
||||
/// A polling function that tries to write all the data from 'buf_out' to the
|
||||
/// underlying stream.
|
||||
fn poll_write_buf(
|
||||
&mut self,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
while self.buf_out.has_remaining() {
|
||||
match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
|
||||
Poll::Ready(Ok(bytes_written)) => {
|
||||
self.buf_out.advance(bytes_written);
|
||||
}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
}
|
||||
}
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
|
||||
Pin::new(&mut self.stream).poll_flush(cx)
|
||||
}
|
||||
|
||||
// Wrapper for run_message_loop() that shuts down socket when we are done
|
||||
pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
|
||||
pub async fn run<F, S>(
|
||||
mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
@@ -209,7 +287,7 @@ impl PostgresBackend {
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
shutdown_watcher: F,
|
||||
) -> Result<()>
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
S: Future,
|
||||
@@ -245,7 +323,7 @@ impl PostgresBackend {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Ok::<(), anyhow::Error>(())
|
||||
Ok::<(), QueryError>(())
|
||||
} => {
|
||||
// Handshake complete.
|
||||
result?;
|
||||
@@ -290,14 +368,14 @@ impl PostgresBackend {
|
||||
self.stream = Stream::Tls(Box::new(tls_stream));
|
||||
return Ok(());
|
||||
};
|
||||
bail!("TLS already started");
|
||||
anyhow::bail!("TLS already started");
|
||||
}
|
||||
|
||||
async fn process_handshake_message(
|
||||
&mut self,
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
) -> Result<ProcessMsgResult> {
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
assert!(self.state < ProtoState::Established);
|
||||
let have_tls = self.tls_config.is_some();
|
||||
match msg {
|
||||
@@ -320,8 +398,13 @@ impl PostgresBackend {
|
||||
}
|
||||
FeStartupPacket::StartupMessage { .. } => {
|
||||
if have_tls && !matches!(self.state, ProtoState::Encrypted) {
|
||||
self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
|
||||
bail!("client did not connect with TLS");
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS",
|
||||
None,
|
||||
))?;
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"client did not connect with TLS"
|
||||
)));
|
||||
}
|
||||
|
||||
// NB: startup() may change self.auth_type -- we are using that in proxy code
|
||||
@@ -337,13 +420,6 @@ impl PostgresBackend {
|
||||
.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
AuthType::MD5 => {
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
self.write_message(&BeMessage::AuthenticationMD5Password(
|
||||
self.md5_salt,
|
||||
))?;
|
||||
self.state = ProtoState::Authentication;
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
|
||||
self.state = ProtoState::Authentication;
|
||||
@@ -364,20 +440,15 @@ impl PostgresBackend {
|
||||
|
||||
match self.auth_type {
|
||||
AuthType::Trust => unreachable!(),
|
||||
AuthType::MD5 => {
|
||||
let (_, md5_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_md5(self, md5_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
}
|
||||
AuthType::NeonJWT => {
|
||||
let (_, jwt_response) = m.split_last().context("protocol violation")?;
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
bail!("auth failed: {}", e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -400,33 +471,28 @@ impl PostgresBackend {
|
||||
handler: &mut impl Handler,
|
||||
msg: FeMessage,
|
||||
unnamed_query_string: &mut Bytes,
|
||||
) -> Result<ProcessMsgResult> {
|
||||
) -> Result<ProcessMsgResult, QueryError> {
|
||||
// Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
|
||||
// TODO: change that to proper top-level match of protocol state with separate message handling for each state
|
||||
assert!(self.state == ProtoState::Established);
|
||||
|
||||
match msg {
|
||||
FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
|
||||
bail!("protocol violation");
|
||||
return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
|
||||
}
|
||||
|
||||
FeMessage::Query(body) => {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
// ":?" uses the alternate formatting style, which makes anyhow display the
|
||||
// full cause of the error, not just the top-level context + its trace.
|
||||
// We don't want to send that in the ErrorResponse though,
|
||||
// because it's not relevant to the compute node logs.
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
// TODO: untangle convoluted control flow
|
||||
if e.to_string().contains("failed to run") {
|
||||
return Ok(ProcessMsgResult::Break);
|
||||
}
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
@@ -451,11 +517,13 @@ impl PostgresBackend {
|
||||
|
||||
FeMessage::Execute(_) => {
|
||||
let query_string = cstr_to_str(unnamed_query_string)?;
|
||||
trace!("got execute {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
trace!("got execute {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
log_query_error(query_string, &e);
|
||||
self.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
// NOTE there is no ReadyForQuery message. This handler is used
|
||||
// for basebackup and it uses CopyOut which doesn't require
|
||||
@@ -474,10 +542,99 @@ impl PostgresBackend {
|
||||
// We prefer explicit pattern matching to wildcards, because
|
||||
// this helps us spot the places where new variants are missing
|
||||
FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
|
||||
bail!("unexpected message type: {:?}", msg);
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unexpected message type: {:?}",
|
||||
msg
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ProcessMsgResult::Continue)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
|
||||
/// messages.
|
||||
///
|
||||
|
||||
pub struct CopyDataWriter<'a> {
|
||||
pgb: &'a mut PostgresBackend,
|
||||
}
|
||||
|
||||
impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
|
||||
// It's not strictly required to flush between each message, but makes it easier
|
||||
// to view in wireshark, and usually the messages that the callers write are
|
||||
// decently-sized anyway.
|
||||
match this.pgb.poll_write_buf(cx) {
|
||||
Poll::Ready(Ok(())) => {}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
}
|
||||
|
||||
// CopyData
|
||||
// XXX: if the input is large, we should split it into multiple messages.
|
||||
// Not sure what the threshold should be, but the ultimate hard limit is that
|
||||
// the length cannot exceed u32.
|
||||
this.pgb.write_message(&BeMessage::CopyData(buf))?;
|
||||
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
|
||||
fn poll_flush(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
match this.pgb.poll_write_buf(cx) {
|
||||
Poll::Ready(Ok(())) => {}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
}
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
fn poll_shutdown(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
match this.pgb.poll_write_buf(cx) {
|
||||
Poll::Ready(Ok(())) => {}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
}
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn log_query_error(query: &str, e: &QueryError) {
|
||||
match e {
|
||||
QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
|
||||
if is_expected_io_error(io_error) {
|
||||
info!("query handler for '{query}' failed with expected io error: {io_error}");
|
||||
} else {
|
||||
error!("query handler for '{query}' failed with io error: {io_error}");
|
||||
}
|
||||
}
|
||||
QueryError::Disconnected(other_connection_error) => {
|
||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||
}
|
||||
QueryError::Other(e) => {
|
||||
error!("query handler for '{query}' failed: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
|
||||
use utils::{
|
||||
postgres_backend::{AuthType, Handler, PostgresBackend},
|
||||
postgres_backend_async::QueryError,
|
||||
};
|
||||
|
||||
fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
|
||||
@@ -105,7 +108,7 @@ fn ssl() {
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), QueryError> {
|
||||
self.got_query = query_string == QUERY;
|
||||
Ok(())
|
||||
}
|
||||
@@ -152,7 +155,7 @@ fn no_ssl() {
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_query_string: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), QueryError> {
|
||||
panic!()
|
||||
}
|
||||
}
|
||||
@@ -212,7 +215,7 @@ fn server_forces_ssl() {
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_query_string: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), QueryError> {
|
||||
panic!()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "pageserver"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
@@ -9,8 +10,6 @@ default = []
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
profiling = ["pprof"]
|
||||
|
||||
[dependencies]
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
@@ -39,7 +38,6 @@ pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
rstar = "0.9.3"
|
||||
@@ -49,7 +47,7 @@ serde_json = { version = "1.0", features = ["raw_value"] }
|
||||
serde_with = "2.0"
|
||||
signal-hook = "0.3.10"
|
||||
svg_fmt = "0.4.1"
|
||||
tar = "0.4.33"
|
||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
@@ -69,7 +67,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
tenant_size_model = { path = "../libs/tenant_size_model" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
reqwest = "0.11.13"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
|
||||
@@ -10,20 +10,25 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tokio::io;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tracing::*;
|
||||
|
||||
use crate::task_mgr;
|
||||
use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
|
||||
/// NB: This relies on a modified version of tokio_tar that does *not* write the
|
||||
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
|
||||
/// without explicitly calling 'finish' or 'into_inner'!
|
||||
///
|
||||
/// See https://github.com/neondatabase/tokio-tar/pull/1
|
||||
///
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::tenant::TimelineRequestContext;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
@@ -34,116 +39,135 @@ use postgres_ffi::PG_TLI;
|
||||
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Create basebackup with non-rel data in it.
|
||||
/// Only include relational data if 'full_backup' is true.
|
||||
///
|
||||
/// Currently we use empty 'req_lsn' in two cases:
|
||||
/// * During the basebackup right after timeline creation
|
||||
/// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
/// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
/// to start the replication.
|
||||
pub async fn send_basebackup_tarball<'a, W>(
|
||||
write: &'a mut W,
|
||||
timeline: &'a Timeline,
|
||||
req_lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
ctx: &'a TimelineRequestContext,
|
||||
) -> Result<(), PageReconstructError>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
// "zenith.signal" file, so that postgres can read it during startup.
|
||||
//
|
||||
// We don't keep full history of record boundaries in the page server,
|
||||
// however, only the predecessor of the latest record on each
|
||||
// timeline. So we can only provide prev_record_lsn when you take a
|
||||
// base backup at the end of the timeline, i.e. at last_record_lsn.
|
||||
// Even at the end of the timeline, we sometimes don't have a valid
|
||||
// prev_lsn value; that happens if the timeline was just branched from
|
||||
// an old LSN and it doesn't have any WAL of its own yet. We will set
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. The caller should've
|
||||
// already checked that it's a valid LSN.
|
||||
|
||||
// If the requested point is the end of the timeline, we can
|
||||
// provide prev_lsn. (get_last_record_rlsn() might return it as
|
||||
// zero, though, if no WAL has been generated on this timeline
|
||||
// yet.)
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
if req_lsn == end_of_timeline.last {
|
||||
(end_of_timeline.prev, req_lsn)
|
||||
} else {
|
||||
(Lsn(0), req_lsn)
|
||||
}
|
||||
} else {
|
||||
// Backup was requested at end of the timeline.
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
(end_of_timeline.prev, end_of_timeline.last)
|
||||
};
|
||||
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
|
||||
return Err(PageReconstructError::Other(anyhow!(
|
||||
"prev LSN doesn't match"
|
||||
)));
|
||||
}
|
||||
provided_prev_lsn
|
||||
} else {
|
||||
backup_prev
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||
backup_lsn, prev_lsn, full_backup
|
||||
);
|
||||
|
||||
let basebackup = Basebackup {
|
||||
ar: Builder::new_non_terminated(write),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: prev_lsn,
|
||||
full_backup,
|
||||
ctx,
|
||||
};
|
||||
basebackup
|
||||
.send_tarball()
|
||||
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
|
||||
.await
|
||||
}
|
||||
|
||||
/// This is a short-lived object that exists only for the duration of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a, W>
|
||||
struct Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
ar: Builder<AbortableWrite<W>>,
|
||||
timeline: &'a Arc<Timeline>,
|
||||
pub lsn: Lsn,
|
||||
ar: Builder<&'a mut W>,
|
||||
timeline: &'a Timeline,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
full_backup: bool,
|
||||
finished: bool,
|
||||
ctx: &'a TimelineRequestContext,
|
||||
}
|
||||
|
||||
// Create basebackup with non-rel data in it.
|
||||
// Only include relational data if 'full_backup' is true.
|
||||
//
|
||||
// Currently we use empty lsn in two cases:
|
||||
// * During the basebackup right after timeline creation
|
||||
// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
// to start the replication.
|
||||
impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
pub fn new(
|
||||
write: W,
|
||||
timeline: &'a Arc<Timeline>,
|
||||
req_lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
) -> Result<Basebackup<'a, W>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
// "zenith.signal" file, so that postgres can read it during startup.
|
||||
//
|
||||
// We don't keep full history of record boundaries in the page server,
|
||||
// however, only the predecessor of the latest record on each
|
||||
// timeline. So we can only provide prev_record_lsn when you take a
|
||||
// base backup at the end of the timeline, i.e. at last_record_lsn.
|
||||
// Even at the end of the timeline, we sometimes don't have a valid
|
||||
// prev_lsn value; that happens if the timeline was just branched from
|
||||
// an old LSN and it doesn't have any WAL of its own yet. We will set
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. The caller should've
|
||||
// already checked that it's a valid LSN.
|
||||
|
||||
// If the requested point is the end of the timeline, we can
|
||||
// provide prev_lsn. (get_last_record_rlsn() might return it as
|
||||
// zero, though, if no WAL has been generated on this timeline
|
||||
// yet.)
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
if req_lsn == end_of_timeline.last {
|
||||
(end_of_timeline.prev, req_lsn)
|
||||
} else {
|
||||
(Lsn(0), req_lsn)
|
||||
}
|
||||
} else {
|
||||
// Backup was requested at end of the timeline.
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
(end_of_timeline.prev, end_of_timeline.last)
|
||||
};
|
||||
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) {
|
||||
ensure!(backup_prev == provided_prev_lsn)
|
||||
}
|
||||
provided_prev_lsn
|
||||
} else {
|
||||
backup_prev
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||
backup_lsn, prev_lsn, full_backup
|
||||
);
|
||||
|
||||
Ok(Basebackup {
|
||||
ar: Builder::new(AbortableWrite::new(write)),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: prev_lsn,
|
||||
full_backup,
|
||||
finished: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||
async fn send_tarball(mut self) -> Result<(), PageReconstructError> {
|
||||
// TODO include checksum
|
||||
|
||||
// Create pgdata subdirs structure
|
||||
for dir in PGDATA_SUBDIRS.iter() {
|
||||
let header = new_tar_header_dir(dir)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.await
|
||||
.context("could not add directory to basebackup tarball")?;
|
||||
}
|
||||
|
||||
// Send empty config files.
|
||||
// Send config files.
|
||||
for filepath in PGDATA_SPECIAL_FILES.iter() {
|
||||
if *filepath == "pg_hba.conf" {
|
||||
let data = PG_HBA.as_bytes();
|
||||
let header = new_tar_header(filepath, data.len() as u64)?;
|
||||
self.ar.append(&header, data)?;
|
||||
self.ar
|
||||
.append(&header, data)
|
||||
.await
|
||||
.context("could not add config file to basebackup tarball")?;
|
||||
} else {
|
||||
let header = new_tar_header(filepath, 0)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.await
|
||||
.context("could not add config file to basebackup tarball")?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,75 +177,93 @@ where
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in
|
||||
with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
|
||||
for segno in self
|
||||
.timeline
|
||||
.list_slru_segments(kind, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno)?;
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
{
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
// Gather and send relational files in each database if full backup is requested.
|
||||
if self.full_backup {
|
||||
for rel in with_ondemand_download_sync(|| {
|
||||
self.timeline.list_rels(spcnode, dbnode, self.lsn)
|
||||
})? {
|
||||
self.add_rel(rel)?;
|
||||
for rel in self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_rel(rel).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
|
||||
self.add_twophase_file(xid)?;
|
||||
for xid in self
|
||||
.timeline
|
||||
.list_twophase_files(self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
|
||||
fail_point!("basebackup-before-control-file", |_| {
|
||||
bail!("failpoint basebackup-before-control-file")
|
||||
Err(PageReconstructError::from(anyhow!(
|
||||
"failpoint basebackup-before-control-file"
|
||||
)))
|
||||
});
|
||||
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file()?;
|
||||
self.ar.finish()?;
|
||||
self.finished = true;
|
||||
self.add_pgcontrol_file().await?;
|
||||
self.ar.finish().await.context("could not finish tarball")?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks =
|
||||
with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
|
||||
|
||||
// Function that adds relation segment data to archive
|
||||
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
|
||||
let file_name = tag.to_segfile_name(segment_index as u32);
|
||||
let header = new_tar_header(&file_name, data.len() as u64)?;
|
||||
self.ar.append(&header, data.as_slice())?;
|
||||
Ok(())
|
||||
};
|
||||
async fn add_rel(&mut self, tag: RelTag) -> Result<(), PageReconstructError> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(tag, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
if nblocks == 0 {
|
||||
add_file(0, &vec![])?;
|
||||
let file_name = tag.to_segfile_name(0);
|
||||
let header = new_tar_header(&file_name, 0)?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.await
|
||||
.context("could not write empty relfile to tar stream")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Add a file for each chunk of blocks (aka segment)
|
||||
let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
|
||||
for (seg, blocks) in chunks.into_iter().enumerate() {
|
||||
let mut startblk = 0;
|
||||
let mut seg = 0;
|
||||
while startblk < nblocks {
|
||||
let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
|
||||
let mut segment_data: Vec<u8> = vec![];
|
||||
for blknum in blocks {
|
||||
for blknum in startblk..endblk {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
|
||||
.no_ondemand_download()?;
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
add_file(seg, &segment_data)?;
|
||||
let file_name = tag.to_segfile_name(seg as u32);
|
||||
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, segment_data.as_slice())
|
||||
.await
|
||||
.context("could not write relfile segment to tar stream")?;
|
||||
|
||||
seg += 1;
|
||||
startblk = endblk;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -230,17 +272,18 @@ where
|
||||
//
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = with_ondemand_download_sync(|| {
|
||||
self.timeline.get_slru_segment_size(slru, segno, self.lsn)
|
||||
})?;
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = with_ondemand_download_sync(|| {
|
||||
self.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
|
||||
})?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
|
||||
@@ -253,7 +296,7 @@ where
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
|
||||
self.ar.append(&header, slru_buf.as_slice())?;
|
||||
self.ar.append(&header, slru_buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
Ok(())
|
||||
@@ -265,16 +308,17 @@ where
|
||||
// Each directory contains a PG_VERSION file, and the default database
|
||||
// directories also contain pg_filenode.map files.
|
||||
//
|
||||
fn add_dbdir(
|
||||
async fn add_dbdir(
|
||||
&mut self,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
has_relmap_file: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = with_ondemand_download_sync(|| {
|
||||
self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
|
||||
})?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
ensure!(img.len() == 512);
|
||||
Some(img)
|
||||
} else {
|
||||
@@ -284,14 +328,14 @@ where
|
||||
if spcnode == GLOBALTABLESPACE_OID {
|
||||
let pg_version_str = self.timeline.pg_version.to_string();
|
||||
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes())?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes()).await?;
|
||||
|
||||
info!("timeline.pg_version {}", self.timeline.pg_version);
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
// filenode map for global tablespace
|
||||
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..])?;
|
||||
self.ar.append(&header, &img[..]).await?;
|
||||
} else {
|
||||
warn!("global/pg_filenode.map is missing");
|
||||
}
|
||||
@@ -309,8 +353,8 @@ where
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn)
|
||||
.no_ondemand_download()?
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
return Ok(());
|
||||
@@ -321,18 +365,18 @@ where
|
||||
// Append dir path for each database
|
||||
let path = format!("base/{}", dbnode);
|
||||
let header = new_tar_header_dir(&path)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
self.ar.append(&header, &mut io::empty()).await?;
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
|
||||
let pg_version_str = self.timeline.pg_version.to_string();
|
||||
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes())?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes()).await?;
|
||||
|
||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..])?;
|
||||
self.ar.append(&header, &img[..]).await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
@@ -341,8 +385,11 @@ where
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
|
||||
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_twophase_file(xid, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -350,7 +397,7 @@ where
|
||||
buf.put_u32_le(crc);
|
||||
let path = format!("pg_twophase/{:>08X}", xid);
|
||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||
self.ar.append(&header, &buf[..])?;
|
||||
self.ar.append(&header, &buf[..]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -359,7 +406,7 @@ where
|
||||
// Add generated pg_control file and bootstrap WAL segment.
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
@@ -371,17 +418,23 @@ where
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
|
||||
}
|
||||
self.ar.append(
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
)?;
|
||||
self.ar
|
||||
.append(
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let checkpoint_bytes =
|
||||
with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes =
|
||||
with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
|
||||
.context("failed get control bytes")?;
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_checkpoint(self.lsn, self.ctx)
|
||||
.await
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_control_file(self.lsn, self.ctx)
|
||||
.await
|
||||
.context("failed get control bytes")?;
|
||||
|
||||
let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
|
||||
&pg_control_bytes,
|
||||
@@ -392,7 +445,7 @@ where
|
||||
|
||||
//send pg_control
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &pg_control_bytes[..])?;
|
||||
self.ar.append(&header, &pg_control_bytes[..]).await?;
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
@@ -404,24 +457,11 @@ where
|
||||
postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
self.ar.append(&header, &wal_seg[..]).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W> Drop for Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
/// If the basebackup was not finished, prevent the Archive::drop() from
|
||||
/// writing the end-of-archive marker.
|
||||
fn drop(&mut self) {
|
||||
if !self.finished {
|
||||
self.ar.get_mut().abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Create new tarball entry header
|
||||
//
|
||||
@@ -457,57 +497,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
|
||||
header.set_cksum();
|
||||
Ok(header)
|
||||
}
|
||||
|
||||
/// A wrapper that passes through all data to the underlying Write,
|
||||
/// until abort() is called.
|
||||
///
|
||||
/// tar::Builder has an annoying habit of finishing the archive with
|
||||
/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
|
||||
/// even if an error occurs and we don't finish building the archive.
|
||||
/// We'd rather abort writing the tarball immediately than construct
|
||||
/// a seemingly valid but incomplete archive. This wrapper allows us
|
||||
/// to swallow the end-of-archive marker that Builder::drop() emits,
|
||||
/// without writing it to the underlying sink.
|
||||
///
|
||||
struct AbortableWrite<W> {
|
||||
w: W,
|
||||
aborted: bool,
|
||||
}
|
||||
|
||||
impl<W> AbortableWrite<W> {
|
||||
pub fn new(w: W) -> Self {
|
||||
AbortableWrite { w, aborted: false }
|
||||
}
|
||||
|
||||
pub fn abort(&mut self) {
|
||||
self.aborted = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl<W> Write for AbortableWrite<W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
|
||||
if self.aborted {
|
||||
Ok(data.len())
|
||||
} else {
|
||||
self.w.write(data)
|
||||
}
|
||||
}
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
if self.aborted {
|
||||
Ok(())
|
||||
} else {
|
||||
self.w.flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
|
||||
where
|
||||
F: Send + Fn() -> PageReconstructResult<T>,
|
||||
T: Send,
|
||||
{
|
||||
task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
|
||||
}
|
||||
|
||||
@@ -13,8 +13,8 @@ use tracing::*;
|
||||
use metrics::set_build_info_metric;
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
context::{DownloadBehavior, RequestContext, TaskKind},
|
||||
http, page_cache, page_service, task_mgr,
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
@@ -40,8 +40,6 @@ const FEATURES: &[&str] = &[
|
||||
"testing",
|
||||
#[cfg(feature = "fail/failpoints")]
|
||||
"fail/failpoints",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
|
||||
fn version() -> String {
|
||||
@@ -247,15 +245,12 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
// Install signal handlers
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// Start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
|
||||
// Launch broker client
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
|
||||
|
||||
// Initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
AuthType::Trust => None,
|
||||
AuthType::NeonJWT => {
|
||||
// unwrap is ok because check is performed when creating config, so path is set and file exists
|
||||
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
||||
@@ -264,19 +259,35 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
match var("ZENITH_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
// TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
|
||||
match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
|
||||
(old, Ok(v)) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
if let Ok(v_old) = old {
|
||||
warn!(
|
||||
"JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
|
||||
);
|
||||
if v_old != v {
|
||||
warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
|
||||
}
|
||||
}
|
||||
pageserver::config::SAFEKEEPER_AUTH_TOKEN
|
||||
.set(Arc::new(v))
|
||||
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
|
||||
}
|
||||
Err(VarError::NotPresent) => {
|
||||
(Ok(v), _) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
|
||||
pageserver::config::SAFEKEEPER_AUTH_TOKEN
|
||||
.set(Arc::new(v))
|
||||
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
|
||||
}
|
||||
(_, Err(VarError::NotPresent)) => {
|
||||
info!("No JWT token for authentication with Safekeeper detected");
|
||||
}
|
||||
Err(e) => {
|
||||
(_, Err(e)) => {
|
||||
return Err(e).with_context(|| {
|
||||
"Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable"
|
||||
"Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
|
||||
})
|
||||
}
|
||||
};
|
||||
@@ -292,61 +303,79 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let mgmt_ctx = RequestContext::new(TaskKind::HttpEndpointListener, DownloadBehavior::Error);
|
||||
let cancellation_token = Box::leak(Box::new(mgmt_ctx.cancellation_token().clone()));
|
||||
let router = http::make_router(conf, auth.clone(), remote_storage)?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
||||
.with_graceful_shutdown(cancellation_token.cancelled());
|
||||
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::HttpEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"http endpoint listener",
|
||||
true,
|
||||
async {
|
||||
server.await?;
|
||||
Ok(())
|
||||
match server.await {
|
||||
Ok(()) => info!("HTTP endpoint listener shut down"),
|
||||
Err(err) => error!("HTTP endpoint listener shut down with error: {err:?}"),
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
pageserver::billing_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let metrics_ctx = RequestContext::new(
|
||||
TaskKind::MetricsCollection,
|
||||
DownloadBehavior::Error, // metrics collector shouldn't be downloading anything
|
||||
);
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.id,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection")),
|
||||
);
|
||||
}
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
task_mgr::spawn(
|
||||
COMPUTE_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::LibpqEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"libpq endpoint listener",
|
||||
true,
|
||||
async move {
|
||||
page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await
|
||||
},
|
||||
);
|
||||
{
|
||||
let libpq_ctx = RequestContext::new(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
// create a separate sub-contexts for each connection, with their
|
||||
// own download behavior. This context is used only to listen and
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
COMPUTE_REQUEST_RUNTIME.handle(),
|
||||
"libpq endpoint listener",
|
||||
true,
|
||||
async move {
|
||||
match page_service::libpq_listener_main(
|
||||
conf,
|
||||
auth,
|
||||
pageserver_listener,
|
||||
conf.auth_type,
|
||||
libpq_ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => info!("libpq endpoint listener shut down"),
|
||||
Err(err) => error!("libpq endpoint listener shut down with error: {err:?}"),
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
signals.handle(|signal| match signal {
|
||||
@@ -355,7 +384,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
std::process::exit(111);
|
||||
}
|
||||
|
||||
@@ -364,8 +392,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
||||
BACKGROUND_RUNTIME.block_on(task_mgr::shutdown_pageserver(0));
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
@@ -138,7 +138,6 @@ pub struct PageServerConf {
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
|
||||
pub profiling: ProfilingConfig,
|
||||
pub default_tenant_conf: TenantConf,
|
||||
|
||||
/// Storage broker endpoints to connect to.
|
||||
@@ -165,25 +164,6 @@ pub struct PageServerConf {
|
||||
/// startup code to the connection code through a dozen layers.
|
||||
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum ProfilingConfig {
|
||||
Disabled,
|
||||
PageRequests,
|
||||
}
|
||||
|
||||
impl FromStr for ProfilingConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
|
||||
let result = match s {
|
||||
"disabled" => ProfilingConfig::Disabled,
|
||||
"page_requests" => ProfilingConfig::PageRequests,
|
||||
_ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
// and avoid possible confusion with nested options
|
||||
pub enum BuilderValue<T> {
|
||||
@@ -226,7 +206,6 @@ struct PageServerConfigBuilder {
|
||||
|
||||
id: BuilderValue<NodeId>,
|
||||
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
broker_endpoint: BuilderValue<Uri>,
|
||||
broker_keepalive_interval: BuilderValue<Duration>,
|
||||
|
||||
@@ -262,7 +241,6 @@ impl Default for PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint")),
|
||||
@@ -348,10 +326,6 @@ impl PageServerConfigBuilder {
|
||||
self.id = BuilderValue::Set(node_id)
|
||||
}
|
||||
|
||||
pub fn profiling(&mut self, profiling: ProfilingConfig) {
|
||||
self.profiling = BuilderValue::Set(profiling)
|
||||
}
|
||||
|
||||
pub fn log_format(&mut self, log_format: LogFormat) {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
@@ -405,7 +379,6 @@ impl PageServerConfigBuilder {
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: self
|
||||
@@ -588,7 +561,6 @@ impl PageServerConf {
|
||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||
}
|
||||
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
|
||||
"log_format" => builder.log_format(
|
||||
@@ -722,7 +694,6 @@ impl PageServerConf {
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: Duration::from_secs(5000),
|
||||
@@ -898,7 +869,6 @@ log_format = 'json'
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: humantime::parse_duration(
|
||||
@@ -949,7 +919,6 @@ log_format = 'json'
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
|
||||
@@ -6,23 +6,25 @@
|
||||
|
||||
use anyhow;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::task_mgr;
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::mgr;
|
||||
use pageserver_api::models::TenantState;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use reqwest::Url;
|
||||
|
||||
/// BillingMetric struct that defines the format for one metric entry
|
||||
/// ConsumptionMetric struct that defines the format for one metric entry
|
||||
/// i.e.
|
||||
///
|
||||
/// ```json
|
||||
@@ -30,27 +32,36 @@ use reqwest::Url;
|
||||
/// "metric": "remote_storage_size",
|
||||
/// "type": "absolute",
|
||||
/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
/// "timeline_id": "00000000000000000000000000000000",
|
||||
/// "time": ...,
|
||||
/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||
/// "time": "2022-12-28T11:07:19.317310284Z",
|
||||
/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
/// "value": 12345454,
|
||||
/// }
|
||||
/// ```
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct BillingMetric {
|
||||
pub metric: BillingMetricKind,
|
||||
pub struct ConsumptionMetric {
|
||||
pub metric: ConsumptionMetricKind,
|
||||
#[serde(rename = "type")]
|
||||
pub metric_type: &'static str,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub time: DateTime<Utc>,
|
||||
pub idempotency_key: String,
|
||||
pub value: u64,
|
||||
}
|
||||
|
||||
impl BillingMetric {
|
||||
pub fn new_absolute(
|
||||
metric: BillingMetricKind,
|
||||
impl ConsumptionMetric {
|
||||
pub fn new_absolute<R: Rng + ?Sized>(
|
||||
metric: ConsumptionMetricKind,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
value: u64,
|
||||
node_id: NodeId,
|
||||
rng: &mut R,
|
||||
) -> Self {
|
||||
Self {
|
||||
metric,
|
||||
@@ -58,6 +69,8 @@ impl BillingMetric {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
time: Utc::now(),
|
||||
// key that allows metric collector to distinguish unique events
|
||||
idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
|
||||
value,
|
||||
}
|
||||
}
|
||||
@@ -65,7 +78,7 @@ impl BillingMetric {
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum BillingMetricKind {
|
||||
pub enum ConsumptionMetricKind {
|
||||
/// Amount of WAL produced , by a timeline, i.e. last_record_lsn
|
||||
/// This is an absolute, per-timeline metric.
|
||||
WrittenSize,
|
||||
@@ -80,9 +93,12 @@ pub enum BillingMetricKind {
|
||||
/// Size of the remote storage (S3) directory.
|
||||
/// This is an absolute, per-tenant metric.
|
||||
RemoteStorageSize,
|
||||
/// Logical size of the data in the timeline
|
||||
/// This is an absolute, per-timeline metric
|
||||
TimelineLogicalSize,
|
||||
}
|
||||
|
||||
impl FromStr for BillingMetricKind {
|
||||
impl FromStr for ConsumptionMetricKind {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
@@ -91,55 +107,62 @@ impl FromStr for BillingMetricKind {
|
||||
"synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
|
||||
"resident_size" => Ok(Self::ResidentSize),
|
||||
"remote_storage_size" => Ok(Self::RemoteStorageSize),
|
||||
"timeline_logical_size" => Ok(Self::TimelineLogicalSize),
|
||||
_ => anyhow::bail!("invalid value \"{s}\" for metric type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for BillingMetricKind {
|
||||
impl fmt::Display for ConsumptionMetricKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(match self {
|
||||
BillingMetricKind::WrittenSize => "written_size",
|
||||
BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size",
|
||||
BillingMetricKind::ResidentSize => "resident_size",
|
||||
BillingMetricKind::RemoteStorageSize => "remote_storage_size",
|
||||
ConsumptionMetricKind::WrittenSize => "written_size",
|
||||
ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
|
||||
ConsumptionMetricKind::ResidentSize => "resident_size",
|
||||
ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
|
||||
ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct BillingMetricsKey {
|
||||
pub struct ConsumptionMetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: BillingMetricKind,
|
||||
metric: ConsumptionMetricKind,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct EventChunk<'a> {
|
||||
events: &'a [BillingMetric],
|
||||
events: &'a [ConsumptionMetric],
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
/// Main task that serves metrics collection
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_interval: Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
node_id: NodeId,
|
||||
metrics_ctx: RequestContext,
|
||||
) {
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!("starting collect_metrics");
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<BillingMetricsKey, u64> = HashMap::new();
|
||||
let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = metrics_ctx.cancelled() => {
|
||||
info!("collect_metrics received cancellation request");
|
||||
return Ok(());
|
||||
return;
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint).await?;
|
||||
if let Err(err) = collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &metrics_ctx).await {
|
||||
// Log the error and continue
|
||||
error!("metrics collection failed: {err:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -151,10 +174,12 @@ pub async fn collect_metrics(
|
||||
/// Cache metrics to avoid sending the same metrics multiple times.
|
||||
pub async fn collect_metrics_task(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<BillingMetricsKey, u64>,
|
||||
cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new();
|
||||
let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_task. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
@@ -165,33 +190,55 @@ pub async fn collect_metrics_task(
|
||||
|
||||
// iterate through list of Active tenants and collect metrics
|
||||
for (tenant_id, tenant_state) in tenants {
|
||||
if tenant_state != TenantState::Active {
|
||||
if ctx.is_cancelled() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id).await?;
|
||||
// If the tenant was shut down while we were looking elsewhere, skip it.
|
||||
let tenant_ctx = match tenant.get_context(ctx) {
|
||||
Ok(ctx) => ctx,
|
||||
Err(_state) => {
|
||||
debug!(
|
||||
"skipping metrics collection for tenant {tenant_id} because it is not active"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// iterate through list of timelines in tenant
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
// collect per-timeline metrics only for active timelines
|
||||
if let Ok(timeline_ctx) = timeline.get_context(&tenant_ctx) {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
current_metrics.push((
|
||||
BillingMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: BillingMetricKind::WrittenSize,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: ConsumptionMetricKind::WrittenSize,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
|
||||
let (timeline_logical_size, is_exact) =
|
||||
timeline.get_current_logical_size(&timeline_ctx)?;
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
if is_exact {
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: ConsumptionMetricKind::TimelineLogicalSize,
|
||||
},
|
||||
timeline_logical_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_resident_size = timeline.get_resident_physical_size();
|
||||
tenant_resident_size += timeline_resident_size;
|
||||
|
||||
debug!(
|
||||
"per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
|
||||
tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
|
||||
}
|
||||
|
||||
let tenant_remote_size = tenant.get_remote_size().await?;
|
||||
@@ -201,19 +248,19 @@ pub async fn collect_metrics_task(
|
||||
);
|
||||
|
||||
current_metrics.push((
|
||||
BillingMetricsKey {
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: BillingMetricKind::ResidentSize,
|
||||
metric: ConsumptionMetricKind::ResidentSize,
|
||||
},
|
||||
tenant_resident_size,
|
||||
));
|
||||
|
||||
current_metrics.push((
|
||||
BillingMetricsKey {
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: BillingMetricKind::RemoteStorageSize,
|
||||
metric: ConsumptionMetricKind::RemoteStorageSize,
|
||||
},
|
||||
tenant_remote_size,
|
||||
));
|
||||
@@ -237,24 +284,32 @@ pub async fn collect_metrics_task(
|
||||
const CHUNK_SIZE: usize = 1000;
|
||||
let chunks = current_metrics.chunks(CHUNK_SIZE);
|
||||
|
||||
let mut chunk_to_send: Vec<BillingMetric> = Vec::with_capacity(1000);
|
||||
let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
|
||||
|
||||
for chunk in chunks {
|
||||
chunk_to_send.clear();
|
||||
// enrich metrics with timestamp and metric_kind before sending
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
|
||||
BillingMetric::new_absolute(
|
||||
curr_key.metric,
|
||||
curr_key.tenant_id,
|
||||
curr_key.timeline_id,
|
||||
*curr_val,
|
||||
)
|
||||
}));
|
||||
|
||||
// this code block is needed to convince compiler
|
||||
// that rng is not reused around an await point
|
||||
{
|
||||
// enrich metrics with timestamp and metric_kind before sending
|
||||
let mut rng = rand::thread_rng();
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
|
||||
ConsumptionMetric::new_absolute(
|
||||
curr_key.metric,
|
||||
curr_key.tenant_id,
|
||||
curr_key.timeline_id,
|
||||
*curr_val,
|
||||
node_id,
|
||||
&mut rng,
|
||||
)
|
||||
}));
|
||||
}
|
||||
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||
events: &chunk_to_send,
|
||||
})
|
||||
.expect("BillingMetric should not fail serialization");
|
||||
.expect("ConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
348
pageserver/src/context.rs
Normal file
348
pageserver/src/context.rs
Normal file
@@ -0,0 +1,348 @@
|
||||
//!
|
||||
//! Most async functions throughout the pageserver take a `ctx: &RequestContext`
|
||||
//! argument. It is used to control desired behaviour of the operation, and to
|
||||
//! allow cancelling the operation gracefully.
|
||||
//!
|
||||
//! # Context hierarchy
|
||||
//!
|
||||
//! RequestContext's form a hierarchy. For example:
|
||||
//!
|
||||
//! listener context (LibpqEndpointListener)
|
||||
//! connection context (PageRequestHandler)
|
||||
//! per-request context (PageRequestHandler)
|
||||
//!
|
||||
//! The top "listener context" is created at pageserver startup. The tokio
|
||||
//! task that listens on the libpq protocol TCP port holds that context. When
|
||||
//! it accepts a connection, it spawns a new task to handle that connection
|
||||
//! and creates a new per-connection context for it. The mgmt API listener,
|
||||
//! background jobs, and other things form separate but similar hierarchies.
|
||||
//!
|
||||
//! Usually, each tokio task has its own context, but it's not a strict
|
||||
//! requirement and some tasks can hold multiple contexts, and conversely,
|
||||
//! some contexts are shared by multiple tasks that work together to perform
|
||||
//! some operation.
|
||||
//!
|
||||
//! The hierarchy is not explicitly tracked in the RequestContext struct
|
||||
//! itself, but only by their cancellation tokens. It's entirely possible for
|
||||
//! the parent context to be dropped before its children.
|
||||
//!
|
||||
//! # Tenant and Timeline registration
|
||||
//!
|
||||
//! Most operations are performed on a particular Tenant or Timeline. When
|
||||
//! operating on a Tenant or Timeline, it's important that the Tenant/Timeline
|
||||
//! isn't detached or deleted while there are tasks working on it. To ensure
|
||||
//! that, a RequestContext can be registered with a Tenant or Timeline. See
|
||||
//! `Tenant::register_context` and `Timeline::register_context`. When
|
||||
//! shutting down a Tenant or Timeline, the shutdown routine cancels all the
|
||||
//! registered contexts, and waits for them to be dropped before completing
|
||||
//! the shutdown.
|
||||
//!
|
||||
//! To enforce that you hold a registered context when operating on a Tenant
|
||||
//! or Timeline, most functions take a TimelineRequestContext or
|
||||
//! TenantRequestContext reference as argument.
|
||||
//!
|
||||
//! NOTE: The Tenant / Timeline registration is separate from the context
|
||||
//! hierarchy. You can create a new RequestContext with TimelineRequestContext
|
||||
//! as the parent, and register it with a different timeline, for example.
|
||||
//!
|
||||
//! # Notes
|
||||
//!
|
||||
//! All RequestContexts in the system have a unique ID, and are also tracked
|
||||
//! in a global hash table, CONTEXTS.
|
||||
//!
|
||||
//! - Futures are normally not assumed to be async cancellation-safe. Pass a
|
||||
//! RequestContext as argument and use cancel() on it instead.
|
||||
//!
|
||||
//! - If you perform an operation that depends on some external actor or the
|
||||
//! network, use the cancellation token to check for cancellation
|
||||
//!
|
||||
//! - By convention, the appropriate context for current operation is carried in
|
||||
//! a variable called 'ctx'. If a function handles multiple contexts, it's
|
||||
//! best to *not* have a variable called 'ctx', to force you to think which
|
||||
//! one to use in each call.
|
||||
//!
|
||||
//! # TODO
|
||||
//! - include a unique request ID for tracing
|
||||
//!
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// Source of unique context IDs. Each RequestContext constructor takes the
|
||||
/// current value with an atomic `fetch_add(1)`, so IDs only increase. Starts at 1.
|
||||
static NEXT_CONTEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// Global registry of contexts, keyed by context ID.
///
/// Each entry holds the task kind and a clone of the context's cancellation
/// token. Entries are inserted by the RequestContext constructors and removed
/// again when the RequestContext is dropped, so the shutdown helpers in this
/// module can find, cancel and wait for contexts by ID or by task kind.
|
||||
static CONTEXTS: Lazy<Mutex<HashMap<RequestContextId, (TaskKind, CancellationToken)>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
|
||||
/// Unique identifier of a RequestContext. It is the key under which the
/// context is tracked in the global `CONTEXTS` registry.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct RequestContextId(u64);
|
||||
|
||||
/// Context carried through an operation: identifies the task kind, selects the
/// behaviour on missing layer files, and carries the cancellation token.
///
/// Creating a RequestContext registers it in the global `CONTEXTS` table;
/// dropping it removes the entry again.
|
||||
pub struct RequestContext {
|
||||
// Unique ID of this context; key of its entry in CONTEXTS.
context_id: RequestContextId,
|
||||
// What kind of task this context belongs to.
task_kind: TaskKind,
|
||||
|
||||
// What to do if the operation would need a layer file from remote storage.
download_behavior: DownloadBehavior,
|
||||
// Token used to request cancellation; a clone of it is stored in CONTEXTS.
cancellation_token: CancellationToken,
|
||||
}
|
||||
|
||||
/// DownloadBehavior option specifies the behavior if completing the operation
/// would require downloading a layer file from remote storage.
///
/// `Debug` is derived so that the value can be included in log messages and
/// assertion output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DownloadBehavior {
    /// Download the layer file. It can take a while.
    Download,

    /// Download the layer file, but print a warning to the log. This should be used
    /// in code where the layer file is expected to already exist locally.
    Warn,

    /// Return a PageReconstructError::NeedsDownload error
    Error,
}
|
||||
|
||||
/// The kind of task a [`RequestContext`] belongs to.
///
/// There are many kinds of tasks in the system. Some are associated with a particular
/// tenant or timeline, while others are global.
///
/// The task kind affects the shutdown sequence on pageserver shutdown and on detach
/// of an individual tenant. For example, when shutting down the pageserver, we shut
/// down the LibpqEndpointListeners first, so that we don't accept any more client
/// connections while we perform the rest of the shutdown duties. See
/// `Timeline::graceful_shutdown` and `tenant_mgr::shutdown_pageserver`
/// for details.
///
/// Note that we don't try to limit how many tasks of a certain kind can be running
/// at the same time.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum TaskKind {
    /// libpq listener task. It just accepts connection and spawns a
    /// PageRequestHandler task for each connection.
    LibpqEndpointListener,

    /// HTTP endpoint listener.
    HttpEndpointListener,

    /// Task that handles a single connection. A PageRequestHandler task
    /// starts detached from any particular tenant or timeline, but it can be
    /// associated with one later, after receiving a command from the client.
    PageRequestHandler,

    /// Context for one management API request
    MgmtRequest,

    /// Manages the WAL receiver connection for one timeline. It subscribes to
    /// events from storage_broker, decides which safekeeper to connect to. It spawns a
    /// separate WalReceiverConnection task to handle each connection.
    WalReceiverManager,

    /// Handles a connection to a safekeeper, to stream WAL to a timeline.
    WalReceiverConnection,

    /// Garbage collection worker. One per tenant
    GarbageCollector,

    /// Compaction. One per tenant.
    Compaction,

    /// Initial logical size calculation
    InitialLogicalSizeCalculation,

    /// Task that flushes frozen in-memory layers to disk
    LayerFlush,

    /// Task that uploads a file to remote storage
    RemoteUploadTask,

    /// Task that downloads a file from remote storage
    RemoteDownloadTask,

    /// Task that handles the initial downloading of all tenants
    InitialLoad,

    /// Task that handles attaching a tenant
    Attach,

    /// Task that handles metrics collection
    MetricsCollection,

    /// Task that drives downloading layers
    DownloadAllRemoteLayers,

    /// Only used in unit tests
    UnitTest,
}
|
||||
|
||||
impl Drop for RequestContext {
|
||||
fn drop(&mut self) {
|
||||
CONTEXTS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&self.context_id)
|
||||
.expect("context is not in global registry");
|
||||
}
|
||||
}
|
||||
|
||||
impl RequestContext {
|
||||
/// Create a new RequestContext
|
||||
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
let cancellation_token = CancellationToken::new();
|
||||
let context_id = RequestContextId(NEXT_CONTEXT_ID.fetch_add(1, Ordering::Relaxed));
|
||||
CONTEXTS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(context_id, (task_kind, cancellation_token.clone()));
|
||||
|
||||
RequestContext {
|
||||
task_kind,
|
||||
context_id,
|
||||
download_behavior,
|
||||
cancellation_token,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new RequestContext, as a child of 'parent'.
|
||||
pub fn with_parent(
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
parent: &RequestContext,
|
||||
) -> Self {
|
||||
let cancellation_token = parent.cancellation_token.child_token();
|
||||
let context_id = RequestContextId(NEXT_CONTEXT_ID.fetch_add(1, Ordering::Relaxed));
|
||||
CONTEXTS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(context_id, (task_kind, cancellation_token.clone()));
|
||||
|
||||
RequestContext {
|
||||
task_kind,
|
||||
context_id,
|
||||
download_behavior,
|
||||
cancellation_token,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn context_id(&self) -> RequestContextId {
|
||||
self.context_id
|
||||
}
|
||||
|
||||
pub fn task_kind(&self) -> TaskKind {
|
||||
self.task_kind
|
||||
}
|
||||
|
||||
pub fn download_behavior(&self) -> DownloadBehavior {
|
||||
self.download_behavior
|
||||
}
|
||||
|
||||
pub fn cancellation_token(&self) -> &CancellationToken {
|
||||
&self.cancellation_token
|
||||
}
|
||||
|
||||
pub fn is_cancelled(&self) -> bool {
|
||||
self.cancellation_token.is_cancelled()
|
||||
}
|
||||
|
||||
pub async fn cancelled(&self) {
|
||||
self.cancellation_token.cancelled().await
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Cancel all the contexts in 'context_ids' and wait for them to finish.
|
||||
///
|
||||
/// Whenever we notice that one of the contexts has finished, it is removed
|
||||
/// from 'context_ids'. On return, it is empty.
|
||||
///
|
||||
pub async fn cancel_and_wait(context_ids: &mut Vec<RequestContextId>) {
|
||||
{
|
||||
let contexts = CONTEXTS.lock().unwrap();
|
||||
context_ids.retain(|context_id| {
|
||||
if let Some((task_kind, cancellation_token)) = contexts.get(context_id) {
|
||||
info!("cancelling task {task_kind:?} with ID {context_id:?}");
|
||||
cancellation_token.cancel();
|
||||
true
|
||||
} else {
|
||||
// Already gone
|
||||
false
|
||||
}
|
||||
});
|
||||
}
|
||||
wait_contexts_to_finish(context_ids).await
|
||||
}
|
||||
|
||||
async fn wait_contexts_to_finish(context_ids: &mut Vec<RequestContextId>) {
|
||||
let mut n = 0;
|
||||
while !context_ids.is_empty() {
|
||||
{
|
||||
let contexts = CONTEXTS.lock().unwrap();
|
||||
while let Some(context_id) = context_ids.last() {
|
||||
if let Some((task_kind, _cancellation_token)) = contexts.get(context_id) {
|
||||
info!("waiting for task {task_kind:?} with ID {context_id:?} to finish");
|
||||
break;
|
||||
} else {
|
||||
context_ids.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
if !context_ids.is_empty() {
|
||||
crate::exponential_backoff(
|
||||
n,
|
||||
crate::DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
crate::DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancel and wait for all tasks of given 'kind' to finish
|
||||
pub async fn shutdown_tasks(kind: TaskKind) {
|
||||
let mut context_ids = Vec::new();
|
||||
{
|
||||
let contexts = CONTEXTS.lock().unwrap();
|
||||
for (&context_id, (task_kind, cancellation_token)) in contexts.iter() {
|
||||
if *task_kind == kind {
|
||||
cancellation_token.cancel();
|
||||
context_ids.push(context_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
wait_contexts_to_finish(&mut context_ids).await
|
||||
}
|
||||
|
||||
/// Cancel all remaining contexts.
|
||||
///
|
||||
/// This is used as part of pageserver shutdown. We have already shut down all
|
||||
/// tasks / contexts, this is just a backstop or sanity check to make sure we
|
||||
/// didn't miss anything. Hence, also print a warning for any remaining tasks.
|
||||
pub async fn shutdown_all_tasks() {
|
||||
loop {
|
||||
let mut context_ids = Vec::new();
|
||||
{
|
||||
let contexts = CONTEXTS.lock().unwrap();
|
||||
|
||||
if contexts.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for (&context_id, (task_kind, cancellation_token)) in contexts.iter() {
|
||||
cancellation_token.cancel();
|
||||
context_ids.push(context_id);
|
||||
warn!(
|
||||
"unexpected task of kind {:?} with ID {:?} still running",
|
||||
*task_kind, context_id
|
||||
);
|
||||
}
|
||||
}
|
||||
wait_contexts_to_finish(&mut context_ids).await
|
||||
}
|
||||
}
|
||||
@@ -4,16 +4,16 @@ use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::tenant::{PageReconstructError, Timeline, TimelineRequestContext};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
@@ -77,29 +77,50 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
})
|
||||
}
|
||||
|
||||
fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
|
||||
match err {
|
||||
PageReconstructError::Other(err) => ApiError::InternalServerError(err),
|
||||
PageReconstructError::NeedsDownload(_, _) => {
|
||||
// This shouldn't happen, because we use a RequestContext that requests to
|
||||
// download any missing layer files on-demand.
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"would need to download remote layer file"
|
||||
))
|
||||
}
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::WalRedo(err) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
ctx: Option<&TimelineRequestContext>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let mut info = build_timeline_info_common(timeline)?;
|
||||
let mut info = build_timeline_info_common(timeline, ctx)?;
|
||||
if include_non_incremental_logical_size {
|
||||
// XXX we should be using spawn_ondemand_logical_size_calculation here.
|
||||
// Otherwise, if someone deletes the timeline / detaches the tenant while
|
||||
// we're executing this function, we will outlive the timeline on-disk state.
|
||||
info.current_logical_size_non_incremental = Some(
|
||||
timeline
|
||||
.get_current_logical_size_non_incremental(
|
||||
info.last_record_lsn,
|
||||
CancellationToken::new(),
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
if let Some(ctx) = ctx {
|
||||
info.current_logical_size_non_incremental = Some(
|
||||
timeline
|
||||
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
|
||||
.await?,
|
||||
);
|
||||
} else {
|
||||
info!("could not calculate non-incremental size for timeline because it is not active");
|
||||
}
|
||||
}
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
|
||||
fn build_timeline_info_common(
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: Option<&TimelineRequestContext>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
@@ -119,12 +140,16 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
|
||||
Lsn(0) => None,
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
};
|
||||
let current_logical_size = match timeline.get_current_logical_size() {
|
||||
Ok(size) => Some(size),
|
||||
Err(err) => {
|
||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||
None
|
||||
let current_logical_size = if let Some(ctx) = ctx {
|
||||
match timeline.get_current_logical_size(ctx) {
|
||||
Ok((size, _)) => Some(size),
|
||||
Err(err) => {
|
||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
|
||||
let state = timeline.current_state();
|
||||
@@ -170,20 +195,23 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.new_timeline_id
|
||||
.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, tenant_ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION)
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||
&tenant_ctx,
|
||||
)
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
Ok(Some((new_timeline, timeline_ctx))) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline)
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, Some(&timeline_ctx))
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
@@ -198,21 +226,25 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let top_ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let (tenant, tenant_ctx) = mgr::get_active_tenant(tenant_id, &top_ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info =
|
||||
build_timeline_info(&timeline, include_non_incremental_logical_size)
|
||||
.await
|
||||
.context(
|
||||
"Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
|
||||
)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let timeline_ctx = timeline.get_context(&tenant_ctx).ok();
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
timeline_ctx.as_ref(),
|
||||
)
|
||||
.await
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
@@ -261,19 +293,26 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let top_ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let (tenant, tenant_ctx) = mgr::get_active_tenant(tenant_id, &top_ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.get_timeline(timeline_id)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline_ctx = timeline.get_context(&tenant_ctx).ok();
|
||||
|
||||
let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
|
||||
.await
|
||||
.context("Failed to get local timeline info: {e:#}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
timeline_ctx.as_ref(),
|
||||
)
|
||||
.await
|
||||
.context("Failed to get local timeline info: {e:#}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(timeline_info)
|
||||
}
|
||||
@@ -294,13 +333,19 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let timeline = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
|
||||
|
||||
let (timeline, ctx) = tenant
|
||||
.get_active_timeline(timeline_id, &ctx)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(apierror_from_prerror)?;
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
@@ -340,7 +385,10 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
mgr::delete_timeline(tenant_id, timeline_id)
|
||||
// deleting shouldn't require downloading anything
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
mgr::delete_timeline(tenant_id, timeline_id, &ctx)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, including both
|
||||
@@ -418,8 +466,10 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let mut _req_ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id).await?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -446,13 +496,15 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be a long operation; it is currently not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
.gather_size_inputs()
|
||||
.gather_size_inputs(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -495,6 +547,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
|
||||
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
@@ -583,9 +637,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
Some(tenant) => {
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = tenant.wait_to_become_active().await {
|
||||
if let res @ Err(_) = tenant.wait_to_become_active(ctx).await {
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in tenant_mgr::create_tenant, and there aren't any remote timelines
|
||||
// in tenant::mgr::create_tenant, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
@@ -607,6 +661,8 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period = Some(
|
||||
@@ -669,7 +725,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
}
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id, &ctx)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.await
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
@@ -721,11 +777,21 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
|
||||
let gc_result = wait_task_done
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let gc_result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
.context("wait for gc task")
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
@@ -738,14 +804,17 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
let (timeline, ctx) = tenant
|
||||
.get_active_timeline(timeline_id, &ctx)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline
|
||||
.compact()
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -759,18 +828,21 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
let (timeline, ctx) = tenant
|
||||
.get_active_timeline(timeline_id, &ctx)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact()
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -784,13 +856,15 @@ async fn timeline_download_remote_layers_handler_post(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
let (timeline, ctx) = tenant
|
||||
.get_active_timeline(timeline_id, &ctx)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match timeline.spawn_download_all_remote_layers().await {
|
||||
match timeline.spawn_download_all_remote_layers(&ctx).await {
|
||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||
}
|
||||
@@ -803,11 +877,13 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let (tenant, ctx) = mgr::get_active_tenant(tenant_id, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
let (timeline, _ctx) = tenant
|
||||
.get_active_timeline(timeline_id, &ctx)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let info = timeline
|
||||
.get_download_all_remote_layers_task_info()
|
||||
|
||||
@@ -2,17 +2,18 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a neon Timeline.
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use futures::StreamExt;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use tokio_tar::Archive;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::{Timeline, TimelineRequestContext};
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -42,10 +43,11 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
|
||||
/// This is currently only used to import a cluster freshly created by initdb.
|
||||
/// The code that deals with the checkpoint would not work right if the
|
||||
/// cluster was not shut down cleanly.
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
pub async fn import_timeline_from_postgres_datadir(
|
||||
tline: &Timeline,
|
||||
pgdata_path: &Path,
|
||||
pgdata_lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
@@ -65,9 +67,11 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
let absolute_path = entry.path();
|
||||
let relative_path = absolute_path.strip_prefix(pgdata_path)?;
|
||||
|
||||
let file = File::open(absolute_path)?;
|
||||
let mut file = tokio::fs::File::open(absolute_path).await?;
|
||||
let len = metadata.len() as usize;
|
||||
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
|
||||
if let Some(control_file) =
|
||||
import_file(&mut modification, relative_path, &mut file, len, ctx).await?
|
||||
{
|
||||
pg_control = Some(control_file);
|
||||
}
|
||||
modification.flush()?;
|
||||
@@ -96,19 +100,22 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
tline,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
pgdata_lsn,
|
||||
)?;
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_rel<Reader: Read>(
|
||||
modification: &mut DatadirModification,
|
||||
async fn import_rel(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
path: &Path,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
mut reader: Reader,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
len: usize,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Does it look like a relation file?
|
||||
trace!("importing rel file {}", path.display());
|
||||
@@ -139,7 +146,14 @@ fn import_rel<Reader: Read>(
|
||||
// Call put_rel_creation for every segment of the relation,
|
||||
// because there is no guarantee about the order in which we are processing segments.
|
||||
// ignore "relation already exists" error
|
||||
if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) {
|
||||
//
|
||||
// FIXME: use proper error type for this, instead of parsing the error message.
|
||||
// Or better yet, keep track of which relations we've already created
|
||||
// https://github.com/neondatabase/neon/issues/3309
|
||||
if let Err(e) = modification
|
||||
.put_rel_creation(rel, nblocks as u32, ctx)
|
||||
.await
|
||||
{
|
||||
if e.to_string().contains("already exists") {
|
||||
debug!("relation {} already exists. we must be extending it", rel);
|
||||
} else {
|
||||
@@ -148,7 +162,7 @@ fn import_rel<Reader: Read>(
|
||||
}
|
||||
|
||||
loop {
|
||||
let r = reader.read_exact(&mut buf);
|
||||
let r = reader.read_exact(&mut buf).await;
|
||||
match r {
|
||||
Ok(_) => {
|
||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||
@@ -174,19 +188,20 @@ fn import_rel<Reader: Read>(
|
||||
//
|
||||
// If we process rel segments out of order,
|
||||
// put_rel_extend will skip the update.
|
||||
modification.put_rel_extend(rel, blknum)?;
|
||||
modification.put_rel_extend(rel, blknum, ctx).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru<Reader: Read>(
|
||||
modification: &mut DatadirModification,
|
||||
async fn import_slru(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
mut reader: Reader,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
len: usize,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("importing slru file {path:?}");
|
||||
|
||||
@@ -202,11 +217,13 @@ fn import_slru<Reader: Read>(
|
||||
|
||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);
|
||||
|
||||
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
||||
modification
|
||||
.put_slru_segment_creation(slru, segno, nblocks as u32, ctx)
|
||||
.await?;
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
let r = reader.read_exact(&mut buf);
|
||||
let r = reader.read_exact(&mut buf).await;
|
||||
match r {
|
||||
Ok(_) => {
|
||||
modification.put_slru_page_image(
|
||||
@@ -237,11 +254,12 @@ fn import_slru<Reader: Read>(
|
||||
|
||||
/// Scan PostgreSQL WAL files in the given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal(
|
||||
async fn import_wal(
|
||||
walpath: &Path,
|
||||
tline: &Timeline,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
|
||||
|
||||
@@ -249,7 +267,7 @@ fn import_wal(
|
||||
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
|
||||
let mut walingest = WalIngest::new(tline, startpoint, ctx).await?;
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
@@ -265,12 +283,14 @@ fn import_wal(
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let mut file = File::open(&path)?;
|
||||
let mut file = std::fs::File::open(&path)?;
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
use std::io::Seek;
|
||||
file.seek(std::io::SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
use std::io::Read;
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != WAL_SEGMENT_SIZE - offset {
|
||||
// Maybe allow this for .partial files?
|
||||
@@ -285,8 +305,8 @@ fn import_wal(
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
|
||||
.no_ondemand_download()?;
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
@@ -310,10 +330,11 @@ fn import_wal(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_basebackup_from_tar<Reader: Read>(
|
||||
pub async fn import_basebackup_from_tar(
|
||||
tline: &Timeline,
|
||||
reader: Reader,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<()> {
|
||||
info!("importing base at {base_lsn}");
|
||||
let mut modification = tline.begin_modification(base_lsn);
|
||||
@@ -322,21 +343,24 @@ pub fn import_basebackup_from_tar<Reader: Read>(
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
// Import base
|
||||
for base_tar_entry in tar::Archive::new(reader).entries()? {
|
||||
let entry = base_tar_entry?;
|
||||
let mut entries = Archive::new(reader).entries()?;
|
||||
while let Some(base_tar_entry) = entries.next().await {
|
||||
let mut entry = base_tar_entry?;
|
||||
let header = entry.header();
|
||||
let len = header.entry_size()? as usize;
|
||||
let file_path = header.path()?.into_owned();
|
||||
|
||||
match header.entry_type() {
|
||||
tar::EntryType::Regular => {
|
||||
if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
|
||||
tokio_tar::EntryType::Regular => {
|
||||
if let Some(res) =
|
||||
import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await?
|
||||
{
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
}
|
||||
modification.flush()?;
|
||||
}
|
||||
tar::EntryType::Directory => {
|
||||
tokio_tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
}
|
||||
_ => {
|
||||
@@ -356,31 +380,35 @@ pub fn import_basebackup_from_tar<Reader: Read>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_wal_from_tar<Reader: Read>(
|
||||
pub async fn import_wal_from_tar(
|
||||
tline: &Timeline,
|
||||
reader: Reader,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<()> {
|
||||
// Set up walingest mutable state
|
||||
let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version);
|
||||
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?;
|
||||
|
||||
// Ingest wal until end_lsn
|
||||
info!("importing wal until {}", end_lsn);
|
||||
let mut pg_wal_tar = tar::Archive::new(reader);
|
||||
let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
|
||||
let mut pg_wal_tar = Archive::new(reader);
|
||||
let mut pg_wal_entries = pg_wal_tar.entries()?;
|
||||
while last_lsn <= end_lsn {
|
||||
let bytes = {
|
||||
let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
|
||||
let mut entry = pg_wal_entries
|
||||
.next()
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("expected more wal"))??;
|
||||
let header = entry.header();
|
||||
let file_path = header.path()?.into_owned();
|
||||
|
||||
match header.entry_type() {
|
||||
tar::EntryType::Regular => {
|
||||
tokio_tar::EntryType::Regular => {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
|
||||
let file_name = file_path
|
||||
@@ -390,9 +418,9 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
ensure!(expected_filename == file_name);
|
||||
|
||||
debug!("processing wal file {:?}", file_path);
|
||||
read_all_bytes(entry)?
|
||||
read_all_bytes(&mut entry).await?
|
||||
}
|
||||
tar::EntryType::Directory => {
|
||||
tokio_tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
continue;
|
||||
}
|
||||
@@ -413,8 +441,8 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
|
||||
.no_ondemand_download()?;
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||
@@ -433,7 +461,7 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
}
|
||||
|
||||
// Log any extra unused files
|
||||
for e in &mut pg_wal_entries_iter {
|
||||
while let Some(e) = pg_wal_entries.next().await {
|
||||
let entry = e?;
|
||||
let header = entry.header();
|
||||
let file_path = header.path()?.into_owned();
|
||||
@@ -443,11 +471,12 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn import_file<Reader: Read>(
|
||||
modification: &mut DatadirModification,
|
||||
async fn import_file(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
file_path: &Path,
|
||||
reader: Reader,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
len: usize,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
let file_name = match file_path.file_name() {
|
||||
Some(name) => name.to_string_lossy(),
|
||||
@@ -466,7 +495,7 @@ fn import_file<Reader: Read>(
|
||||
|
||||
match file_name.as_ref() {
|
||||
"pg_control" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&bytes[..])?;
|
||||
@@ -479,15 +508,17 @@ fn import_file<Reader: Read>(
|
||||
return Ok(Some(pg_control));
|
||||
}
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
modification
|
||||
.put_relmap_file(spcnode, dbnode, bytes, ctx)
|
||||
.await?;
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
debug!("ignored PG_VERSION file");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
|
||||
debug!("imported rel creation");
|
||||
}
|
||||
}
|
||||
@@ -502,44 +533,48 @@ fn import_file<Reader: Read>(
|
||||
|
||||
match file_name.as_ref() {
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
modification
|
||||
.put_relmap_file(spcnode, dbnode, bytes, ctx)
|
||||
.await?;
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
debug!("ignored PG_VERSION file");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
|
||||
debug!("imported rel creation");
|
||||
}
|
||||
}
|
||||
} else if file_path.starts_with("pg_xact") {
|
||||
let slru = SlruKind::Clog;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported clog slru");
|
||||
} else if file_path.starts_with("pg_multixact/offsets") {
|
||||
let slru = SlruKind::MultiXactOffsets;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported multixact offsets slru");
|
||||
} else if file_path.starts_with("pg_multixact/members") {
|
||||
let slru = SlruKind::MultiXactMembers;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported multixact members slru");
|
||||
} else if file_path.starts_with("pg_twophase") {
|
||||
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
|
||||
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
modification
|
||||
.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
|
||||
.await?;
|
||||
debug!("imported twophase file");
|
||||
} else if file_path.starts_with("pg_wal") {
|
||||
debug!("found wal file in base section. ignore it");
|
||||
} else if file_path.starts_with("zenith.signal") {
|
||||
// Parse zenith signal file to set correct previous LSN
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
// zenith.signal format is "PREV LSN: prev_lsn"
|
||||
// TODO write serialization and deserialization in the same place.
|
||||
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
@@ -576,8 +611,8 @@ fn import_file<Reader: Read>(
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
|
||||
async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
reader.read_to_end(&mut buf)?;
|
||||
reader.read_to_end(&mut buf).await?;
|
||||
Ok(Bytes::copy_from_slice(&buf[..]))
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
mod auth;
|
||||
pub mod basebackup;
|
||||
pub mod billing_metrics;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
@@ -9,7 +10,6 @@ pub(crate) mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod repository;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
@@ -22,7 +22,6 @@ pub mod walredo;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
@@ -42,35 +41,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
|
||||
|
||||
// Shut down any page service tasks.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
tenant::mgr::shutdown_all_tenants().await;
|
||||
|
||||
// Stop syncing with remote storage.
|
||||
//
|
||||
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
|
||||
// Should it?
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
|
||||
@@ -209,15 +209,34 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_remote_upload_queue_unfinished_tasks",
|
||||
"Number of tasks in the upload queue that are not finished yet.",
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
"Number of ongoing calls to remote timeline client. \
|
||||
Used to populate pageserver_remote_timeline_client_calls_started. \
|
||||
This metric is not useful for sampling from Prometheus, but useful in tests.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_timeline_client_calls_started",
|
||||
"When calling a remote timeline client method, we record the current value \
|
||||
of the calls_unfinished gauge in this histogram. Plot the histogram \
|
||||
over time in a heatmap to visualize how many operations were ongoing \
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -248,15 +267,12 @@ impl RemoteOpFileKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
|
||||
pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
|
||||
pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
|
||||
|
||||
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_operation_seconds",
|
||||
"Time spent on remote storage operations. \
|
||||
Grouped by tenant, timeline, operation_kind and status",
|
||||
Grouped by tenant, timeline, operation_kind and status. \
|
||||
Does not account for time spent waiting in remote timeline client's queues.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -475,21 +491,6 @@ impl Drop for TimelineMetrics {
|
||||
for op in SMGR_QUERY_TIME_OPERATIONS {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
for file_kind in REMOTE_OPERATION_FILE_KINDS {
|
||||
for op in REMOTE_OPERATION_KINDS {
|
||||
for status in REMOTE_OPERATION_STATUSES {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op,
|
||||
status,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -510,7 +511,8 @@ pub struct RemoteTimelineClientMetrics {
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
@@ -519,7 +521,8 @@ impl RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
unfinished_tasks: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls_started_hist: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
@@ -558,16 +561,37 @@ impl RemoteTimelineClientMetrics {
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
pub fn unfinished_tasks(
|
||||
fn calls_unfinished_gauge(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntGauge {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.unfinished_tasks.lock().unwrap();
|
||||
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn calls_started_hist(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.calls_started_hist.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
@@ -580,6 +604,58 @@ impl RemoteTimelineClientMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteTimelineClientMetrics::call_begin`].
|
||||
#[must_use]
|
||||
pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
|
||||
|
||||
impl RemoteTimelineClientCallMetricGuard {
|
||||
/// Consume this guard object without decrementing the metric.
|
||||
/// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
|
||||
pub fn will_decrement_manually(mut self) {
|
||||
self.0 = None; // prevent drop() from decrementing
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClientCallMetricGuard {
|
||||
fn drop(&mut self) {
|
||||
if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
|
||||
guard.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
/// Increment the metrics that track ongoing calls to the remote timeline client instance.
|
||||
///
|
||||
/// Drop the returned guard object once the operation is finished to decrement the values.
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
|
||||
/// is more suitable.
|
||||
/// Never do both.
|
||||
pub(crate) fn call_begin(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> RemoteTimelineClientCallMetricGuard {
|
||||
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
self.calls_started_hist(file_kind, op_kind)
|
||||
.observe(unfinished_metric.get() as f64);
|
||||
unfinished_metric.inc();
|
||||
RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
|
||||
}
|
||||
|
||||
/// Manually decrement the metric instead of using the guard object.
|
||||
/// Using the guard object is generally preferable.
|
||||
/// See [`call_begin`] for more context.
|
||||
pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
|
||||
let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
debug_assert!(
|
||||
unfinished_metric.get() > 0,
|
||||
"begin and end should cancel out"
|
||||
);
|
||||
unfinished_metric.dec();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClientMetrics {
|
||||
fn drop(&mut self) {
|
||||
let RemoteTimelineClientMetrics {
|
||||
@@ -587,13 +663,22 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
remote_operation_time,
|
||||
unfinished_tasks,
|
||||
calls_unfinished_gauge,
|
||||
calls_started_hist,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
// custom protocol.
|
||||
//
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::Context;
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::{Stream, StreamExt};
|
||||
@@ -19,6 +19,8 @@ use pageserver_api::models::{
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
|
||||
PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
};
|
||||
use pq_proto::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
@@ -26,11 +28,9 @@ use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::pin;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::io::SyncIoBridge;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::postgres_backend_async::QueryError;
|
||||
use utils::{
|
||||
auth::{Claims, JwtAuth, Scope},
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -42,30 +42,32 @@ use utils::{
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant::{Tenant, TenantRequestContext, Timeline, TimelineRequestContext};
|
||||
use crate::trace::Tracer;
|
||||
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
|
||||
fn copyin_stream<'a>(
|
||||
pgb: &'a mut PostgresBackend,
|
||||
ctx: &'a RequestContext,
|
||||
) -> impl Stream<Item = io::Result<Bytes>> + 'a {
|
||||
async_stream::try_stream! {
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = ctx.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
let msg = format!("pageserver is shutting down");
|
||||
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg));
|
||||
Err(anyhow::anyhow!(msg))
|
||||
let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
|
||||
Err(QueryError::Other(anyhow::anyhow!(msg)))
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg }
|
||||
@@ -78,14 +80,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
FeMessage::CopyDone => { break },
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = format!("client terminated connection with Terminate message during COPY");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
let msg = "client terminated connection with Terminate message during COPY";
|
||||
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {:?}", m);
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
let msg = format!("unexpected message {m:?}");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
break;
|
||||
}
|
||||
@@ -95,12 +98,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection during COPY";
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
|
||||
let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
|
||||
pgb.flush().await?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(e) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, e))?;
|
||||
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
|
||||
Err(io_error)?;
|
||||
}
|
||||
Err(other) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, other))?;
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -119,6 +126,7 @@ pub async fn libpq_listener_main(
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
listener_ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
listener.set_nonblocking(true)?;
|
||||
let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
|
||||
@@ -127,8 +135,9 @@ pub async fn libpq_listener_main(
|
||||
while let Some(res) = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = listener_ctx.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("libpq listener shutting down");
|
||||
None
|
||||
}
|
||||
|
||||
@@ -142,18 +151,33 @@ pub async fn libpq_listener_main(
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let local_auth = auth.clone();
|
||||
|
||||
let connection_ctx = RequestContext::with_parent(
|
||||
TaskKind::PageRequestHandler,
|
||||
DownloadBehavior::Download,
|
||||
&listener_ctx,
|
||||
);
|
||||
|
||||
// PageRequestHandler tasks are not associated with any particular
|
||||
// timeline in the task manager. In practice most connections will
|
||||
// only deal with a particular timeline, but we don't know which one
|
||||
// yet.
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::PageRequestHandler,
|
||||
None,
|
||||
None,
|
||||
"serving compute connection task",
|
||||
false,
|
||||
page_service_conn_main(conf, local_auth, socket, auth_type),
|
||||
async move {
|
||||
if let Err(err) = page_service_conn_main(
|
||||
conf,
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
connection_ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
error!("connection handler exited with error: {err:?}");
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
@@ -173,6 +197,7 @@ async fn page_service_conn_main(
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
connection_ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Immediately increment the gauge, then create a job to decrement it on task exit.
|
||||
// One of the pros of `defer!` is that this will *most probably*
|
||||
@@ -187,34 +212,32 @@ async fn page_service_conn_main(
|
||||
.set_nodelay(true)
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth);
|
||||
let cancellation_token = connection_ctx.cancellation_token().clone();
|
||||
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
|
||||
let result = pgbackend
|
||||
.run(&mut conn_handler, task_mgr::shutdown_watcher)
|
||||
.run(&mut conn_handler, || cancellation_token.cancelled())
|
||||
.await;
|
||||
match result {
|
||||
Ok(()) => {
|
||||
// we've been requested to shut down
|
||||
Ok(())
|
||||
}
|
||||
Err(err) => {
|
||||
let root_cause_io_err_kind = err
|
||||
.root_cause()
|
||||
.downcast_ref::<io::Error>()
|
||||
.map(|e| e.kind());
|
||||
|
||||
Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
|
||||
// `ConnectionReset` error happens when the Postgres client closes the connection.
|
||||
// As this disconnection happens quite often and is expected,
|
||||
// we decided to downgrade the logging level to `INFO`.
|
||||
// See: https://github.com/neondatabase/neon/issues/1683.
|
||||
if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
|
||||
if io_error.kind() == io::ErrorKind::ConnectionReset {
|
||||
info!("Postgres client disconnected");
|
||||
Ok(())
|
||||
} else {
|
||||
Err(err)
|
||||
Err(io_error).context("Postgres connection error")
|
||||
}
|
||||
}
|
||||
other => other.context("Postgres query error"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,35 +274,38 @@ impl PageRequestMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PageServerHandler {
|
||||
conf: &'static PageServerConf,
|
||||
_conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
connection_ctx: RequestContext,
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
connection_ctx: RequestContext,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
conf,
|
||||
_conf: conf,
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip(self, pgb))]
|
||||
async fn handle_pagerequests(
|
||||
&self,
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// NOTE: pagerequests handler exits when connection is closed,
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
let (tenant, ctx) = get_active_tenant_with_timeout(tenant_id, &self.connection_ctx).await?;
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id).await?;
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path = tenant
|
||||
@@ -291,7 +317,7 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let (timeline, ctx) = tenant.get_active_timeline(timeline_id, &ctx)?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
@@ -303,7 +329,7 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = ctx.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
break;
|
||||
@@ -316,7 +342,7 @@ impl PageServerHandler {
|
||||
Some(FeMessage::CopyData(bytes)) => bytes,
|
||||
Some(FeMessage::Terminate) => break,
|
||||
Some(m) => {
|
||||
bail!("unexpected message: {m:?} during COPY");
|
||||
anyhow::bail!("unexpected message: {m:?} during COPY");
|
||||
}
|
||||
None => break, // client disconnected
|
||||
};
|
||||
@@ -330,22 +356,27 @@ impl PageServerHandler {
|
||||
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
// TODO: We could create a new per-request context here, with unique ID.
|
||||
// Currently we use the same per-timeline context for all requests
|
||||
|
||||
let response = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let _timer = metrics.get_rel_exists.start_timer();
|
||||
self.handle_get_rel_exists_request(&timeline, &req).await
|
||||
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let _timer = metrics.get_rel_size.start_timer();
|
||||
self.handle_get_nblocks_request(&timeline, &req).await
|
||||
self.handle_get_nblocks_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let _timer = metrics.get_page_at_lsn.start_timer();
|
||||
self.handle_get_page_at_lsn_request(&timeline, &req).await
|
||||
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let _timer = metrics.get_db_size.start_timer();
|
||||
self.handle_db_size_request(&timeline, &req).await
|
||||
self.handle_db_size_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
};
|
||||
|
||||
@@ -366,19 +397,20 @@ impl PageServerHandler {
|
||||
|
||||
#[instrument(skip(self, pgb))]
|
||||
async fn handle_import_basebackup(
|
||||
&self,
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
base_lsn: Lsn,
|
||||
_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
) -> Result<(), QueryError> {
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id).await?;
|
||||
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;
|
||||
let (tenant, tenant_ctx) =
|
||||
get_active_tenant_with_timeout(tenant_id, &self.connection_ctx).await?;
|
||||
let (timeline, ctx) =
|
||||
tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &tenant_ctx)?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
@@ -395,11 +427,9 @@ impl PageServerHandler {
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let copyin_stream = copyin_stream(pgb);
|
||||
pin!(copyin_stream);
|
||||
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb, &ctx));
|
||||
timeline
|
||||
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
|
||||
.import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx)
|
||||
.await?;
|
||||
|
||||
// Drain the rest of the Copy data
|
||||
@@ -423,17 +453,21 @@ impl PageServerHandler {
|
||||
|
||||
#[instrument(skip(self, pgb))]
|
||||
async fn handle_import_wal(
|
||||
&self,
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
|
||||
ensure!(timeline.get_last_record_lsn() == start_lsn);
|
||||
) -> Result<(), QueryError> {
|
||||
let (timeline, ctx) =
|
||||
get_active_timeline_with_timeout(tenant_id, timeline_id, &self.connection_ctx).await?;
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn != start_lsn {
|
||||
return Err(QueryError::Other(
|
||||
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
@@ -442,9 +476,9 @@ impl PageServerHandler {
|
||||
info!("importing wal");
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
||||
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
|
||||
tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb, &ctx));
|
||||
let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
|
||||
import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
// Drain the rest of the Copy data
|
||||
@@ -457,7 +491,11 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
// TODO Does it make sense to overshoot?
|
||||
ensure!(timeline.get_last_record_lsn() >= end_lsn);
|
||||
if timeline.get_last_record_lsn() < end_lsn {
|
||||
return Err(QueryError::Other(
|
||||
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
@@ -486,7 +524,8 @@ impl PageServerHandler {
|
||||
mut lsn: Lsn,
|
||||
latest: bool,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
) -> Result<Lsn> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<Lsn> {
|
||||
if latest {
|
||||
// Latest page version was requested. If LSN is given, it is a hint
|
||||
// to the page server that there have been no modifications to the
|
||||
@@ -509,7 +548,7 @@ impl PageServerHandler {
|
||||
if lsn <= last_record_lsn {
|
||||
lsn = last_record_lsn;
|
||||
} else {
|
||||
timeline.wait_lsn(lsn).await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
// Since we waited for 'lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the
|
||||
// last-record LSN can advance immediately after we return
|
||||
@@ -517,11 +556,11 @@ impl PageServerHandler {
|
||||
}
|
||||
} else {
|
||||
if lsn == Lsn(0) {
|
||||
bail!("invalid LSN(0) in request");
|
||||
anyhow::bail!("invalid LSN(0) in request");
|
||||
}
|
||||
timeline.wait_lsn(lsn).await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
}
|
||||
ensure!(
|
||||
anyhow::ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
lsn, **latest_gc_cutoff_lsn
|
||||
@@ -529,60 +568,61 @@ impl PageServerHandler {
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
#[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
async fn handle_get_rel_exists_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let exists = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_exists(req.rel, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let exists = timeline
|
||||
.get_rel_exists(req.rel, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
exists,
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
async fn handle_get_nblocks_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let n_blocks = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_size(req.rel, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
|
||||
async fn handle_db_size_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let total_blocks = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let total_blocks = timeline
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||
@@ -590,15 +630,17 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delay helps the requests to
|
||||
// hit the race condition from github issue #1047 more easily.
|
||||
@@ -608,14 +650,9 @@ impl PageServerHandler {
|
||||
}
|
||||
*/
|
||||
|
||||
let page = crate::tenant::with_ondemand_download(|| {
|
||||
// FIXME: this profiling now happens at different place than it used to. The
|
||||
// current profiling is based on a thread-local variable, so it doesn't work
|
||||
// across awaits
|
||||
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
|
||||
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
@@ -624,7 +661,7 @@ impl PageServerHandler {
|
||||
|
||||
#[instrument(skip(self, pgb))]
|
||||
async fn handle_basebackup_request(
|
||||
&self,
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -633,12 +670,14 @@ impl PageServerHandler {
|
||||
full_backup: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// check that the timeline exists
|
||||
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
|
||||
let (timeline, ctx) =
|
||||
get_active_timeline_with_timeout(tenant_id, timeline_id, &self.connection_ctx).await?;
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline.wait_lsn(lsn).await?;
|
||||
timeline.wait_lsn(lsn, &ctx).await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
@@ -648,17 +687,20 @@ impl PageServerHandler {
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
/* Send a tarball of the latest layer on the timeline */
|
||||
let mut writer = CopyDataSink {
|
||||
pgb,
|
||||
rt: tokio::runtime::Handle::current(),
|
||||
};
|
||||
tokio::task::block_in_place(|| {
|
||||
let basebackup =
|
||||
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
|
||||
tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()
|
||||
})?;
|
||||
// Send a tarball of the latest layer on the timeline
|
||||
{
|
||||
let mut writer = pgb.copyout_writer();
|
||||
basebackup::send_basebackup_tarball(
|
||||
&mut writer,
|
||||
&timeline,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
pgb.write_message(&BeMessage::CopyDone)?;
|
||||
pgb.flush().await?;
|
||||
info!("basebackup complete");
|
||||
@@ -668,7 +710,7 @@ impl PageServerHandler {
|
||||
|
||||
// when accessing management api supply None as an argument
|
||||
// when using to authorize tenant pass corresponding tenant id
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
|
||||
if self.auth.is_none() {
|
||||
// auth is set to Trust, nothing to check so just return ok
|
||||
return Ok(());
|
||||
@@ -690,20 +732,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
jwt_response: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), QueryError> {
|
||||
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
|
||||
// which requires auth to be present
|
||||
let data = self
|
||||
.auth
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.decode(str::from_utf8(jwt_response)?)?;
|
||||
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
||||
|
||||
if matches!(data.claims.scope, Scope::Tenant) {
|
||||
ensure!(
|
||||
data.claims.tenant_id.is_some(),
|
||||
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"jwt token scope is Tenant, but tenant id is missing"
|
||||
)
|
||||
)));
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -715,22 +756,33 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("process query {:?}", query_string);
|
||||
) -> Result<(), QueryError> {
|
||||
debug!("process query {query_string:?}");
|
||||
|
||||
if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
ensure!(
|
||||
params.len() == 2,
|
||||
"invalid param number for pagestream command"
|
||||
);
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
@@ -740,18 +792,24 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
let (_, params_raw) = query_string.split_at("basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
ensure!(
|
||||
params.len() >= 2,
|
||||
"invalid param number for basebackup command"
|
||||
);
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for basebackup command"
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let lsn = if params.len() == 3 {
|
||||
Some(Lsn::from_str(params[2])?)
|
||||
Some(
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -766,16 +824,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
ensure!(
|
||||
params.len() == 2,
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
);
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
|
||||
let (timeline, _ctx) =
|
||||
get_active_timeline_with_timeout(tenant_id, timeline_id, &self.connection_ctx)
|
||||
.await?;
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
|
||||
@@ -794,22 +857,31 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
let (_, params_raw) = query_string.split_at("fullbackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
ensure!(
|
||||
params.len() >= 2,
|
||||
"invalid param number for fullbackup command"
|
||||
);
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for fullbackup command"
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if params.len() > 2 {
|
||||
Some(Lsn::from_str(params[2])?)
|
||||
Some(
|
||||
Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if params.len() > 3 {
|
||||
Some(Lsn::from_str(params[3])?)
|
||||
Some(
|
||||
Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -834,12 +906,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
|
||||
let (_, params_raw) = query_string.split_at("import basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
ensure!(params.len() == 5);
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
let base_lsn = Lsn::from_str(params[2])?;
|
||||
let end_lsn = Lsn::from_str(params[3])?;
|
||||
let pg_version = u32::from_str(params[4])?;
|
||||
if params.len() != 5 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import basebackup command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
let base_lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
let end_lsn = Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||
let pg_version = u32::from_str(params[4])
|
||||
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
@@ -857,7 +938,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
|
||||
pgb.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if query_string.starts_with("import wal ") {
|
||||
@@ -867,11 +951,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// caller should poll the http api to check when that is done.
|
||||
let (_, params_raw) = query_string.split_at("import wal ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
ensure!(params.len() == 4);
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
let timeline_id = TimelineId::from_str(params[1])?;
|
||||
let start_lsn = Lsn::from_str(params[2])?;
|
||||
let end_lsn = Lsn::from_str(params[3])?;
|
||||
if params.len() != 4 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for import wal command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
let start_lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
let end_lsn = Lsn::from_str(params[3])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
@@ -882,7 +974,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
|
||||
pgb.write_message(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
@@ -893,12 +988,18 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
ensure!(params.len() == 1, "invalid param number for config command");
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
if params.len() != 1 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for config command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id).await?;
|
||||
let (tenant, _ctx) =
|
||||
get_active_tenant_with_timeout(tenant_id, &self.connection_ctx).await?;
|
||||
pgb.write_message(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
@@ -935,7 +1036,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
bail!("unknown command");
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unknown command {query_string}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -947,12 +1050,25 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
|
||||
Ok(wait_result) => wait_result
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
.map(move |()| tenant),
|
||||
async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
parent_ctx: &RequestContext,
|
||||
) -> anyhow::Result<(Arc<Tenant>, TenantRequestContext)> {
|
||||
let child_ctx = RequestContext::with_parent(
|
||||
parent_ctx.task_kind(),
|
||||
parent_ctx.download_behavior(),
|
||||
parent_ctx,
|
||||
);
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id).await?;
|
||||
match tokio::time::timeout(
|
||||
Duration::from_secs(30),
|
||||
tenant.wait_to_become_active(child_ctx),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Ok(ctx)) => Ok((tenant, ctx)),
|
||||
Ok(Err(err)) => Err(err),
|
||||
Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"),
|
||||
}
|
||||
}
|
||||
@@ -961,37 +1077,9 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenan
|
||||
async fn get_active_timeline_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
get_active_tenant_with_timeout(tenant_id)
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(Arc<Timeline>, TimelineRequestContext)> {
|
||||
get_active_tenant_with_timeout(tenant_id, ctx)
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
}
|
||||
|
||||
///
|
||||
/// A std::io::Write implementation that wraps all data written to it in CopyData
|
||||
/// messages.
|
||||
///
|
||||
struct CopyDataSink<'a> {
|
||||
pgb: &'a mut PostgresBackend,
|
||||
rt: tokio::runtime::Handle,
|
||||
}
|
||||
|
||||
impl<'a> io::Write for CopyDataSink<'a> {
|
||||
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
|
||||
// CopyData
|
||||
// FIXME: if the input is large, we should split it into multiple messages.
|
||||
// Not sure what the threshold should be, but the ultimate hard limit is that
|
||||
// the length cannot exceed u32.
|
||||
// FIXME: flush isn't really required, but makes it easier
|
||||
// to view in wireshark
|
||||
self.pgb.write_message(&BeMessage::CopyData(data))?;
|
||||
self.rt.block_on(self.pgb.flush())?;
|
||||
trace!("CopyData sent for {} bytes!", data.len());
|
||||
|
||||
Ok(data.len())
|
||||
}
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
// no-op
|
||||
Ok(())
|
||||
}
|
||||
.and_then(|(tenant, ctx)| tenant.get_active_timeline(timeline_id, &ctx))
|
||||
}
|
||||
|
||||
@@ -6,11 +6,10 @@
|
||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||
//! Clarify that)
|
||||
//!
|
||||
use super::tenant::PageReconstructResult;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::repository::*;
|
||||
use crate::tenant::{PageReconstructError, Timeline, TimelineRequestContext};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{repository::*, try_no_ondemand_download};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -20,7 +19,6 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::Range;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -35,14 +33,6 @@ pub enum LsnForTimestamp {
|
||||
NoData(Lsn),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CalculateLogicalSizeError {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
///
|
||||
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
|
||||
/// and other special kinds of files, in a versioned key-value store. The
|
||||
@@ -92,76 +82,83 @@ impl Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page version.
|
||||
pub fn get_rel_page_at_lsn(
|
||||
pub async fn get_rel_page_at_lsn(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
}
|
||||
|
||||
let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, lsn, nblocks
|
||||
);
|
||||
return PageReconstructResult::Success(ZERO_PAGE.clone());
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
self.get(key, lsn)
|
||||
self.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
pub fn get_db_size(
|
||||
pub async fn get_db_size(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<usize> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<usize, PageReconstructError> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
|
||||
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
PageReconstructResult::Success(total_blocks)
|
||||
Ok(total_blocks)
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
pub fn get_rel_size(
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<BlockNumber> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return PageReconstructResult::Success(nblocks);
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
|
||||
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
// without extending it. Tolerate that by claiming that
|
||||
// any non-existent FSM fork has size 0.
|
||||
return PageReconstructResult::Success(0);
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
@@ -174,47 +171,51 @@ impl Timeline {
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
}
|
||||
PageReconstructResult::Success(nblocks)
|
||||
Ok(nblocks)
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
pub fn get_rel_exists(
|
||||
pub async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
_latest: bool,
|
||||
) -> PageReconstructResult<bool> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return PageReconstructResult::Success(true);
|
||||
return Ok(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
|
||||
PageReconstructResult::Success(exists)
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
pub fn list_rels(
|
||||
pub async fn list_rels(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<HashSet<RelTag>> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -226,53 +227,56 @@ impl Timeline {
|
||||
forknum: *forknum,
|
||||
}));
|
||||
|
||||
PageReconstructResult::Success(rels)
|
||||
Ok(rels)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
pub fn get_slru_page_at_lsn(
|
||||
pub async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn)
|
||||
self.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub fn get_slru_segment_size(
|
||||
pub async fn get_slru_segment_size(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<BlockNumber> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf.get_u32_le())
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Does the given SLRU segment exist?
|
||||
pub fn get_slru_segment_exists(
|
||||
pub async fn get_slru_segment_exists(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<bool> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.segments.get(&segno).is_some();
|
||||
PageReconstructResult::Success(exists)
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -283,10 +287,11 @@ impl Timeline {
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub fn find_lsn_for_timestamp(
|
||||
pub async fn find_lsn_for_timestamp(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
) -> PageReconstructResult<LsnForTimestamp> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
@@ -302,12 +307,15 @@ impl Timeline {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
|
||||
let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
));
|
||||
let cmp = self
|
||||
.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
@@ -319,15 +327,15 @@ impl Timeline {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
@@ -337,7 +345,7 @@ impl Timeline {
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -349,26 +357,25 @@ impl Timeline {
|
||||
/// Additionally, sets 'found_smaller'/'found_larger' if it encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
pub fn is_latest_commit_timestamp_ge_than(
|
||||
pub async fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
) -> PageReconstructResult<bool> {
|
||||
for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
|
||||
let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
|
||||
SlruKind::Clog,
|
||||
segno,
|
||||
probe_lsn
|
||||
));
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||
.await?
|
||||
{
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
|
||||
SlruKind::Clog,
|
||||
segno,
|
||||
blknum,
|
||||
probe_lsn
|
||||
));
|
||||
let clog_page = self
|
||||
.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
@@ -377,76 +384,99 @@ impl Timeline {
|
||||
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return PageReconstructResult::Success(true);
|
||||
return Ok(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
PageReconstructResult::Success(false)
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub fn list_slru_segments(
|
||||
pub async fn list_slru_segments(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<HashSet<u32>> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.segments),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Ok(dir) => Ok(dir.segments),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_relmap_file(
|
||||
pub async fn get_relmap_file(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf)
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
|
||||
pub async fn list_dbdirs(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<HashMap<(Oid, Oid), bool>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
|
||||
match DbDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Ok(dir) => Ok(dir.dbdirs),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
pub async fn get_twophase_file(
|
||||
&self,
|
||||
xid: TransactionId,
|
||||
lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = twophase_file_key(xid);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf)
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
|
||||
pub async fn list_twophase_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<HashSet<TransactionId>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||
|
||||
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.xids),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Ok(dir) => Ok(dir.xids),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
self.get(CONTROLFILE_KEY, lsn)
|
||||
pub async fn get_control_file(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.get(CONTROLFILE_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
self.get(CHECKPOINT_KEY, lsn)
|
||||
pub async fn get_checkpoint(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
@@ -457,23 +487,20 @@ impl Timeline {
|
||||
pub async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<u64, PageReconstructError> {
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get_download(DBDIR_KEY, lsn).await?;
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in
|
||||
crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
|
||||
.await?
|
||||
{
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||
if ctx.is_cancelled() {
|
||||
return Err(PageReconstructError::Cancelled);
|
||||
}
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get_download(relsize_key, lsn).await?;
|
||||
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as u64;
|
||||
@@ -486,7 +513,11 @@ impl Timeline {
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed may be removed from the underlying storage (from
|
||||
/// that LSN forwards).
|
||||
pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
|
||||
pub async fn collect_keyspace(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<KeySpace> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
@@ -494,7 +525,7 @@ impl Timeline {
|
||||
result.add_key(DBDIR_KEY);
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get_download(DBDIR_KEY, lsn).await?;
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
|
||||
|
||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||
@@ -503,15 +534,16 @@ impl Timeline {
|
||||
result.add_key(relmap_file_key(spcnode, dbnode));
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> =
|
||||
with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, lsn, ctx)
|
||||
.await?
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
rels.sort_unstable();
|
||||
for rel in rels {
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get_download(relsize_key, lsn).await?;
|
||||
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
|
||||
@@ -527,13 +559,13 @@ impl Timeline {
|
||||
] {
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
result.add_key(slrudir_key);
|
||||
let buf = self.get_download(slrudir_key, lsn).await?;
|
||||
let buf = self.get(slrudir_key, lsn, ctx).await?;
|
||||
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
|
||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||
segments.sort_unstable();
|
||||
for segno in segments {
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get_download(segsize_key, lsn).await?;
|
||||
let mut buf = self.get(segsize_key, lsn, ctx).await?;
|
||||
let segsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(
|
||||
@@ -545,7 +577,7 @@ impl Timeline {
|
||||
|
||||
// Then pg_twophase
|
||||
result.add_key(TWOPHASEDIR_KEY);
|
||||
let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
|
||||
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
|
||||
xids.sort_unstable();
|
||||
@@ -703,9 +735,15 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
|
||||
pub async fn put_relmap_file(
|
||||
&mut self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
img: Bytes,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
let mut dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
|
||||
@@ -731,9 +769,14 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
|
||||
pub async fn put_twophase_file(
|
||||
&mut self,
|
||||
xid: TransactionId,
|
||||
img: Bytes,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
@@ -757,16 +800,21 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
|
||||
pub async fn drop_dbdir(
|
||||
&mut self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let req_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true)
|
||||
.no_ondemand_download()?;
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
|
||||
.await?;
|
||||
|
||||
// Remove entry from dbdir
|
||||
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
let mut dir = DbDirectory::des(&buf)?;
|
||||
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
||||
let buf = DbDirectory::ser(&dir)?;
|
||||
@@ -789,11 +837,16 @@ impl<'a> DatadirModification<'a> {
|
||||
/// Create a relation fork.
|
||||
///
|
||||
/// 'nblocks' is the initial size.
|
||||
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_creation(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
|
||||
// Didn't exist. Update dbdir
|
||||
@@ -805,7 +858,7 @@ impl<'a> DatadirModification<'a> {
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
@@ -833,17 +886,18 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Truncate relation
|
||||
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_truncation(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, last_lsn, true)
|
||||
.no_ondemand_download()?
|
||||
{
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
|
||||
let size_key = rel_size_to_key(rel);
|
||||
// Fetch the old size first
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||
|
||||
// Update the entry with the new size.
|
||||
let buf = nblocks.to_le_bytes();
|
||||
@@ -863,12 +917,17 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
/// Extend relation
|
||||
/// If new size is smaller, do nothing.
|
||||
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_extend(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||
|
||||
// only extend relation here. never decrease the size
|
||||
if nblocks > old_size {
|
||||
@@ -884,12 +943,16 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Drop a relation.
|
||||
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_drop(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
// Remove it from the directory entry
|
||||
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
let mut dir = RelDirectory::des(&buf)?;
|
||||
|
||||
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||
@@ -900,7 +963,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// update logical size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||
self.pending_nblocks -= old_size as i64;
|
||||
|
||||
// Remove entry from relation size cache
|
||||
@@ -912,15 +975,16 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_slru_segment_creation(
|
||||
pub async fn put_slru_segment_creation(
|
||||
&mut self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.insert(segno) {
|
||||
@@ -956,10 +1020,15 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// This method is used for marking truncated SLRU files
|
||||
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
pub async fn drop_slru_segment(
|
||||
&mut self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.remove(&segno) {
|
||||
@@ -983,9 +1052,13 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// This method is used for marking truncated SLRU files
|
||||
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
pub async fn drop_twophase_file(
|
||||
&mut self,
|
||||
xid: TransactionId,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -1079,7 +1152,11 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
|
||||
async fn get(
|
||||
&self,
|
||||
key: Key,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the pending updated
|
||||
// version in that case.
|
||||
//
|
||||
@@ -1087,18 +1164,20 @@ impl<'a> DatadirModification<'a> {
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(value) = self.pending_updates.get(&key) {
|
||||
if let Value::Image(img) = value {
|
||||
PageReconstructResult::Success(img.clone())
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn)
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1505,17 +1584,18 @@ fn is_slru_block_key(key: Key) -> bool {
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn create_test_timeline(
|
||||
tenant: &crate::tenant::Tenant,
|
||||
tenant: &std::sync::Arc<crate::tenant::Tenant>,
|
||||
timeline_id: utils::id::TimelineId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<std::sync::Arc<Timeline>> {
|
||||
let tline = tenant
|
||||
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
|
||||
.initialize()?;
|
||||
tenant_ctx: &crate::tenant::TenantRequestContext,
|
||||
) -> anyhow::Result<(std::sync::Arc<Timeline>, TimelineRequestContext)> {
|
||||
let (tline, timeline_ctx) =
|
||||
tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version, tenant_ctx)?;
|
||||
let tline = tline.initialize(&timeline_ctx)?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
m.init_empty()?;
|
||||
m.commit()?;
|
||||
Ok(tline)
|
||||
Ok((tline, timeline_ctx))
|
||||
}
|
||||
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
//!
|
||||
//! Support for profiling
|
||||
//!
|
||||
//! This relies on a modified version of the 'pprof-rs' crate. That's not very
|
||||
//! nice, so to avoid a hard dependency on that, this is an optional feature.
|
||||
//!
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
|
||||
/// The actual implementation is in the `profiling_impl` submodule. If the profiling
|
||||
/// feature is not enabled, it's just a dummy implementation that panics if you
|
||||
/// try to enable profiling in the configuration.
|
||||
pub use profiling_impl::*;
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
mod profiling_impl {
|
||||
use super::*;
|
||||
use pprof;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Start profiling the current thread. Returns a guard object;
|
||||
/// the profiling continues until the guard is dropped.
|
||||
///
|
||||
/// Note: profiling is not re-entrant. If you call 'profpoint_start' while
|
||||
/// profiling is already started, nothing happens, and the profiling will be
|
||||
/// stopped when either guard object is dropped.
|
||||
#[inline]
|
||||
pub fn profpoint_start(
|
||||
conf: &crate::config::PageServerConf,
|
||||
point: ProfilingConfig,
|
||||
) -> Option<ProfilingGuard> {
|
||||
if conf.profiling == point {
|
||||
pprof::start_profiling();
|
||||
Some(ProfilingGuard(PhantomData))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// A hack to remove Send and Sync from the ProfilingGuard. Because the
|
||||
/// profiling is attached to current thread.
|
||||
////
|
||||
/// See comments in https://github.com/rust-lang/rust/issues/68318
|
||||
type PhantomUnsend = std::marker::PhantomData<*mut u8>;
|
||||
|
||||
pub struct ProfilingGuard(PhantomUnsend);
|
||||
|
||||
impl Drop for ProfilingGuard {
|
||||
fn drop(&mut self) {
|
||||
pprof::stop_profiling();
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize the profiler. This must be called before any 'profpoint_start' calls.
|
||||
pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
|
||||
if conf.profiling != ProfilingConfig::Disabled {
|
||||
Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Exit the profiler. Writes the flamegraph to current workdir.
|
||||
pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
|
||||
// Write out the flamegraph
|
||||
if let Some(profiler_guard) = profiler_guard {
|
||||
if let Ok(report) = profiler_guard.report().build() {
|
||||
// this gets written under the workdir
|
||||
let file = std::fs::File::create("flamegraph.svg").unwrap();
|
||||
let mut options = pprof::flamegraph::Options::default();
|
||||
options.image_width = Some(2500);
|
||||
report.flamegraph_with_options(file, &mut options).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
|
||||
#[cfg(not(feature = "profiling"))]
|
||||
mod profiling_impl {
|
||||
use super::*;
|
||||
|
||||
pub struct DummyProfilerGuard;
|
||||
|
||||
impl Drop for DummyProfilerGuard {
|
||||
fn drop(&mut self) {
|
||||
// do nothing, this exists to calm Clippy down
|
||||
}
|
||||
}
|
||||
|
||||
pub fn profpoint_start(
|
||||
_conf: &PageServerConf,
|
||||
_point: ProfilingConfig,
|
||||
) -> Option<DummyProfilerGuard> {
|
||||
None
|
||||
}
|
||||
|
||||
pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
|
||||
if conf.profiling != ProfilingConfig::Disabled {
|
||||
// shouldn't happen, we don't allow profiling in the config if the support
|
||||
// for it is disabled.
|
||||
panic!("profiling enabled but the binary was compiled without profiling support");
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
|
||||
}
|
||||
@@ -1,59 +1,21 @@
|
||||
//!
|
||||
//! This module provides centralized handling of tokio tasks in the Page Server.
|
||||
//! This module provides some helpers for spawning tokio tasks in the pageserver.
|
||||
//!
|
||||
//! We provide a few basic facilities:
|
||||
//! - A global registry of tasks that lists what kind of tasks they are, and
|
||||
//! which tenant or timeline they are working on
|
||||
//!
|
||||
//! - The ability to request a task to shut down.
|
||||
//!
|
||||
//!
|
||||
//! # How it works?
|
||||
//!
|
||||
//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new
|
||||
//! task is spawned, a PageServerTask entry is added there, and when a
|
||||
//! task dies, it removes itself from the hashmap. If you want to kill a
|
||||
//! task, you can scan the hashmap to find it.
|
||||
//!
|
||||
//! # Task shutdown
|
||||
//!
|
||||
//! To kill a task, we rely on co-operation from the victim. Each task is
|
||||
//! expected to periodically call the `is_shutdown_requested()` function, and
|
||||
//! if it returns true, exit gracefully. In addition to that, when waiting for
|
||||
//! the network or other long-running operation, you can use
|
||||
//! `shutdown_watcher()` function to get a Future that will become ready if
|
||||
//! the current task has been requested to shut down. You can use that with
|
||||
//! Tokio select!().
|
||||
//!
|
||||
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
|
||||
//! Depending on what task panics, we might want to kill the whole server, or
|
||||
//! only a single tenant or timeline.
|
||||
//! Mostly just a wrapper around tokio::spawn, with some code to handle panics.
|
||||
//!
|
||||
|
||||
// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
|
||||
// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
|
||||
#![allow(clippy::declare_interior_mutable_const)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::panic::{resume_unwind, AssertUnwindSafe};
|
||||
|
||||
use futures::FutureExt;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::task_local;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
use crate::context::{self, TaskKind};
|
||||
|
||||
//
|
||||
// There are four runtimes:
|
||||
@@ -92,10 +54,6 @@ use crate::shutdown_pageserver;
|
||||
// runtime. If a GetPage request comes in before the load of a tenant has finished, the
|
||||
// GetPage request will wait for the tenant load to finish.
|
||||
//
|
||||
// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to
|
||||
// protect data structures. Let's keep it that way. Synchronous code is easier to debug
|
||||
// and analyze, and there's a lot of hairy, low-level, performance critical code there.
|
||||
//
|
||||
// It's nice to have different runtimes, so that you can quickly eyeball how much CPU
|
||||
// time each class of operations is taking, with 'top -H' or similar.
|
||||
//
|
||||
@@ -135,355 +93,81 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
impl fmt::Display for PageserverTaskId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Each task that we track is associated with a "task ID". It's just an
|
||||
/// increasing number that we assign. Note that it is different from tokio::task::Id.
|
||||
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// Global registry of tasks
|
||||
static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
|
||||
task_local! {
|
||||
// This is a cancellation token which will be cancelled when a task needs to shut down. The
|
||||
// root token is kept in the global registry, so that anyone can send the signal to request
|
||||
// task shutdown.
|
||||
static SHUTDOWN_TOKEN: CancellationToken;
|
||||
|
||||
// Each task holds reference to its own PageServerTask here.
|
||||
static CURRENT_TASK: Arc<PageServerTask>;
|
||||
}
|
||||
|
||||
///
|
||||
/// There are many kinds of tasks in the system. Some are associated with a particular
|
||||
/// tenant or timeline, while others are global.
|
||||
///
|
||||
/// Note that we don't try to limit how many task of a certain kind can be running
|
||||
/// at the same time.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub enum TaskKind {
|
||||
// libpq listener task. It just accepts connection and spawns a
|
||||
// PageRequestHandler task for each connection.
|
||||
LibpqEndpointListener,
|
||||
|
||||
// HTTP endpoint listener.
|
||||
HttpEndpointListener,
|
||||
|
||||
// Task that handles a single connection. A PageRequestHandler task
|
||||
// starts detached from any particular tenant or timeline, but it can be
|
||||
// associated with one later, after receiving a command from the client.
|
||||
PageRequestHandler,
|
||||
|
||||
// Manages the WAL receiver connection for one timeline. It subscribes to
|
||||
// events from storage_broker, decides which safekeeper to connect to. It spawns a
|
||||
// separate WalReceiverConnection task to handle each connection.
|
||||
WalReceiverManager,
|
||||
|
||||
// Handles a connection to a safekeeper, to stream WAL to a timeline.
|
||||
WalReceiverConnection,
|
||||
|
||||
// Garbage collection worker. One per tenant
|
||||
GarbageCollector,
|
||||
|
||||
// Compaction. One per tenant.
|
||||
Compaction,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
// Task that flushes frozen in-memory layers to disk
|
||||
LayerFlushTask,
|
||||
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
// task that handles attaching a tenant
|
||||
Attach,
|
||||
|
||||
// task that handles metrics collection
|
||||
MetricsCollection,
|
||||
|
||||
// task that drives downloading layers
|
||||
DownloadAllRemoteLayers,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct MutableTaskState {
|
||||
/// Tenant and timeline that this task is associated with.
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
/// Handle for waiting for the task to exit. It can be None, if the
|
||||
/// task has already exited.
|
||||
join_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
struct PageServerTask {
|
||||
#[allow(dead_code)] // unused currently
|
||||
task_id: PageserverTaskId,
|
||||
|
||||
kind: TaskKind,
|
||||
|
||||
name: String,
|
||||
|
||||
// To request task shutdown, just cancel this token.
|
||||
cancel: CancellationToken,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
}
|
||||
|
||||
/// Launch a new task
|
||||
/// Note: if shutdown_process_on_error is set to true failure
|
||||
/// of the task will lead to shutdown of entire process
|
||||
///
|
||||
/// This is a wrapper around tokio::spawn. One difference is that the Future
|
||||
/// is marked to return nothing to avoid silently swallowing errors. This
|
||||
/// forces the future to handle errors by itself. If you need the return
|
||||
/// value, you could create another function that passes it through, but we
|
||||
/// don't have a need for that currently.
|
||||
///
|
||||
/// If shutdown_process_on_panic is set to true, panic of the task will lead
|
||||
/// to shutdown of entire process. Otherwise we log the panic and continue.
|
||||
pub fn spawn<F>(
|
||||
runtime: &tokio::runtime::Handle,
|
||||
kind: TaskKind,
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
shutdown_process_on_error: bool,
|
||||
shutdown_process_on_panic: bool,
|
||||
future: F,
|
||||
) -> PageserverTaskId
|
||||
) -> JoinHandle<F::Output>
|
||||
where
|
||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
||||
F: Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
let cancel = CancellationToken::new();
|
||||
let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
|
||||
let task = Arc::new(PageServerTask {
|
||||
task_id: PageserverTaskId(task_id),
|
||||
kind,
|
||||
name: name.to_string(),
|
||||
cancel: cancel.clone(),
|
||||
mutable: Mutex::new(MutableTaskState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
join_handle: None,
|
||||
}),
|
||||
});
|
||||
|
||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
||||
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
|
||||
let task_name = name.to_string();
|
||||
let task_cloned = Arc::clone(&task);
|
||||
let join_handle = runtime.spawn(task_wrapper(
|
||||
task_name,
|
||||
task_id,
|
||||
task_cloned,
|
||||
cancel,
|
||||
shutdown_process_on_error,
|
||||
future,
|
||||
));
|
||||
task_mut.join_handle = Some(join_handle);
|
||||
drop(task_mut);
|
||||
|
||||
// The task is now running. Nothing more to do here
|
||||
PageserverTaskId(task_id)
|
||||
runtime.spawn(task_wrapper(task_name, shutdown_process_on_panic, future))
|
||||
}
|
||||
|
||||
/// This wrapper function runs in a newly-spawned task. It initializes the
|
||||
/// task-local variables and calls the payload function.
|
||||
async fn task_wrapper<F>(
|
||||
task_name: String,
|
||||
task_id: u64,
|
||||
task: Arc<PageServerTask>,
|
||||
shutdown_token: CancellationToken,
|
||||
shutdown_process_on_error: bool,
|
||||
future: F,
|
||||
) where
|
||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
||||
/// This wrapper function runs in a newly-spawned task, in order to handle panics.
|
||||
async fn task_wrapper<F, R>(task_name: String, shutdown_process_on_panic: bool, future: F) -> R
|
||||
where
|
||||
F: Future<Output = R> + Send + 'static,
|
||||
{
|
||||
debug!("Starting task '{}'", task_name);
|
||||
|
||||
let result = SHUTDOWN_TOKEN
|
||||
.scope(
|
||||
shutdown_token,
|
||||
CURRENT_TASK.scope(task, {
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
// unwinding that would expose us to unwind-unsafe behavior.
|
||||
AssertUnwindSafe(future).catch_unwind()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
task_finish(result, task_name, task_id, shutdown_process_on_error).await;
|
||||
}
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
// unwinding that would expose us to unwind-unsafe behavior.
|
||||
let result = AssertUnwindSafe(future).catch_unwind().await;
|
||||
|
||||
async fn task_finish(
|
||||
result: std::result::Result<
|
||||
anyhow::Result<()>,
|
||||
std::boxed::Box<dyn std::any::Any + std::marker::Send>,
|
||||
>,
|
||||
task_name: String,
|
||||
task_id: u64,
|
||||
shutdown_process_on_error: bool,
|
||||
) {
|
||||
// Remove our entry from the global hashmap.
|
||||
let task = TASKS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&task_id)
|
||||
.expect("no task in registry");
|
||||
|
||||
let mut shutdown_process = false;
|
||||
{
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
debug!("Task '{}' exited normally", task_name);
|
||||
}
|
||||
Ok(Err(err)) => {
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
);
|
||||
}
|
||||
// Handle panics
|
||||
match result {
|
||||
Ok(result) => {
|
||||
debug!("Task '{}' exited normally", task_name);
|
||||
result
|
||||
}
|
||||
Err(err) => {
|
||||
if shutdown_process_on_panic {
|
||||
error!("Shutting down: task '{}' panicked: {:?}", task_name, err);
|
||||
shutdown_pageserver(1).await;
|
||||
unreachable!();
|
||||
} else {
|
||||
error!("Task '{}' panicked: {:?}", task_name, err);
|
||||
resume_unwind(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if shutdown_process {
|
||||
shutdown_pageserver(1).await;
|
||||
}
|
||||
}
|
||||
|
||||
// expected to be called from the task of the given id.
|
||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
||||
CURRENT_TASK.with(|ct| {
|
||||
let mut task_mut = ct.mutable.lock().unwrap();
|
||||
task_mut.tenant_id = tenant_id;
|
||||
task_mut.timeline_id = timeline_id;
|
||||
});
|
||||
}
|
||||
|
||||
/// Is there a task running that matches the criteria
|
||||
|
||||
/// Signal and wait for tasks to shut down.
|
||||
///
|
||||
/// Perform pageserver shutdown. This is called on receiving a signal,
|
||||
/// or if one of the tasks marked as 'shutdown_process_on_error' dies.
|
||||
///
|
||||
/// The arguments are used to select the tasks to kill. Any None arguments are
|
||||
/// ignored. For example, to shut down all WalReceiver tasks:
|
||||
///
|
||||
/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None)
|
||||
///
|
||||
/// Or to shut down all tasks for given timeline:
|
||||
///
|
||||
/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
|
||||
///
|
||||
pub async fn shutdown_tasks(
|
||||
kind: Option<TaskKind>,
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
) {
|
||||
let mut victim_tasks = Vec::new();
|
||||
/// This never returns.
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
context::shutdown_tasks(TaskKind::LibpqEndpointListener).await;
|
||||
|
||||
{
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
victim_tasks.push(Arc::clone(task));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Shut down all tenants gracefully
|
||||
crate::tenant::mgr::shutdown_all_tenants().await;
|
||||
|
||||
for task in victim_tasks {
|
||||
let join_handle = {
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
let join_handle = task_mut.join_handle.take();
|
||||
drop(task_mut);
|
||||
join_handle
|
||||
};
|
||||
if let Some(join_handle) = join_handle {
|
||||
let _ = join_handle.await;
|
||||
} else {
|
||||
// Possibly one of:
|
||||
// * The task had not even fully started yet.
|
||||
// * It was shut down concurrently and already exited
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_task_kind() -> Option<TaskKind> {
|
||||
CURRENT_TASK.try_with(|ct| ct.kind).ok()
|
||||
}
|
||||
|
||||
pub fn current_task_id() -> Option<PageserverTaskId> {
|
||||
CURRENT_TASK.try_with(|ct| ct.task_id).ok()
|
||||
}
|
||||
|
||||
/// A Future that can be used to check if the current task has been requested to
|
||||
/// shut down.
|
||||
pub async fn shutdown_watcher() {
|
||||
let token = SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
token.cancelled().await;
|
||||
}
|
||||
|
||||
/// Clone the current task's cancellation token, which can be moved across tasks.
|
||||
///
|
||||
/// When the task which is currently executing is shutdown, the cancellation token will be
|
||||
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
|
||||
/// `tokio::task::JoinSet::spawn`.
|
||||
pub fn shutdown_token() -> CancellationToken {
|
||||
SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_token() called in an unexpected task or thread")
|
||||
}
|
||||
|
||||
/// Has the current task been requested to shut down?
|
||||
pub fn is_shutdown_requested() -> bool {
|
||||
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
|
||||
cancel.is_cancelled()
|
||||
} else {
|
||||
if !cfg!(test) {
|
||||
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
||||
}
|
||||
false
|
||||
}
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
context::shutdown_tasks(TaskKind::HttpEndpointListener).await;
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
context::shutdown_all_tasks().await;
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||
|
||||
@@ -260,8 +260,10 @@ where
|
||||
/// contain the version, even if it's missing from the returned
|
||||
/// layer.
|
||||
///
|
||||
/// NOTE: This only searches the 'historic' layers, *not* the
|
||||
/// 'open' and 'frozen' layers!
|
||||
///
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
|
||||
// linear search
|
||||
// Find the latest image layer that covers the given key
|
||||
let mut latest_img: Option<Arc<L>> = None;
|
||||
let mut latest_img_lsn: Option<Lsn> = None;
|
||||
|
||||
@@ -8,6 +8,8 @@ use std::sync::Arc;
|
||||
use tokio::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
@@ -16,9 +18,9 @@ use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant::{Tenant, TenantRequestContext, TenantState};
|
||||
use crate::IGNORED_TENANT_FILE_NAME;
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
@@ -181,25 +183,11 @@ pub async fn shutdown_all_tenants() {
|
||||
tenants_to_shut_down
|
||||
};
|
||||
|
||||
// Shut down all existing walreceiver connections and stop accepting the new ones.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
|
||||
|
||||
// Ok, no background tasks running anymore. Flush any remaining data in
|
||||
// memory to disk.
|
||||
//
|
||||
// We assume that any incoming connections that might request pages from
|
||||
// the tenant have already been terminated by the caller, so there
|
||||
// should be no more activity in any of the repositories.
|
||||
//
|
||||
// On error, log it but continue with the shutdown for other tenants.
|
||||
for tenant in tenants_to_shut_down {
|
||||
let tenant_id = tenant.tenant_id();
|
||||
debug!("shutdown tenant {tenant_id}");
|
||||
|
||||
if let Err(err) = tenant.freeze_and_flush().await {
|
||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
||||
}
|
||||
let mut shutdown_futures: FuturesUnordered<_> = FuturesUnordered::new();
|
||||
for tenant in tenants_to_shut_down.iter() {
|
||||
shutdown_futures.push(tenant.graceful_shutdown(true));
|
||||
}
|
||||
while let Some(_result) = shutdown_futures.next().await {}
|
||||
}
|
||||
|
||||
pub async fn create_tenant(
|
||||
@@ -234,36 +222,47 @@ pub async fn update_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
get_tenant(tenant_id, true)
|
||||
.await?
|
||||
.update_tenant_config(tenant_conf);
|
||||
let (tenant, _ctx) = get_active_tenant(tenant_id, ctx).await?;
|
||||
|
||||
tenant.update_tenant_config(tenant_conf);
|
||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
pub async fn get_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
parent_ctx: &RequestContext,
|
||||
) -> anyhow::Result<(Arc<Tenant>, TenantRequestContext)> {
|
||||
let tenant = get_tenant(tenant_id).await?;
|
||||
let tenant_ctx = match tenant.get_context(parent_ctx) {
|
||||
Ok(ctx) => ctx,
|
||||
Err(state) => anyhow::bail!("Tenant {} is not active, state: {:?}", tenant_id, state,),
|
||||
};
|
||||
Ok((tenant, tenant_ctx))
|
||||
}
|
||||
|
||||
pub async fn get_tenant(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = TENANTS.read().await;
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
anyhow::bail!(
|
||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
||||
tenant.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
|
||||
pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
match get_tenant(tenant_id, true).await {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id).await?;
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
match get_active_tenant(tenant_id, ctx).await {
|
||||
Ok((tenant, ctx)) => {
|
||||
tenant.delete_timeline(timeline_id, &ctx).await?;
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
@@ -395,27 +394,31 @@ where
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||
// tenant-wide cleanup operations may take some time (removing the entire tenant directory), we want to
|
||||
// avoid holding the lock for the entire process.
|
||||
{
|
||||
let tenant = {
|
||||
let tenants_accessor = TENANTS.write().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => match tenant.current_state() {
|
||||
TenantState::Attaching
|
||||
| TenantState::Loading
|
||||
| TenantState::Broken
|
||||
| TenantState::Active => tenant.set_stopping(),
|
||||
| TenantState::Active => {
|
||||
tenant.set_stopping();
|
||||
Arc::clone(tenant)
|
||||
}
|
||||
TenantState::Stopping => {
|
||||
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
||||
}
|
||||
},
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// shutdown all tenant and timeline tasks (gc, compaction, page service)
|
||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
// Shut down all tenant and timeline tasks.
|
||||
tenant.graceful_shutdown(true).await;
|
||||
|
||||
// All tasks that operated on the tenant or any of its timelines have now finished,
|
||||
// and they are in Stopped state so that new ones cannot appear anymore. Proceed
|
||||
// with the cleanup.
|
||||
match tenant_cleanup
|
||||
.await
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
@@ -430,65 +433,10 @@ where
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => tenant.set_broken(),
|
||||
Some(tenant) => tenant.set_broken(&e.to_string()),
|
||||
None => warn!("Tenant {tenant_id} got removed from memory"),
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
// Run in task_mgr to avoid race with detach operation
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send gc result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
);
|
||||
|
||||
// hold the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
//! unless the pageserver is configured without remote storage.
|
||||
//!
|
||||
//! We allocate the client instance in [Timeline][`crate::tenant::Timeline`], i.e.,
|
||||
//! either in [`crate::tenant_mgr`] during startup or when creating a new
|
||||
//! either in [`crate::tenant::mgr`] during startup or when creating a new
|
||||
//! timeline.
|
||||
//! However, the client does not become ready for use until we've initialized its upload queue:
|
||||
//!
|
||||
@@ -214,7 +214,8 @@ use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{info, warn};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -225,12 +226,12 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
tenant::metadata::TimelineMetadata,
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
tenant::TimelineRequestContext,
|
||||
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
||||
};
|
||||
|
||||
@@ -298,8 +299,8 @@ impl RemoteTimelineClient {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<RemoteTimelineClient> {
|
||||
Ok(RemoteTimelineClient {
|
||||
) -> RemoteTimelineClient {
|
||||
RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: &BACKGROUND_RUNTIME,
|
||||
tenant_id,
|
||||
@@ -307,31 +308,56 @@ impl RemoteTimelineClient {
|
||||
storage_impl: remote_storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize the upload queue for a remote storage that already received
|
||||
/// an index file upload, i.e., it's not empty.
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
pub fn init_upload_queue(
|
||||
self: &Arc<Self>,
|
||||
index_part: &IndexPart,
|
||||
upload_ctx: TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancellation_token = upload_ctx.cancellation_token().clone();
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, upload_ctx)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.spawn_cancellation_watch(cancellation_token);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initialize the upload queue for the case where the remote storage is empty,
|
||||
/// i.e., it doesn't have an `IndexPart`.
|
||||
pub fn init_upload_queue_for_empty_remote(
|
||||
&self,
|
||||
self: &Arc<Self>,
|
||||
local_metadata: &TimelineMetadata,
|
||||
upload_ctx: TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancellation_token = upload_ctx.cancellation_token().clone();
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
upload_queue.initialize_empty_remote(local_metadata, upload_ctx)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
self.spawn_cancellation_watch(cancellation_token);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn a task that calls `stop` on cancellation. It's important that we
|
||||
/// stop the upload queue promptly, because it holds onto the RequestContext,
|
||||
/// which in turn prevents the Timeline from shutting down.
|
||||
fn spawn_cancellation_watch(self: &Arc<Self>, cancellation_token: CancellationToken) {
|
||||
let self_rc = Arc::clone(self);
|
||||
task_mgr::spawn(
|
||||
self.runtime.handle(),
|
||||
"remote upload queue cancellation watch",
|
||||
false,
|
||||
async move {
|
||||
cancellation_token.cancelled().await;
|
||||
self_rc.stop();
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
|
||||
match &*self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
@@ -367,6 +393,10 @@ impl RemoteTimelineClient {
|
||||
|
||||
/// Download index file
|
||||
pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
|
||||
let _unfinished_gauge_guard = self
|
||||
.metrics
|
||||
.call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
|
||||
|
||||
download::download_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -393,22 +423,27 @@ impl RemoteTimelineClient {
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await?;
|
||||
let downloaded_size = {
|
||||
let _unfinished_gauge_guard = self
|
||||
.metrics
|
||||
.call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
|
||||
download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
|
||||
// Update the metadata for given layer file. The remote index file
|
||||
// might be missing some information for the file; this allows us
|
||||
@@ -517,7 +552,7 @@ impl RemoteTimelineClient {
|
||||
metadata_bytes,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
|
||||
|
||||
@@ -549,7 +584,7 @@ impl RemoteTimelineClient {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
|
||||
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
info!(
|
||||
@@ -601,7 +636,7 @@ impl RemoteTimelineClient {
|
||||
// schedule the actual deletions
|
||||
for name in names {
|
||||
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
}
|
||||
@@ -616,7 +651,10 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
/// Wait for all previously scheduled uploads/deletions to complete
|
||||
///
|
||||
pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
pub async fn wait_completion(
|
||||
self: &Arc<Self>,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let (sender, mut receiver) = tokio::sync::watch::channel(());
|
||||
let barrier_op = UploadOp::Barrier(sender);
|
||||
|
||||
@@ -630,9 +668,16 @@ impl RemoteTimelineClient {
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
}
|
||||
|
||||
if receiver.changed().await.is_err() {
|
||||
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
||||
}
|
||||
tokio::select! {
|
||||
result = receiver.changed() => {
|
||||
if result.is_err() {
|
||||
anyhow::bail!("wait_completion aborted because upload queue was stopped");
|
||||
}
|
||||
},
|
||||
_ = ctx.cancelled() => {
|
||||
anyhow::bail!("request cancelled while waiting on uploads to finish");
|
||||
},
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -675,7 +720,7 @@ impl RemoteTimelineClient {
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
info!("starting op: {}", next_op);
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters
|
||||
match next_op {
|
||||
@@ -710,16 +755,15 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Spawn task to perform the task
|
||||
let self_rc = Arc::clone(self);
|
||||
|
||||
let cancellation_token = upload_queue.upload_ctx.cancellation_token().clone();
|
||||
|
||||
task_mgr::spawn(
|
||||
self.runtime.handle(),
|
||||
TaskKind::RemoteUploadTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
"remote upload",
|
||||
false,
|
||||
async move {
|
||||
self_rc.perform_upload_task(task).await;
|
||||
Ok(())
|
||||
self_rc.perform_upload_task(task, cancellation_token).await;
|
||||
}
|
||||
.instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)),
|
||||
);
|
||||
@@ -739,7 +783,11 @@ impl RemoteTimelineClient {
|
||||
/// The task can be shut down, however. That leads to stopping the whole
|
||||
/// queue.
|
||||
///
|
||||
async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
|
||||
async fn perform_upload_task(
|
||||
self: &Arc<Self>,
|
||||
task: Arc<UploadTask>,
|
||||
cancellation_token: CancellationToken,
|
||||
) {
|
||||
// Loop to retry until it completes.
|
||||
loop {
|
||||
// If we're requested to shut down, close up shop and exit.
|
||||
@@ -747,13 +795,13 @@ impl RemoteTimelineClient {
|
||||
// Note: We only check for the shutdown requests between retries, so
|
||||
// if a shutdown request arrives while we're busy uploading, in the
|
||||
// upload::upload:*() call below, we will wait and not exit until it has
|
||||
// finisheed. We probably could cancel the upload by simply dropping
|
||||
// finished. We probably could cancel the upload by simply dropping
|
||||
// the Future, but we're not 100% sure if the remote storage library
|
||||
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
||||
// upload finishes or times out soon enough.
|
||||
if task_mgr::is_shutdown_requested() {
|
||||
if cancellation_token.is_cancelled() {
|
||||
info!("upload task cancelled by shutdown request");
|
||||
self.update_upload_queue_unfinished_metric(-1, &task.op);
|
||||
self.calls_unfinished_metric_end(&task.op);
|
||||
self.stop();
|
||||
return;
|
||||
}
|
||||
@@ -849,7 +897,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// sleep until it's time to retry, or we're cancelled
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => { },
|
||||
_ = cancellation_token.cancelled() => { },
|
||||
_ = exponential_backoff(
|
||||
retries,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
@@ -867,7 +915,7 @@ impl RemoteTimelineClient {
|
||||
task.op, retries
|
||||
);
|
||||
} else {
|
||||
info!("remote task {} completed successfully", task.op);
|
||||
debug!("remote task {} completed successfully", task.op);
|
||||
}
|
||||
|
||||
// The task has completed successfully. Remove it from the in-progress list.
|
||||
@@ -901,22 +949,40 @@ impl RemoteTimelineClient {
|
||||
// Launch any queued tasks that were unblocked by this one.
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
}
|
||||
self.update_upload_queue_unfinished_metric(-1, &task.op);
|
||||
self.calls_unfinished_metric_end(&task.op);
|
||||
}
|
||||
|
||||
fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) {
|
||||
let (file_kind, op_kind) = match op {
|
||||
fn calls_unfinished_metric_impl(
|
||||
&self,
|
||||
op: &UploadOp,
|
||||
) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
|
||||
let res = match op {
|
||||
UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
|
||||
UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
|
||||
UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
|
||||
UploadOp::Barrier(_) => {
|
||||
// we do not account these
|
||||
return;
|
||||
return None;
|
||||
}
|
||||
};
|
||||
self.metrics
|
||||
.unfinished_tasks(&file_kind, &op_kind)
|
||||
.add(delta)
|
||||
Some(res)
|
||||
}
|
||||
|
||||
/// Mark the start of `op` in the metrics.
///
/// Operations for which `calls_unfinished_metric_impl` returns `None` are not
/// accounted and are ignored here.
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
    if let Some((file_kind, op_kind)) = self.calls_unfinished_metric_impl(op) {
        let guard = self.metrics.call_begin(&file_kind, &op_kind);
        // The matching `call_end` is issued by `calls_unfinished_metric_end()`,
        // not by this guard.
        guard.will_decrement_manually();
    }
}
|
||||
|
||||
/// Mark the end of `op` in the metrics; the counterpart of
/// `calls_unfinished_metric_begin`.
///
/// Operations for which `calls_unfinished_metric_impl` returns `None` are not
/// accounted and are ignored here.
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
    if let Some((file_kind, op_kind)) = self.calls_unfinished_metric_impl(op) {
        self.metrics.call_end(&file_kind, &op_kind);
    }
}
|
||||
|
||||
fn stop(&self) {
|
||||
@@ -967,7 +1033,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Tear down queued ops
|
||||
for op in qi.queued_operations.into_iter() {
|
||||
self.update_upload_queue_unfinished_metric(-1, &op);
|
||||
self.calls_unfinished_metric_end(&op);
|
||||
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
|
||||
// which is exactly what we want to happen.
|
||||
drop(op);
|
||||
@@ -983,7 +1049,9 @@ impl RemoteTimelineClient {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use std::{collections::HashSet, path::Path};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -1002,7 +1070,7 @@ mod tests {
|
||||
Lsn(0),
|
||||
// Any version will do
|
||||
// but it should be consistent with the one in the tests
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
DEFAULT_PG_VERSION,
|
||||
);
|
||||
|
||||
// go through serialize + deserialize to fix the header, including checksum
|
||||
@@ -1037,9 +1105,19 @@ mod tests {
|
||||
// Test scheduling
|
||||
#[test]
|
||||
fn upload_scheduling() -> anyhow::Result<()> {
|
||||
// Use a current-thread runtime in the test
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let _entered = runtime.enter();
|
||||
|
||||
let harness = TenantHarness::create("upload_scheduling")?;
|
||||
let (tenant, tenant_ctx) = runtime.block_on(harness.load());
|
||||
let (_timeline, timeline_ctx) =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &tenant_ctx)?;
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
std::fs::create_dir_all(&timeline_path)?;
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
@@ -1057,14 +1135,6 @@ mod tests {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
|
||||
// Use a current-thread runtime in the test
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let _entered = runtime.enter();
|
||||
|
||||
// Test outline:
|
||||
//
|
||||
// Schedule upload of a bunch of layers. Check that they are started immediately, not queued
|
||||
@@ -1100,7 +1170,11 @@ mod tests {
|
||||
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client.init_upload_queue_for_empty_remote(&metadata)?;
|
||||
let upload_ctx = timeline_ctx.register_another(RequestContext::new(
|
||||
TaskKind::RemoteUploadTask,
|
||||
DownloadBehavior::Error,
|
||||
));
|
||||
client.init_upload_queue_for_empty_remote(&metadata, upload_ctx)?;
|
||||
|
||||
// Create a couple of dummy files, schedule upload for them
|
||||
let content_foo = dummy_contents("foo");
|
||||
@@ -1140,7 +1214,7 @@ mod tests {
|
||||
}
|
||||
|
||||
// Wait for the uploads to finish
|
||||
runtime.block_on(client.wait_completion())?;
|
||||
runtime.block_on(client.wait_completion(&timeline_ctx))?;
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
@@ -1177,7 +1251,7 @@ mod tests {
|
||||
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);
|
||||
|
||||
// Finish them
|
||||
runtime.block_on(client.wait_completion())?;
|
||||
runtime.block_on(client.wait_completion(&timeline_ctx))?;
|
||||
|
||||
assert_remote_files(&["bar", "baz", "index_part.json"], &remote_timeline_dir);
|
||||
|
||||
|
||||
@@ -8,10 +8,9 @@ use std::future::Future;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::{debug, error, info, info_span, warn, Instrument};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -175,7 +174,7 @@ pub async fn list_remote_timelines<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
|
||||
) -> anyhow::Result<HashSet<TimelineId>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = conf.remote_path(&tenant_path)?;
|
||||
|
||||
@@ -194,7 +193,6 @@ pub async fn list_remote_timelines<'a>(
|
||||
}
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut part_downloads = FuturesUnordered::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
@@ -205,35 +203,22 @@ pub async fn list_remote_timelines<'a>(
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
|
||||
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
|
||||
// yet, launch a download task for it.
|
||||
if !timeline_ids.contains(&timeline_id) {
|
||||
timeline_ids.insert(timeline_id);
|
||||
let storage_clone = storage.clone();
|
||||
part_downloads.push(async move {
|
||||
(
|
||||
timeline_id,
|
||||
download_index_part(conf, &storage_clone, tenant_id, timeline_id)
|
||||
.instrument(info_span!("download_index_part", timeline=%timeline_id))
|
||||
.await,
|
||||
)
|
||||
});
|
||||
}
|
||||
// list_prefixes is assumed to return unique names. Ensure this here.
|
||||
// NB: it's safer to bail out than warn-log this because the pageserver
|
||||
// needs to absolutely know about _all_ timelines that exist, so that
|
||||
// GC knows all the branchpoints. If we skipped over a timeline instead,
|
||||
// GC could delete a layer that's still needed by that timeline.
|
||||
anyhow::ensure!(
|
||||
!timeline_ids.contains(&timeline_id),
|
||||
"list_prefixes contains duplicate timeline id {timeline_id}"
|
||||
);
|
||||
timeline_ids.insert(timeline_id);
|
||||
}
|
||||
|
||||
// Wait for all the download tasks to complete.
|
||||
let mut timeline_parts = Vec::new();
|
||||
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
|
||||
let index_part = part_upload_result
|
||||
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
|
||||
|
||||
debug!("Successfully fetched index part for timeline {timeline_id}");
|
||||
timeline_parts.push((timeline_id, index_part));
|
||||
}
|
||||
Ok(timeline_parts)
|
||||
Ok(timeline_ids)
|
||||
}
|
||||
|
||||
pub async fn download_index_part(
|
||||
pub(super) async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -83,11 +83,6 @@ where
|
||||
/// Additional metadata might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<L>,
|
||||
|
||||
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible.
|
||||
/// First pass is to move it to Optional and the next would be its removal
|
||||
missing_layers: Option<HashSet<L>>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
@@ -167,8 +162,6 @@ impl IndexPartUnclean {
|
||||
let IndexPartUnclean {
|
||||
version,
|
||||
timeline_layers,
|
||||
// this is an unused field, ignore it on cleaning
|
||||
missing_layers: _,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
@@ -189,7 +182,6 @@ impl IndexPartUnclean {
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
missing_layers: None,
|
||||
layer_metadata: layer_metadata
|
||||
.into_iter()
|
||||
.filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
|
||||
@@ -225,7 +217,6 @@ impl IndexPart {
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers: Some(HashSet::new()),
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
@@ -259,7 +250,6 @@ mod tests {
|
||||
fn v0_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
@@ -267,7 +257,6 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
missing_layers: None, // disabled fields should not carry unused values further
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
@@ -283,7 +272,6 @@ mod tests {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
@@ -296,7 +284,6 @@ mod tests {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
missing_layers: None,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
@@ -322,6 +309,7 @@ mod tests {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["This shouldn't fail deserialization"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
@@ -346,7 +334,6 @@ mod tests {
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
missing_layers: None,
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
|
||||
|
||||
@@ -3,10 +3,9 @@ use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::sync::oneshot::error::RecvError;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::{PageReconstructError, TenantRequestContext, TimelineRequestContext};
|
||||
|
||||
use super::Tenant;
|
||||
use utils::id::TimelineId;
|
||||
@@ -63,13 +62,14 @@ pub(super) async fn gather_inputs(
|
||||
tenant: &Tenant,
|
||||
limit: &Arc<Semaphore>,
|
||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||
tenant_ctx: &TenantRequestContext,
|
||||
) -> anyhow::Result<ModelInputs> {
|
||||
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
|
||||
// our advantage with `?` error handling.
|
||||
let mut joinset = tokio::task::JoinSet::new();
|
||||
|
||||
let timelines = tenant
|
||||
.refresh_gc_info()
|
||||
.refresh_gc_info(tenant_ctx)
|
||||
.await
|
||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||
|
||||
@@ -97,9 +97,21 @@ pub(super) async fn gather_inputs(
|
||||
// used to determine the `retention_period` for the size model
|
||||
let mut max_cutoff_distance = None;
|
||||
|
||||
let mut ctx_dropguards: Vec<tokio_util::sync::DropGuard> = Vec::new();
|
||||
|
||||
for timeline in timelines {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
let ctx = match timeline.get_context(tenant_ctx) {
|
||||
Ok(ctx) => ctx,
|
||||
Err(state) => {
|
||||
info!("skipping tenant size calculation for timeline because it is in {state:?} state");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
ctx_dropguards.push(ctx.cancellation_token().clone().drop_guard());
|
||||
let ctx = Arc::new(ctx);
|
||||
|
||||
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
|
||||
// there's a race between the update (holding tenant.gc_lock) and this read but it
|
||||
// might not be an issue, because it's not for Timeline::gc
|
||||
@@ -169,19 +181,23 @@ pub(super) async fn gather_inputs(
|
||||
timeline_id: timeline.timeline_id,
|
||||
});
|
||||
|
||||
for (lsn, _kind) in &interesting_lsns {
|
||||
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
|
||||
for (lsn, _kind) in interesting_lsns.iter() {
|
||||
let lsn = *lsn;
|
||||
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, lsn)) {
|
||||
updates.push(Update {
|
||||
lsn: *lsn,
|
||||
lsn,
|
||||
timeline_id: timeline.timeline_id,
|
||||
command: Command::Update(*size),
|
||||
});
|
||||
|
||||
needed_cache.insert((timeline.timeline_id, *lsn));
|
||||
needed_cache.insert((timeline.timeline_id, lsn));
|
||||
} else {
|
||||
let timeline = Arc::clone(&timeline);
|
||||
let parallel_size_calcs = Arc::clone(limit);
|
||||
joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
|
||||
let ctx_clone = Arc::clone(&ctx);
|
||||
joinset.spawn(async move {
|
||||
calculate_logical_size(parallel_size_calcs, timeline, lsn, &ctx_clone).await
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -357,7 +373,7 @@ enum LsnKind {
|
||||
struct TimelineAtLsnSizeResult(
|
||||
Arc<crate::tenant::Timeline>,
|
||||
utils::lsn::Lsn,
|
||||
Result<u64, CalculateLogicalSizeError>,
|
||||
Result<u64, PageReconstructError>,
|
||||
);
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
|
||||
@@ -365,14 +381,13 @@ async fn calculate_logical_size(
|
||||
limit: Arc<tokio::sync::Semaphore>,
|
||||
timeline: Arc<crate::tenant::Timeline>,
|
||||
lsn: utils::lsn::Lsn,
|
||||
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> Result<TimelineAtLsnSizeResult, PageReconstructError> {
|
||||
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
.await
|
||||
.expect("global semaphore should not had been closed");
|
||||
.expect("global semaphore should not have been closed");
|
||||
|
||||
let size_res = timeline
|
||||
.spawn_ondemand_logical_size_calculation(lsn)
|
||||
.await?;
|
||||
let size_res = timeline.calculate_logical_size(lsn, ctx).await;
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||
}
|
||||
|
||||
|
||||
@@ -109,7 +109,7 @@ pub trait Layer: Send + Sync {
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call, or a struct with a cached older image of the page if one
|
||||
/// is available. If this returns PageReconstructResult::Continue, look up
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data(
|
||||
|
||||
@@ -1,45 +1,39 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as compaction and GC
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::task_mgr::BACKGROUND_RUNTIME;
|
||||
use crate::tenant::Tenant;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
pub fn start_background_loops(tenant_id: TenantId) {
|
||||
pub fn start_background_loops(tenant: &Arc<Tenant>) {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
|
||||
let tenant_clone = Arc::clone(tenant);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
&format!("compactor for tenant {tenant_id}"),
|
||||
false,
|
||||
async move {
|
||||
compaction_loop(tenant_id)
|
||||
compaction_loop(&tenant_clone)
|
||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
let tenant_clone = Arc::clone(tenant);
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
&format!("garbage collector for tenant {tenant_id}"),
|
||||
false,
|
||||
async move {
|
||||
gc_loop(tenant_id)
|
||||
gc_loop(&tenant_clone)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -47,25 +41,27 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
///
|
||||
/// Compaction task's main loop
|
||||
///
|
||||
async fn compaction_loop(tenant_id: TenantId) {
|
||||
async fn compaction_loop(tenant: &Arc<Tenant>) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let top_ctx = RequestContext::new(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
|
||||
let tenant_ctx = match tenant.get_context(&top_ctx) {
|
||||
Ok(ctx) => ctx,
|
||||
Err(state) => {
|
||||
// This could happen if the tenant is detached or the pageserver is shut
|
||||
// down immediately after loading or attaching completed and the tenant
|
||||
// was activated. It seems unlikely enough in practice that we better print
|
||||
// a warning, as it could also be a bug.
|
||||
error!("Not running compaction loop, tenant is not active: {state:?}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
},
|
||||
};
|
||||
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
@@ -73,7 +69,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration().await {
|
||||
if let Err(e) = tenant.compaction_iteration(&tenant_ctx).await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
@@ -81,9 +77,9 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = tenant_ctx.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break ;
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
}
|
||||
@@ -98,25 +94,28 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
async fn gc_loop(tenant_id: TenantId) {
|
||||
async fn gc_loop(tenant: &Arc<Tenant>) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
// cutoff specified as time.
|
||||
let top_ctx = RequestContext::new(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let tenant_ctx = match tenant.get_context(&top_ctx) {
|
||||
Ok(ctx) => ctx,
|
||||
Err(state) => {
|
||||
// This could happen if the tenant is detached or the pageserver is shut
|
||||
// down immediately after loading or attaching completed and the tenant
|
||||
// was activated. It seems unlikely enough in practice that we better print
|
||||
// a warning, as it could also be a bug.
|
||||
error!("Not running GC loop, tenant is not active: {state:?}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(tenant) => tenant,
|
||||
},
|
||||
};
|
||||
|
||||
let gc_period = tenant.get_gc_period();
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
@@ -127,7 +126,10 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await
|
||||
// Run a GC iteration
|
||||
if let Err(e) = tenant
|
||||
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &tenant_ctx)
|
||||
.await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
@@ -137,7 +139,7 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = tenant_ctx.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
@@ -149,46 +151,3 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
trace!("GC loop stopped.");
|
||||
}
|
||||
|
||||
async fn wait_for_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
wait: Duration,
|
||||
) -> ControlFlow<(), Arc<Tenant>> {
|
||||
let tenant = loop {
|
||||
match mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => break tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to get a tenant {tenant_id}: {e:#}");
|
||||
tokio::time::sleep(wait).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// if the tenant has a proper status already, no need to wait for anything
|
||||
if tenant.current_state() == TenantState::Active {
|
||||
ControlFlow::Continue(tenant)
|
||||
} else {
|
||||
let mut tenant_state_updates = tenant.subscribe_for_state_updates();
|
||||
loop {
|
||||
match tenant_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = *tenant_state_updates.borrow();
|
||||
match new_state {
|
||||
TenantState::Active => {
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
return ControlFlow::Continue(tenant);
|
||||
}
|
||||
state => {
|
||||
debug!("Not running the task loop, tenant is not active: {state:?}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => {
|
||||
info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,6 +4,7 @@ use super::storage_layer::LayerFileName;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::TimelineRequestContext;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
@@ -73,6 +74,13 @@ pub(crate) struct UploadQueueInitialized {
|
||||
/// tasks to finish. For example, metadata upload cannot be performed before all
|
||||
/// preceding layer file uploads have completed.
|
||||
pub(crate) queued_operations: VecDeque<UploadOp>,
|
||||
|
||||
/// Context used for the upload tasks. Note that this is associated with the
|
||||
/// Timeline, so this prevents the Timeline from being shut down. To ensure quick
|
||||
/// shutdown, RemoteTimelineClient spawns a task to wait for cancellation on the
|
||||
/// context and stop the queue. Otherwise we wouldn't notice the cancellation
|
||||
/// until next upload attempt.
|
||||
pub(crate) upload_ctx: TimelineRequestContext,
|
||||
}
|
||||
|
||||
pub(crate) struct UploadQueueStopped {
|
||||
@@ -83,6 +91,7 @@ impl UploadQueue {
|
||||
pub(crate) fn initialize_empty_remote(
|
||||
&mut self,
|
||||
metadata: &TimelineMetadata,
|
||||
upload_ctx: TimelineRequestContext,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => (),
|
||||
@@ -108,6 +117,7 @@ impl UploadQueue {
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
upload_ctx,
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -117,6 +127,7 @@ impl UploadQueue {
|
||||
pub(crate) fn initialize_with_current_remote_index_part(
|
||||
&mut self,
|
||||
index_part: &IndexPart,
|
||||
upload_ctx: TimelineRequestContext,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => (),
|
||||
@@ -153,6 +164,7 @@ impl UploadQueue {
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
upload_ctx,
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -31,6 +31,7 @@ use once_cell::sync::OnceCell;
|
||||
use std::future::Future;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
pub use connection_manager::spawn_connection_manager_task;
|
||||
@@ -76,7 +77,7 @@ pub fn is_broker_client_initialized() -> bool {
|
||||
|
||||
/// A handle of an asynchronous task.
|
||||
/// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`]
|
||||
/// and a cancellation channel that it can listen to for earlier interrupts.
|
||||
/// and a cancellation token that it can listen to for earlier interrupts.
|
||||
///
|
||||
/// Note that the communication happens via the `watch` channel, which does not accumulate the events, replacing the old one with the newer one on submission.
|
||||
/// That may lead to certain events not being observed by the listener.
|
||||
@@ -84,7 +85,7 @@ pub fn is_broker_client_initialized() -> bool {
|
||||
pub struct TaskHandle<E> {
|
||||
join_handle: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
|
||||
events_receiver: watch::Receiver<TaskStateUpdate<E>>,
|
||||
cancellation: watch::Sender<()>,
|
||||
cancellation: CancellationToken,
|
||||
}
|
||||
|
||||
pub enum TaskEvent<E> {
|
||||
@@ -102,20 +103,18 @@ pub enum TaskStateUpdate<E> {
|
||||
impl<E: Clone> TaskHandle<E> {
|
||||
/// Initializes the task, starting it immediately after the creation.
|
||||
pub fn spawn<Fut>(
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, watch::Receiver<()>) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>) -> Fut + Send + 'static,
|
||||
cancellation: CancellationToken,
|
||||
) -> Self
|
||||
where
|
||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||
E: Send + Sync + 'static,
|
||||
{
|
||||
let (cancellation, cancellation_receiver) = watch::channel(());
|
||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||
|
||||
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
|
||||
events_sender.send(TaskStateUpdate::Started).ok();
|
||||
task(events_sender, cancellation_receiver).await
|
||||
task(events_sender).await
|
||||
});
|
||||
|
||||
TaskHandle {
|
||||
@@ -157,7 +156,7 @@ impl<E: Clone> TaskHandle<E> {
|
||||
/// Aborts current task, waiting for it to finish.
|
||||
pub async fn shutdown(self) {
|
||||
if let Some(jh) = self.join_handle {
|
||||
self.cancellation.send(()).ok();
|
||||
self.cancellation.cancel();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
|
||||
@@ -11,9 +11,9 @@
|
||||
|
||||
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::context::{DownloadBehavior, RequestContext, TaskKind};
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::{Timeline, TimelineRequestContext};
|
||||
use crate::{task_mgr, walreceiver::TaskStateUpdate};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
@@ -46,6 +46,7 @@ pub fn spawn_connection_manager_task(
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
auth_token: Option<Arc<String>>,
|
||||
ctx: TimelineRequestContext,
|
||||
) {
|
||||
let mut broker_client = get_broker_client().clone();
|
||||
|
||||
@@ -54,9 +55,6 @@ pub fn spawn_connection_manager_task(
|
||||
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverManager,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
@@ -70,20 +68,21 @@ pub fn spawn_connection_manager_task(
|
||||
);
|
||||
loop {
|
||||
select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = ctx.cancelled() => {
|
||||
info!("WAL receiver shutdown requested, shutting down");
|
||||
walreceiver_state.shutdown().await;
|
||||
return Ok(());
|
||||
return;
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut walreceiver_state,
|
||||
&ctx,
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
ControlFlow::Break(()) => {
|
||||
info!("Connection manager loop ended, shutting down");
|
||||
walreceiver_state.shutdown().await;
|
||||
return Ok(());
|
||||
return;
|
||||
}
|
||||
},
|
||||
}
|
||||
@@ -101,6 +100,7 @@ pub fn spawn_connection_manager_task(
|
||||
async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
ctx: &TimelineRequestContext,
|
||||
) -> ControlFlow<(), ()> {
|
||||
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
|
||||
|
||||
@@ -226,6 +226,7 @@ async fn connection_manager_loop_step(
|
||||
.change_connection(
|
||||
new_candidate.safekeeper_id,
|
||||
new_candidate.wal_source_connconf,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -389,26 +390,38 @@ impl WalreceiverState {
|
||||
&mut self,
|
||||
new_sk_id: NodeId,
|
||||
new_wal_source_connconf: PgConnectionConfig,
|
||||
ctx: &TimelineRequestContext,
|
||||
) {
|
||||
self.drop_old_connection(true).await;
|
||||
|
||||
let id = self.id;
|
||||
let connect_timeout = self.wal_connect_timeout;
|
||||
let timeline = Arc::clone(&self.timeline);
|
||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
||||
async move {
|
||||
super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
new_wal_source_connconf,
|
||||
events_sender,
|
||||
cancellation,
|
||||
connect_timeout,
|
||||
)
|
||||
.await
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
||||
});
|
||||
|
||||
let child_ctx = ctx.register_another(RequestContext::with_parent(
|
||||
TaskKind::WalReceiverConnection,
|
||||
DownloadBehavior::Download,
|
||||
ctx,
|
||||
));
|
||||
let cancellation_token = child_ctx.cancellation_token().clone();
|
||||
|
||||
let connection_handle = TaskHandle::spawn(
|
||||
move |events_sender| {
|
||||
async move {
|
||||
super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
new_wal_source_connconf,
|
||||
events_sender,
|
||||
connect_timeout,
|
||||
child_ctx,
|
||||
)
|
||||
.await
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
||||
},
|
||||
cancellation_token,
|
||||
);
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
self.wal_connection = Some(WalConnection {
|
||||
@@ -820,6 +833,7 @@ fn wal_stream_connection_config(
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use url::Host;
|
||||
|
||||
fn dummy_broker_sk_timeline(
|
||||
@@ -900,12 +914,15 @@ mod tests {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
connection_task: TaskHandle::spawn(
|
||||
move |sender| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
},
|
||||
CancellationToken::new(),
|
||||
),
|
||||
discovered_new_wal: None,
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
@@ -1062,12 +1079,15 @@ mod tests {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
connection_task: TaskHandle::spawn(
|
||||
move |sender| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
},
|
||||
CancellationToken::new(),
|
||||
),
|
||||
discovered_new_wal: None,
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
@@ -1127,12 +1147,15 @@ mod tests {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
connection_task: TaskHandle::spawn(
|
||||
move |sender| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
},
|
||||
CancellationToken::new(),
|
||||
),
|
||||
discovered_new_wal: None,
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
@@ -1189,7 +1212,10 @@ mod tests {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
||||
connection_task: TaskHandle::spawn(
|
||||
move |_| async move { Ok(()) },
|
||||
CancellationToken::new(),
|
||||
),
|
||||
discovered_new_wal: Some(NewCommittedWAL {
|
||||
discovered_at: time_over_threshold,
|
||||
lsn: new_lsn,
|
||||
@@ -1233,18 +1259,18 @@ mod tests {
|
||||
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
||||
|
||||
async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
||||
let (tenant, tenant_ctx) = harness.load().await;
|
||||
let (timeline, timeline_ctx) = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &tenant_ctx)
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
||||
let timeline = timeline.initialize(&timeline_ctx).unwrap();
|
||||
|
||||
WalreceiverState {
|
||||
id: TenantTimelineId {
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
},
|
||||
timeline: harness
|
||||
.load()
|
||||
.await
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION)
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager")
|
||||
.initialize()
|
||||
.unwrap(),
|
||||
timeline,
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Actual Postgres connection handler to stream WAL to the server.
|
||||
|
||||
use std::{
|
||||
error::Error,
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -11,7 +12,7 @@ use bytes::BytesMut;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use fail::fail_point;
|
||||
use futures::StreamExt;
|
||||
use postgres::{SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
@@ -20,21 +21,18 @@ use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use crate::{
|
||||
metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
|
||||
};
|
||||
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
|
||||
use crate::{
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
tenant::{Timeline, WalReceiverInfo},
|
||||
tenant::{Timeline, TimelineRequestContext, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
};
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -60,8 +58,8 @@ pub async fn handle_walreceiver_connection(
|
||||
timeline: Arc<Timeline>,
|
||||
wal_source_connconf: PgConnectionConfig,
|
||||
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||
mut cancellation: watch::Receiver<()>,
|
||||
connect_timeout: Duration,
|
||||
ctx: TimelineRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {wal_source_connconf:?}");
|
||||
@@ -70,10 +68,17 @@ pub async fn handle_walreceiver_connection(
|
||||
let mut config = wal_source_connconf.to_tokio_postgres_config();
|
||||
config.application_name("pageserver");
|
||||
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
time::timeout(connect_timeout, config.connect(postgres::NoTls))
|
||||
.await
|
||||
.context("Timed out while waiting for walreceiver connection to open")?
|
||||
.context("Failed to open walreceiver connection")?
|
||||
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
|
||||
Ok(Ok(client_and_conn)) => client_and_conn,
|
||||
Ok(Err(conn_err)) => {
|
||||
let expected_error = ignore_expected_errors(conn_err)?;
|
||||
info!("DB connection stream finished: {expected_error}");
|
||||
return Ok(());
|
||||
}
|
||||
Err(elapsed) => anyhow::bail!(
|
||||
"Timed out while waiting {elapsed} for walreceiver connection to open"
|
||||
),
|
||||
}
|
||||
};
|
||||
|
||||
info!("connected!");
|
||||
@@ -92,12 +97,9 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
let mut connection_cancellation = cancellation.clone();
|
||||
let cancellation_token = ctx.cancellation_token().clone();
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverConnection,
|
||||
Some(timeline.tenant_id),
|
||||
Some(timeline.timeline_id),
|
||||
"walreceiver connection",
|
||||
false,
|
||||
async move {
|
||||
@@ -105,17 +107,14 @@ pub async fn handle_walreceiver_connection(
|
||||
connection_result = connection => match connection_result{
|
||||
Ok(()) => info!("Walreceiver db connection closed"),
|
||||
Err(connection_error) => {
|
||||
if connection_error.is_closed() {
|
||||
info!("Connection closed regularly: {connection_error}")
|
||||
} else {
|
||||
warn!("Connection aborted: {connection_error}")
|
||||
if let Err(e) = ignore_expected_errors(connection_error) {
|
||||
warn!("Connection aborted: {e:#}")
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
_ = connection_cancellation.changed() => info!("Connection cancelled"),
|
||||
_ = cancellation_token.cancelled() => info!("Connection cancelled"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
@@ -175,12 +174,13 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest =
|
||||
with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
|
||||
let cancellation = ctx.cancellation_token().clone();
|
||||
|
||||
while let Some(replication_message) = {
|
||||
select! {
|
||||
_ = cancellation.changed() => {
|
||||
_ = cancellation.cancelled() => {
|
||||
info!("walreceiver interrupted");
|
||||
None
|
||||
}
|
||||
@@ -190,14 +190,9 @@ pub async fn handle_walreceiver_connection(
|
||||
let replication_message = match replication_message {
|
||||
Ok(message) => message,
|
||||
Err(replication_error) => {
|
||||
if replication_error.is_closed() {
|
||||
info!("Replication stream got closed");
|
||||
return Ok(());
|
||||
} else {
|
||||
return Err(
|
||||
anyhow::Error::new(replication_error).context("replication stream error")
|
||||
);
|
||||
}
|
||||
let expected_error = ignore_expected_errors(replication_error)?;
|
||||
info!("Replication stream finished: {expected_error}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -251,16 +246,10 @@ pub async fn handle_walreceiver_connection(
|
||||
// at risk of hitting a deadlock.
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
with_ondemand_download(|| {
|
||||
walingest.ingest_record(
|
||||
recdata.clone(),
|
||||
lsn,
|
||||
&mut modification,
|
||||
&mut decoded,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {lsn}"))?;
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {lsn}"))?;
|
||||
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
@@ -335,10 +324,11 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
// Send the replication feedback message.
|
||||
// Regular standby_status_update fields are put into this message.
|
||||
let (timeline_logical_size, _) = timeline
|
||||
.get_current_logical_size(&ctx)
|
||||
.context("Status update creation failed to get current logical size")?;
|
||||
let status_update = ReplicationFeedback {
|
||||
current_timeline_size: timeline
|
||||
.get_current_logical_size()
|
||||
.context("Status update creation failed to get current logical size")?,
|
||||
current_timeline_size: timeline_logical_size,
|
||||
ps_writelsn: write_lsn,
|
||||
ps_flushlsn: flush_lsn,
|
||||
ps_applylsn: apply_lsn,
|
||||
@@ -408,3 +398,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't want to report connectivity problems as real errors towards connection manager because
|
||||
/// 1. they happen frequently enough to make server logs hard to read and
|
||||
/// 2. the connection manager can retry other safekeeper.
|
||||
///
|
||||
/// If this function returns `Ok(pg_error)`, it's such an error.
|
||||
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
|
||||
/// Connection manager will then handle reconnections.
|
||||
///
|
||||
/// If this function returns an `Err()`, the caller can bubble it up using `?`.
|
||||
/// The connection manager will log the error at ERROR level.
|
||||
fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
|
||||
if pg_error.is_closed()
|
||||
|| pg_error
|
||||
.source()
|
||||
.and_then(|source| source.downcast_ref::<std::io::Error>())
|
||||
.map(is_expected_io_error)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
return Ok(pg_error);
|
||||
} else if let Some(db_error) = pg_error.as_db_error() {
|
||||
if db_error.code() == &SqlState::CONNECTION_FAILURE
|
||||
&& db_error.message().contains("end streaming")
|
||||
{
|
||||
return Ok(pg_error);
|
||||
}
|
||||
}
|
||||
Err(pg_error).context("connection error")
|
||||
}
|
||||
|
||||
@@ -111,6 +111,7 @@ pageserver_connect()
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
|
||||
neon_log(ERROR, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
@@ -179,7 +180,10 @@ pageserver_disconnect(void)
|
||||
prefetch_on_ps_disconnect();
|
||||
}
|
||||
if (pageserver_conn_wes != NULL)
|
||||
{
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request)
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = PQerrorMessage(pageserver_conn);
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "failed to send page request: %s", msg);
|
||||
@@ -239,29 +243,33 @@ pageserver_receive(void)
|
||||
PG_TRY();
|
||||
{
|
||||
/* read response */
|
||||
resp_buff.len = call_PQgetCopyData(&resp_buff.data);
|
||||
resp_buff.cursor = 0;
|
||||
int rc;
|
||||
|
||||
if (resp_buff.len < 0)
|
||||
rc = call_PQgetCopyData(&resp_buff.data);
|
||||
if (rc >= 0)
|
||||
{
|
||||
if (resp_buff.len == -1)
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
pageserver_disconnect();
|
||||
return NULL;
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_log(PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
else if (resp_buff.len == -2)
|
||||
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
|
||||
}
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
else if (rc == -1)
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_log(PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
pageserver_disconnect();
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
|
||||
else
|
||||
neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
@@ -420,7 +428,7 @@ pg_init_libpagestore(void)
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.safekeeper_token_env",
|
||||
"the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $ZENITH_AUTH_TOKEN",
|
||||
"the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
|
||||
NULL,
|
||||
&safekeeper_token_env,
|
||||
NULL,
|
||||
|
||||
@@ -52,6 +52,7 @@
|
||||
#include "access/xlogdefs.h"
|
||||
#include "catalog/pg_class.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "executor/instrument.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "postmaster/autovacuum.h"
|
||||
@@ -250,11 +251,6 @@ PrefetchState *MyPState;
|
||||
) \
|
||||
)
|
||||
|
||||
int n_prefetch_hits = 0;
|
||||
int n_prefetch_misses = 0;
|
||||
int n_prefetch_missed_caches = 0;
|
||||
int n_prefetch_dupes = 0;
|
||||
|
||||
XLogRecPtr prefetch_lsn = 0;
|
||||
|
||||
static bool compact_prefetch_buffers(void);
|
||||
@@ -770,7 +766,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
|
||||
else
|
||||
{
|
||||
/* The buffered request is good enough, return that index */
|
||||
n_prefetch_dupes++;
|
||||
pgBufferUsage.prefetch.duplicates++;
|
||||
return ring_index;
|
||||
}
|
||||
}
|
||||
@@ -1845,7 +1841,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (slot->effective_request_lsn >= request_lsn)
|
||||
{
|
||||
ring_index = slot->my_ring_index;
|
||||
n_prefetch_hits += 1;
|
||||
pgBufferUsage.prefetch.hits += 1;
|
||||
}
|
||||
else /* the current prefetch LSN is not large enough, so drop the prefetch */
|
||||
{
|
||||
@@ -1860,7 +1856,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
/* drop caches */
|
||||
prefetch_set_unused(slot->my_ring_index);
|
||||
n_prefetch_missed_caches += 1;
|
||||
pgBufferUsage.prefetch.expired += 1;
|
||||
/* make it look like a prefetch cache miss */
|
||||
entry = NULL;
|
||||
}
|
||||
@@ -1870,7 +1866,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
if (entry == NULL)
|
||||
{
|
||||
n_prefetch_misses += 1;
|
||||
pgBufferUsage.prefetch.misses += 1;
|
||||
|
||||
ring_index = prefetch_register_buffer(buftag, &request_latest,
|
||||
&request_lsn);
|
||||
|
||||
8
poetry.lock
generated
8
poetry.lock
generated
@@ -1418,7 +1418,7 @@ pbr = "*"
|
||||
|
||||
[[package]]
|
||||
name = "setuptools"
|
||||
version = "65.5.0"
|
||||
version = "65.5.1"
|
||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
category = "main"
|
||||
optional = false
|
||||
@@ -1426,7 +1426,7 @@ python-versions = ">=3.7"
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
|
||||
|
||||
[[package]]
|
||||
@@ -2283,8 +2283,8 @@ sarif-om = [
|
||||
{file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
|
||||
]
|
||||
setuptools = [
|
||||
{file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
|
||||
{file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
|
||||
{file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
|
||||
{file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
|
||||
]
|
||||
six = [
|
||||
{file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
@@ -16,12 +17,14 @@ hashbrown = "0.12"
|
||||
hex = "0.4.3"
|
||||
hmac = "0.12.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.8.1"
|
||||
itertools = "0.10.3"
|
||||
md5 = "0.7.0"
|
||||
once_cell = "1.13.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2.7"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] }
|
||||
routerify = "3"
|
||||
rustls = "0.20.0"
|
||||
@@ -35,10 +38,12 @@ thiserror = "1.0.30"
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
tokio-rustls = "0.23.0"
|
||||
tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] }
|
||||
tracing = "0.1.36"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
url = "2.2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
webpki-roots = "0.22.5"
|
||||
x509-parser = "0.14"
|
||||
|
||||
metrics = { path = "../libs/metrics" }
|
||||
|
||||
@@ -8,7 +8,9 @@ pub use console::{GetAuthInfoError, WakeComputeError};
|
||||
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
compute, http, mgmt, stream, url,
|
||||
compute,
|
||||
console::messages::MetricsAuxInfo,
|
||||
http, mgmt, stream, url,
|
||||
waiters::{self, Waiter, Waiters},
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -126,25 +128,13 @@ pub struct AuthSuccess<T> {
|
||||
pub value: T,
|
||||
}
|
||||
|
||||
impl<T> AuthSuccess<T> {
|
||||
/// Very similar to [`std::option::Option::map`].
|
||||
/// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
|
||||
/// a function to a contained value.
|
||||
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
|
||||
AuthSuccess {
|
||||
reported_auth_ok: self.reported_auth_ok,
|
||||
value: f(self.value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Info for establishing a connection to a compute node.
|
||||
/// This is what we get after auth succeeded, but not before!
|
||||
pub struct NodeInfo {
|
||||
/// Project from [`auth::ClientCredentials`].
|
||||
pub project: String,
|
||||
/// Compute node connection params.
|
||||
pub config: compute::ConnCfg,
|
||||
/// Labels for proxy's metrics.
|
||||
pub aux: MetricsAuxInfo,
|
||||
}
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
@@ -159,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
// If there's no project so far, that entails that client doesn't
|
||||
// support SNI or other means of passing the project name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
let fetch_magic_payload = async {
|
||||
let fetch_magic_payload = |client| async {
|
||||
warn!("project name not specified, resorting to the password hack auth flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
@@ -171,38 +161,61 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
auth::Result::Ok(payload)
|
||||
};
|
||||
|
||||
// If we want to use cleartext password flow, we can read the password
|
||||
// from the client and pretend that it's a magic payload (PasswordHack hack).
|
||||
let fetch_plaintext_password = |client| async {
|
||||
info!("using cleartext password flow");
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
|
||||
auth::Result::Ok(auth::password_hack::PasswordHackPayload {
|
||||
project: String::new(),
|
||||
password: payload,
|
||||
})
|
||||
};
|
||||
|
||||
// TODO: find a proper way to merge those very similar blocks.
|
||||
let (mut config, payload) = match self {
|
||||
let (mut node, payload) = match self {
|
||||
Console(endpoint, creds) if creds.project.is_none() => {
|
||||
let payload = fetch_magic_payload.await?;
|
||||
let payload = fetch_magic_payload(client).await?;
|
||||
|
||||
let mut creds = creds.as_ref();
|
||||
creds.project = Some(payload.project.as_str().into());
|
||||
let config = console::Api::new(endpoint, extra, &creds)
|
||||
let node = console::Api::new(endpoint, extra, &creds)
|
||||
.wake_compute()
|
||||
.await?;
|
||||
|
||||
(config, payload)
|
||||
(node, payload)
|
||||
}
|
||||
Console(endpoint, creds) if creds.use_cleartext_password_flow => {
|
||||
// This is a hack to allow cleartext password in secure connections (wss).
|
||||
let payload = fetch_plaintext_password(client).await?;
|
||||
let creds = creds.as_ref();
|
||||
let node = console::Api::new(endpoint, extra, &creds)
|
||||
.wake_compute()
|
||||
.await?;
|
||||
|
||||
(node, payload)
|
||||
}
|
||||
Postgres(endpoint, creds) if creds.project.is_none() => {
|
||||
let payload = fetch_magic_payload.await?;
|
||||
let payload = fetch_magic_payload(client).await?;
|
||||
|
||||
let mut creds = creds.as_ref();
|
||||
creds.project = Some(payload.project.as_str().into());
|
||||
let config = postgres::Api::new(endpoint, &creds).wake_compute().await?;
|
||||
let node = postgres::Api::new(endpoint, &creds).wake_compute().await?;
|
||||
|
||||
(config, payload)
|
||||
(node, payload)
|
||||
}
|
||||
_ => return Ok(None),
|
||||
};
|
||||
|
||||
config.password(payload.password);
|
||||
node.config.password(payload.password);
|
||||
Ok(Some(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: NodeInfo {
|
||||
project: payload.project,
|
||||
config,
|
||||
},
|
||||
value: node,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -233,10 +246,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
console::Api::new(&endpoint, extra, &creds)
|
||||
.handle_user(client)
|
||||
.await?
|
||||
.map(|config| NodeInfo {
|
||||
project: creds.project.unwrap().into_owned(),
|
||||
config,
|
||||
})
|
||||
}
|
||||
Postgres(endpoint, creds) => {
|
||||
info!("performing mock authentication using a local postgres instance");
|
||||
@@ -245,10 +254,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
postgres::Api::new(&endpoint, &creds)
|
||||
.handle_user(client)
|
||||
.await?
|
||||
.map(|config| NodeInfo {
|
||||
project: creds.project.unwrap().into_owned(),
|
||||
config,
|
||||
})
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
//! Cloud API V2.
|
||||
|
||||
use super::{AuthSuccess, ConsoleReqExtra};
|
||||
use super::{AuthSuccess, ConsoleReqExtra, NodeInfo};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
compute,
|
||||
console::messages::{ConsoleError, GetRoleSecret, WakeCompute},
|
||||
error::{io_error, UserFacingError},
|
||||
http, sasl, scram,
|
||||
stream::PqStream,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use reqwest::StatusCode as HttpStatusCode;
|
||||
use serde::Deserialize;
|
||||
use std::future::Future;
|
||||
use thiserror::Error;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -136,24 +136,6 @@ impl UserFacingError for WakeComputeError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Console's response which holds client's auth secret.
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct GetRoleSecret {
|
||||
role_secret: Box<str>,
|
||||
}
|
||||
|
||||
/// Console's response which holds compute node's `host:port` pair.
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct WakeCompute {
|
||||
address: Box<str>,
|
||||
}
|
||||
|
||||
/// Console's error response with human-readable description.
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct ConsoleError {
|
||||
error: Box<str>,
|
||||
}
|
||||
|
||||
/// Auth secret which is managed by the cloud.
|
||||
pub enum AuthInfo {
|
||||
/// Md5 hash of user's password.
|
||||
@@ -194,7 +176,7 @@ impl<'a> Api<'a> {
|
||||
pub(super) async fn handle_user(
|
||||
&'a self,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
|
||||
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
|
||||
) -> auth::Result<AuthSuccess<NodeInfo>> {
|
||||
handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
|
||||
}
|
||||
}
|
||||
@@ -238,7 +220,7 @@ impl Api<'_> {
|
||||
}
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
|
||||
pub async fn wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
async {
|
||||
let request = self
|
||||
@@ -269,7 +251,10 @@ impl Api<'_> {
|
||||
.dbname(self.creds.dbname)
|
||||
.user(self.creds.user);
|
||||
|
||||
Ok(config)
|
||||
Ok(NodeInfo {
|
||||
config,
|
||||
aux: body.aux,
|
||||
})
|
||||
}
|
||||
.map_err(crate::error::log_error)
|
||||
.instrument(info_span!("wake_compute", id = request_id))
|
||||
@@ -284,11 +269,11 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
|
||||
endpoint: &'a Endpoint,
|
||||
get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo,
|
||||
wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
|
||||
) -> auth::Result<AuthSuccess<compute::ConnCfg>>
|
||||
) -> auth::Result<AuthSuccess<NodeInfo>>
|
||||
where
|
||||
Endpoint: AsRef<ClientCredentials<'a>>,
|
||||
GetAuthInfo: Future<Output = Result<Option<AuthInfo>, GetAuthInfoError>>,
|
||||
WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
|
||||
WakeCompute: Future<Output = Result<NodeInfo, WakeComputeError>>,
|
||||
{
|
||||
let creds = endpoint.as_ref();
|
||||
|
||||
@@ -325,19 +310,20 @@ where
|
||||
}
|
||||
};
|
||||
|
||||
let mut config = wake_compute(endpoint).await?;
|
||||
let mut node = wake_compute(endpoint).await?;
|
||||
if let Some(keys) = scram_keys {
|
||||
config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
node.config.auth_keys(AuthKeys::ScramSha256(keys));
|
||||
}
|
||||
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: false,
|
||||
value: config,
|
||||
value: node,
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse http response body, taking status code into account.
|
||||
async fn parse_body<T: for<'a> Deserialize<'a>>(
|
||||
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
response: reqwest::Response,
|
||||
) -> Result<T, ApiError> {
|
||||
let status = response.status();
|
||||
|
||||
@@ -86,8 +86,8 @@ pub async fn handle_user(
|
||||
Ok(AuthSuccess {
|
||||
reported_auth_ok: true,
|
||||
value: NodeInfo {
|
||||
project: db_info.project,
|
||||
config,
|
||||
aux: db_info.aux,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use super::{
|
||||
console::{self, AuthInfo, GetAuthInfoError, WakeComputeError},
|
||||
AuthSuccess,
|
||||
AuthSuccess, NodeInfo,
|
||||
};
|
||||
use crate::{
|
||||
auth::{self, ClientCredentials},
|
||||
@@ -57,7 +57,7 @@ impl<'a> Api<'a> {
|
||||
pub(super) async fn handle_user(
|
||||
&'a self,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
|
||||
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
|
||||
) -> auth::Result<AuthSuccess<NodeInfo>> {
|
||||
// We reuse user handling logic from a production module.
|
||||
console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
|
||||
}
|
||||
@@ -103,7 +103,7 @@ impl Api<'_> {
|
||||
}
|
||||
|
||||
/// We don't need to wake anything locally, so we just return the connection info.
|
||||
pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
|
||||
pub async fn wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
.host(self.endpoint.host_str().unwrap_or("localhost"))
|
||||
@@ -111,7 +111,10 @@ impl Api<'_> {
|
||||
.dbname(self.creds.dbname)
|
||||
.user(self.creds.user);
|
||||
|
||||
Ok(config)
|
||||
Ok(NodeInfo {
|
||||
config,
|
||||
aux: Default::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
pub dbname: &'a str,
|
||||
pub project: Option<Cow<'a, str>>,
|
||||
/// If `True`, we'll use the old cleartext password flow. This is used for
|
||||
/// websocket connections, which want to minimize the number of round trips.
|
||||
pub use_cleartext_password_flow: bool,
|
||||
}
|
||||
|
||||
impl ClientCredentials<'_> {
|
||||
@@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
user: self.user,
|
||||
dbname: self.dbname,
|
||||
project: self.project().map(Cow::Borrowed),
|
||||
use_cleartext_password_flow: self.use_cleartext_password_flow,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
params: &'a StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_name: Option<&str>,
|
||||
use_cleartext_password_flow: bool,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
@@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
user = user,
|
||||
dbname = dbname,
|
||||
project = project.as_deref(),
|
||||
use_cleartext_password_flow = use_cleartext_password_flow,
|
||||
"credentials"
|
||||
);
|
||||
|
||||
@@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
user,
|
||||
dbname,
|
||||
project,
|
||||
use_cleartext_password_flow,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -141,7 +148,7 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
// TODO: check that `creds.dbname` is None.
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
|
||||
Ok(())
|
||||
@@ -151,7 +158,7 @@ mod tests {
|
||||
fn parse_missing_project() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project, None);
|
||||
@@ -166,7 +173,7 @@ mod tests {
|
||||
let sni = Some("foo.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
@@ -182,7 +189,7 @@ mod tests {
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None, false)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
@@ -201,7 +208,7 @@ mod tests {
|
||||
let sni = Some("baz.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
@@ -220,7 +227,8 @@ mod tests {
|
||||
let sni = Some("second.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
let err =
|
||||
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -237,7 +245,8 @@ mod tests {
|
||||
let sni = Some("project.localhost");
|
||||
let common_name = Some("example.com");
|
||||
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
let err =
|
||||
ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentSni { sni, cn } => {
|
||||
assert_eq!(sni, "project.localhost");
|
||||
|
||||
@@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack {
|
||||
}
|
||||
}
|
||||
|
||||
/// Marker type selecting clear-text password auth (called `password` in the PostgreSQL docs):
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword;
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
/// The first message of this flow is the `AuthenticationCleartextPassword` backend message.
#[inline(always)]
|
||||
fn first_message(&self) -> BeMessage<'_> {
|
||||
Be::AuthenticationCleartextPassword
|
||||
}
|
||||
}
|
||||
|
||||
/// This wrapper for [`PqStream`] performs client authentication.
|
||||
#[must_use]
|
||||
pub struct AuthFlow<'a, Stream, State> {
|
||||
@@ -86,6 +97,18 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<Vec<u8>> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
|
||||
|
||||
Ok(password.to_vec())
|
||||
}
|
||||
}
|
||||
|
||||
/// Stream wrapper for handling [SCRAM](crate::scram) auth.
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
|
||||
@@ -43,7 +43,7 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
|
||||
/// Eventually, `tokio_postgres` will be replaced with something better.
|
||||
/// Newtype allows us to implement methods on top of it.
|
||||
#[repr(transparent)]
|
||||
pub struct ConnCfg(pub tokio_postgres::Config);
|
||||
pub struct ConnCfg(Box<tokio_postgres::Config>);
|
||||
|
||||
impl ConnCfg {
|
||||
/// Construct a new connection config.
|
||||
|
||||
5
proxy/src/console.rs
Normal file
5
proxy/src/console.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
//! Various stuff for dealing with the Neon Console.
|
||||
//! Later we might move some API wrappers here.
|
||||
|
||||
/// Payloads used in the console's APIs.
|
||||
pub mod messages;
|
||||
190
proxy/src/console/messages.rs
Normal file
190
proxy/src/console/messages.rs
Normal file
@@ -0,0 +1,190 @@
|
||||
use serde::Deserialize;
|
||||
use std::fmt;
|
||||
|
||||
/// Generic error response with human-readable description.
|
||||
/// Note that we can't always present it to user as is.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ConsoleError {
|
||||
/// Human-readable description of the error.
pub error: Box<str>,
|
||||
}
|
||||
|
||||
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
|
||||
/// Returned by the `/proxy_get_role_secret` API method.
// NOTE: `Debug` is intentionally not derived here; it is implemented by hand
// so that the secret is omitted from the output.
|
||||
#[derive(Deserialize)]
|
||||
pub struct GetRoleSecret {
|
||||
/// The secret, kept as an unparsed string at this level.
pub role_secret: Box<str>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
impl fmt::Debug for GetRoleSecret {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("GetRoleSecret").finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
/// Response which holds compute node's `host:port` pair.
|
||||
/// Returned by the `/proxy_wake_compute` API method.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct WakeCompute {
|
||||
/// Address of the compute node (the `host:port` pair mentioned above).
pub address: Box<str>,
|
||||
/// Auxiliary identifiers used as metric labels, see [`MetricsAuxInfo`].
pub aux: MetricsAuxInfo,
|
||||
}
|
||||
|
||||
/// Async response which concludes the link auth flow.
|
||||
/// Also known as `kickResponse` in the console.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct KickSession<'a> {
|
||||
/// Session ID is assigned by the proxy.
/// It is a `&'a str`, i.e. borrowed from the deserializer's input rather than owned.
|
||||
pub session_id: &'a str,
|
||||
|
||||
/// Compute node connection params.
/// Deserialized via `KickSession::parse_db_info`, which unwraps the `Success` wrapper.
|
||||
#[serde(deserialize_with = "KickSession::parse_db_info")]
|
||||
pub result: DatabaseInfo,
|
||||
}
|
||||
|
||||
impl KickSession<'_> {
|
||||
fn parse_db_info<'de, D>(des: D) -> Result<DatabaseInfo, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[derive(Deserialize)]
|
||||
enum Wrapper {
|
||||
// Currently, console only reports `Success`.
|
||||
// `Failure(String)` used to be here... RIP.
|
||||
Success(DatabaseInfo),
|
||||
}
|
||||
|
||||
Wrapper::deserialize(des).map(|x| match x {
|
||||
Wrapper::Success(info) => info,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute node connection params.
// NOTE: `Debug` is implemented by hand for this type so that
// sensitive info is omitted from the output.
|
||||
#[derive(Deserialize)]
|
||||
pub struct DatabaseInfo {
|
||||
/// Host of the compute node.
pub host: String,
|
||||
/// Port of the compute node.
pub port: u16,
|
||||
/// Name of the database to connect to.
pub dbname: String,
|
||||
/// Name of the user (role) to connect as.
pub user: String,
|
||||
/// Console always provides a password, but it might
|
||||
/// be inconvenient for debug with local PG instance.
|
||||
pub password: Option<String>,
|
||||
/// Auxiliary identifiers used as metric labels, see [`MetricsAuxInfo`].
pub aux: MetricsAuxInfo,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
impl fmt::Debug for DatabaseInfo {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("DatabaseInfo")
|
||||
.field("host", &self.host)
|
||||
.field("port", &self.port)
|
||||
.field("dbname", &self.dbname)
|
||||
.field("user", &self.user)
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
/// Various labels for prometheus metrics.
|
||||
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
pub struct MetricsAuxInfo {
|
||||
/// ID of an endpoint within a project.
pub endpoint_id: Box<str>,
|
||||
/// ID of a project.
pub project_id: Box<str>,
|
||||
/// ID of a branch within a project (snapshot).
pub branch_id: Box<str>,
|
||||
}
|
||||
|
||||
impl MetricsAuxInfo {
|
||||
/// Definitions of labels for traffic metric.
|
||||
pub const TRAFFIC_LABELS: &'static [&'static str] = &[
|
||||
// Received (rx) / sent (tx).
|
||||
"direction",
|
||||
// ID of a project.
|
||||
"project_id",
|
||||
// ID of an endpoint within a project.
|
||||
"endpoint_id",
|
||||
// ID of a branch within a project (snapshot).
|
||||
"branch_id",
|
||||
];
|
||||
|
||||
/// Values of labels for traffic metric.
|
||||
// TODO: add more type safety (validate arity & positions).
|
||||
pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] {
|
||||
[
|
||||
direction,
|
||||
&self.project_id,
|
||||
&self.endpoint_id,
|
||||
&self.branch_id,
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A minimal, valid `aux` object shared by all payloads below.
    fn dummy_aux() -> serde_json::Value {
        json!({
            "endpoint_id": "endpoint",
            "project_id": "project",
            "branch_id": "branch",
        })
    }

    /// Check that `aux` carries exactly the values from [`dummy_aux`].
    fn assert_dummy_aux(aux: &MetricsAuxInfo) {
        assert_eq!(&*aux.endpoint_id, "endpoint");
        assert_eq!(&*aux.project_id, "project");
        assert_eq!(&*aux.branch_id, "branch");
    }

    #[test]
    fn parse_kick_session() -> anyhow::Result<()> {
        // This is what the console's kickResponse looks like.
        let json = json!({
            "session_id": "deadbeef",
            "result": {
                "Success": {
                    "host": "localhost",
                    "port": 5432,
                    "dbname": "postgres",
                    "user": "john_doe",
                    "password": "password",
                    "aux": dummy_aux(),
                }
            }
        });

        // `KickSession` borrows `session_id` from its input,
        // so the serialized text has to outlive the parsed value.
        let text = json.to_string();
        let kick: KickSession = serde_json::from_str(&text)?;

        // Don't just check that parsing succeeds: verify what was parsed.
        assert_eq!(kick.session_id, "deadbeef");
        assert_eq!(kick.result.host, "localhost");
        assert_eq!(kick.result.port, 5432);
        assert_eq!(kick.result.dbname, "postgres");
        assert_eq!(kick.result.user, "john_doe");
        assert_eq!(kick.result.password.as_deref(), Some("password"));
        assert_dummy_aux(&kick.result.aux);

        Ok(())
    }

    #[test]
    fn parse_db_info() -> anyhow::Result<()> {
        // with password
        let info: DatabaseInfo = serde_json::from_value(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "password": "password",
            "aux": dummy_aux(),
        }))?;
        assert_eq!(info.host, "localhost");
        assert_eq!(info.port, 5432);
        assert_eq!(info.dbname, "postgres");
        assert_eq!(info.user, "john_doe");
        assert_eq!(info.password.as_deref(), Some("password"));
        assert_dummy_aux(&info.aux);

        // without password
        let info: DatabaseInfo = serde_json::from_value(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "aux": dummy_aux(),
        }))?;
        assert_eq!(info.password, None);
        assert_dummy_aux(&info.aux);

        // new field (forward compatibility)
        let info: DatabaseInfo = serde_json::from_value(json!({
            "host": "localhost",
            "port": 5432,
            "dbname": "postgres",
            "user": "john_doe",
            "project": "hello_world",
            "N.E.W": "forward compatibility check",
            "aux": dummy_aux(),
        }))?;
        // Unknown fields must be ignored without affecting the known ones.
        assert_eq!(info.host, "localhost");
        assert_eq!(info.user, "john_doe");
        assert_eq!(info.password, None);
        assert_dummy_aux(&info.aux);

        Ok(())
    }
}
|
||||
@@ -1,4 +1,5 @@
|
||||
pub mod server;
|
||||
pub mod websocket;
|
||||
|
||||
use crate::url::ApiUrl;
|
||||
|
||||
|
||||
263
proxy/src/http/websocket.rs
Normal file
263
proxy/src/http/websocket.rs
Normal file
@@ -0,0 +1,263 @@
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream, StreamExt};
|
||||
use hyper::server::accept::{self};
|
||||
use hyper::server::conn::AddrIncoming;
|
||||
use hyper::upgrade::Upgraded;
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use hyper_tungstenite::{tungstenite, WebSocketStream};
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
|
||||
use pin_project_lite::pin_project;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use std::convert::Infallible;
|
||||
use std::future::ready;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
use tls_listener::TlsListener;
|
||||
|
||||
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
|
||||
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::http::{error::ApiError, json::json_response};
|
||||
|
||||
use crate::cancellation::CancelMap;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::proxy::handle_ws_client;
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
|
||||
pub struct WebSocketRW {
|
||||
// The underlying websocket; pinned because it is polled as a `Stream`/`Sink`.
#[pin]
|
||||
stream: WebSocketStream<Upgraded>,
|
||||
// Payload of the most recently received binary message that has not been
// fully consumed by the reader yet (`None` until the first one arrives).
chunk: Option<bytes::Bytes>,
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: explain why this is safe or try to remove `unsafe impl`.
// NOTE(review): no `SAFETY` justification is given for this impl, so its
// soundness is unverified; a manual `Sync` impl asserts that `&WebSocketRW`
// may be shared between threads, which the compiler cannot check here.
|
||||
unsafe impl Sync for WebSocketRW {}
|
||||
|
||||
impl WebSocketRW {
|
||||
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
chunk: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn has_chunk(&self) -> bool {
|
||||
if let Some(ref chunk) = self.chunk {
|
||||
chunk.remaining() > 0
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn ws_err_into(e: tungstenite::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e.to_string())
|
||||
}
|
||||
|
||||
impl AsyncWrite for WebSocketRW {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, io::Error>> {
|
||||
let mut this = self.project();
|
||||
match this.stream.as_mut().poll_ready(cx) {
|
||||
Poll::Ready(Ok(())) => {
|
||||
if let Err(e) = this
|
||||
.stream
|
||||
.as_mut()
|
||||
.start_send(Message::Binary(buf.to_vec()))
|
||||
{
|
||||
Poll::Ready(Err(ws_err_into(e)))
|
||||
} else {
|
||||
Poll::Ready(Ok(buf.len()))
|
||||
}
|
||||
}
|
||||
Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
|
||||
Poll::Pending => {
|
||||
cx.waker().wake_by_ref();
|
||||
Poll::Pending
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
|
||||
self.project().stream.poll_flush(cx).map_err(ws_err_into)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
|
||||
self.project().stream.poll_close(cx).map_err(ws_err_into)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for WebSocketRW {
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() == 0 {
|
||||
return Poll::Ready(Ok(()));
|
||||
}
|
||||
|
||||
let inner_buf = match self.as_mut().poll_fill_buf(cx) {
|
||||
Poll::Ready(Ok(buf)) => buf,
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
};
|
||||
let len = std::cmp::min(inner_buf.len(), buf.remaining());
|
||||
buf.put_slice(&inner_buf[..len]);
|
||||
|
||||
self.consume(len);
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncBufRead for WebSocketRW {
|
||||
fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
loop {
|
||||
if self.as_mut().has_chunk() {
|
||||
let buf = self.project().chunk.as_ref().unwrap().chunk();
|
||||
return Poll::Ready(Ok(buf));
|
||||
} else {
|
||||
match self.as_mut().project().stream.poll_next(cx) {
|
||||
Poll::Ready(Some(Ok(message))) => match message {
|
||||
Message::Text(_) => {}
|
||||
Message::Binary(chunk) => {
|
||||
*self.as_mut().project().chunk = Some(Bytes::from(chunk));
|
||||
}
|
||||
Message::Ping(_) => {
|
||||
// No need to send a reply: tungstenite takes care of this for you.
|
||||
}
|
||||
Message::Pong(_) => {}
|
||||
Message::Close(_) => {
|
||||
// No need to send a reply: tungstenite takes care of this for you.
|
||||
return Poll::Ready(Ok(&[]));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
unreachable!();
|
||||
}
|
||||
},
|
||||
Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))),
|
||||
Poll::Ready(None) => return Poll::Ready(Ok(&[])),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amt: usize) {
|
||||
if amt > 0 {
|
||||
self.project()
|
||||
.chunk
|
||||
.as_mut()
|
||||
.expect("No chunk present")
|
||||
.advance(amt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve_websocket(
|
||||
websocket: HyperWebsocket,
|
||||
config: &ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_ws_client(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRW::new(websocket),
|
||||
hostname,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn ws_handler(
|
||||
mut request: Request<Body>,
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
session_id: uuid::Uuid,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
.get("host")
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.and_then(|h| h.split(':').next())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!("error in websocket connection: {:?}", e);
|
||||
}
|
||||
});
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
} else {
|
||||
json_response(StatusCode::OK, "Connect with a websocket client")
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
ws_listener: TcpListener,
|
||||
config: &'static ProxyConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("websocket server has shut down");
|
||||
}
|
||||
|
||||
let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
|
||||
let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
|
||||
Some(config) => config.into(),
|
||||
None => {
|
||||
warn!("TLS config is missing, WebSocket Secure server will not be started");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
|
||||
|
||||
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
|
||||
if let Err(err) = conn {
|
||||
error!("failed to accept TLS connection for websockets: {:?}", err);
|
||||
ready(false)
|
||||
} else {
|
||||
ready(true)
|
||||
}
|
||||
});
|
||||
|
||||
let make_svc = hyper::service::make_service_fn(|_stream| async move {
|
||||
Ok::<_, Infallible>(hyper::service::service_fn(
|
||||
move |req: Request<Body>| async move {
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
ws_handler(req, config, cancel_map, session_id)
|
||||
.instrument(info_span!(
|
||||
"ws-client",
|
||||
session = format_args!("{session_id}")
|
||||
))
|
||||
.await
|
||||
},
|
||||
))
|
||||
});
|
||||
|
||||
hyper::Server::builder(accept::from_stream(tls_listener))
|
||||
.serve(make_svc)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -8,6 +8,7 @@ mod auth;
|
||||
mod cancellation;
|
||||
mod compute;
|
||||
mod config;
|
||||
mod console;
|
||||
mod error;
|
||||
mod http;
|
||||
mod mgmt;
|
||||
@@ -109,12 +110,23 @@ async fn main() -> anyhow::Result<()> {
|
||||
info!("Starting proxy on {proxy_address}");
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
|
||||
let tasks = [
|
||||
let mut tasks = vec![
|
||||
tokio::spawn(http::server::task_main(http_listener)),
|
||||
tokio::spawn(proxy::task_main(config, proxy_listener)),
|
||||
tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
|
||||
]
|
||||
.map(flatten_err);
|
||||
];
|
||||
|
||||
if let Some(wss_address) = arg_matches.get_one::<String>("wss") {
|
||||
let wss_address: SocketAddr = wss_address.parse()?;
|
||||
info!("Starting wss on {}", wss_address);
|
||||
let wss_listener = TcpListener::bind(wss_address).await?;
|
||||
tasks.push(tokio::spawn(http::websocket::task_main(
|
||||
wss_listener,
|
||||
config,
|
||||
)));
|
||||
}
|
||||
|
||||
let tasks = tasks.into_iter().map(flatten_err);
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
// This will block until all tasks have completed.
|
||||
@@ -154,6 +166,11 @@ fn cli() -> clap::Command {
|
||||
.help("listen for incoming http connections (metrics, etc) on ip:port")
|
||||
.default_value("127.0.0.1:7001"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("wss")
|
||||
.long("wss")
|
||||
.help("listen for incoming wss connections on ip:port"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("uri")
|
||||
.short('u')
|
||||
|
||||
@@ -1,13 +1,18 @@
|
||||
use crate::auth;
|
||||
use crate::{
|
||||
auth,
|
||||
console::messages::{DatabaseInfo, KickSession},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
|
||||
use serde::Deserialize;
|
||||
use std::{
|
||||
net::{TcpListener, TcpStream},
|
||||
thread,
|
||||
};
|
||||
use tracing::{error, info, info_span};
|
||||
use utils::postgres_backend::{self, AuthType, PostgresBackend};
|
||||
use utils::{
|
||||
postgres_backend::{self, AuthType, PostgresBackend},
|
||||
postgres_backend_async::QueryError,
|
||||
};
|
||||
|
||||
/// Console management API listener thread.
|
||||
/// It spawns console response handlers needed for the link auth.
|
||||
@@ -45,68 +50,18 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
|
||||
fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
|
||||
pgbackend.run(&mut MgmtHandler)
|
||||
}
|
||||
|
||||
/// Known as `kickResponse` in the console.
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
}
|
||||
|
||||
/// A message received by `mgmt` when a compute node is ready.
|
||||
pub type ComputeReady = Result<DatabaseInfo, String>;
|
||||
|
||||
impl PsqlSessionResult {
|
||||
fn into_compute_ready(self) -> ComputeReady {
|
||||
match self {
|
||||
Self::Success(db_info) => Ok(db_info),
|
||||
Self::Failure(message) => Err(message),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute node connection params provided by the console.
|
||||
/// This struct and its parents are mgmt API implementation
|
||||
/// detail and thus should remain in this module.
|
||||
// TODO: restore deserialization tests from git history.
|
||||
#[derive(Deserialize)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
/// Console always provides a password, but it might
|
||||
/// be inconvenient for debug with local PG instance.
|
||||
pub password: Option<String>,
|
||||
pub project: String,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
impl std::fmt::Debug for DatabaseInfo {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
fmt.debug_struct("DatabaseInfo")
|
||||
.field("host", &self.host)
|
||||
.field("port", &self.port)
|
||||
.field("dbname", &self.dbname)
|
||||
.field("user", &self.user)
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: replace with an http-based protocol.
|
||||
struct MgmtHandler;
|
||||
impl postgres_backend::Handler for MgmtHandler {
|
||||
fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
|
||||
fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
|
||||
try_process_query(pgb, query).map_err(|e| {
|
||||
error!("failed to process response: {e:?}");
|
||||
e
|
||||
@@ -114,14 +69,14 @@ impl postgres_backend::Handler for MgmtHandler {
|
||||
}
|
||||
}
|
||||
|
||||
fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
|
||||
let resp: PsqlSessionResponse = serde_json::from_str(query)?;
|
||||
fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
|
||||
let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
|
||||
|
||||
let span = info_span!("event", session_id = resp.session_id);
|
||||
let _enter = span.enter();
|
||||
info!("got response: {:?}", resp.result);
|
||||
|
||||
match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
|
||||
match auth::backend::notify(resp.session_id, Ok(resp.result)) {
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
@@ -129,49 +84,9 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
|
||||
}
|
||||
Err(e) => {
|
||||
error!("failed to deliver response to per-client task");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn parse_db_info() -> anyhow::Result<()> {
|
||||
// with password
|
||||
let _: DatabaseInfo = serde_json::from_value(json!({
|
||||
"host": "localhost",
|
||||
"port": 5432,
|
||||
"dbname": "postgres",
|
||||
"user": "john_doe",
|
||||
"password": "password",
|
||||
"project": "hello_world",
|
||||
}))?;
|
||||
|
||||
// without password
|
||||
let _: DatabaseInfo = serde_json::from_value(json!({
|
||||
"host": "localhost",
|
||||
"port": 5432,
|
||||
"dbname": "postgres",
|
||||
"user": "john_doe",
|
||||
"project": "hello_world",
|
||||
}))?;
|
||||
|
||||
// new field (forward compatibility)
|
||||
let _: DatabaseInfo = serde_json::from_value(json!({
|
||||
"host": "localhost",
|
||||
"port": 5432,
|
||||
"dbname": "postgres",
|
||||
"user": "john_doe",
|
||||
"project": "hello_world",
|
||||
"N.E.W": "forward compatibility check",
|
||||
}))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use anyhow::{bail, Context};
|
||||
use futures::TryFutureExt;
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage as Be, *};
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
@@ -39,12 +39,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_io_bytes_per_client",
|
||||
"Number of bytes sent/received between client and backend.",
|
||||
&[
|
||||
// Received (rx) / sent (tx).
|
||||
"direction",
|
||||
// Proxy can keep calling it `project` internally.
|
||||
"endpoint_id"
|
||||
]
|
||||
crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
@@ -87,6 +82,47 @@ pub async fn task_main(
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn handle_ws_client(
|
||||
config: &ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin + Send,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
// The `closed` counter will increase when this future is destroyed.
|
||||
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
|
||||
scopeguard::defer! {
|
||||
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
|
||||
}
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
let hostname = hostname.as_deref();
|
||||
|
||||
// TLS is None here, because the connection is already encrypted.
|
||||
let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake"));
|
||||
let (mut stream, params) = match do_handshake.await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
};
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_client(
|
||||
config: &ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -113,7 +149,7 @@ async fn handle_client(
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -271,19 +307,16 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
|
||||
|
||||
stream
|
||||
.write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
|
||||
.write_message(&BeMessage::ReadyForQuery)
|
||||
.write_message(&Be::ReadyForQuery)
|
||||
.await?;
|
||||
|
||||
// TODO: add more identifiers.
|
||||
let metric_id = node.project;
|
||||
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]);
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("tx"));
|
||||
let mut client = MeasuredStream::new(stream.into_inner(), |cnt| {
|
||||
// Number of bytes we sent to the client (outbound).
|
||||
m_sent.inc_by(cnt as u64);
|
||||
});
|
||||
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]);
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("rx"));
|
||||
let mut db = MeasuredStream::new(db.stream, |cnt| {
|
||||
// Number of bytes the client sent to the compute node (inbound).
|
||||
m_recv.inc_by(cnt as u64);
|
||||
|
||||
@@ -140,7 +140,7 @@ async fn dummy_proxy(
|
||||
stream
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&Be::CLIENT_ENCODING)?
|
||||
.write_message(&BeMessage::ReadyForQuery)
|
||||
.write_message(&Be::ReadyForQuery)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::error::UserFacingError;
|
||||
use anyhow::bail;
|
||||
use bytes::BytesMut;
|
||||
use pin_project_lite::pin_project;
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
|
||||
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket};
|
||||
use rustls::ServerConfig;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
@@ -47,18 +47,13 @@ fn err_connection() -> io::Error {
|
||||
io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
|
||||
}
|
||||
|
||||
// TODO: change error type of `FeMessage::read_fut`
|
||||
fn from_anyhow(e: anyhow::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e.to_string())
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + Unpin> PqStream<S> {
|
||||
/// Receive [`FeStartupPacket`], which is a first packet sent by a client.
|
||||
pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
|
||||
// TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
|
||||
let msg = FeStartupPacket::read_fut(&mut self.stream)
|
||||
.await
|
||||
.map_err(from_anyhow)?
|
||||
.map_err(ConnectionError::into_io_error)?
|
||||
.ok_or_else(err_connection)?;
|
||||
|
||||
match msg {
|
||||
@@ -80,7 +75,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
|
||||
async fn read_message(&mut self) -> io::Result<FeMessage> {
|
||||
FeMessage::read_fut(&mut self.stream)
|
||||
.await
|
||||
.map_err(from_anyhow)?
|
||||
.map_err(ConnectionError::into_io_error)?
|
||||
.ok_or_else(err_connection)
|
||||
}
|
||||
}
|
||||
@@ -112,7 +107,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
|
||||
/// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
|
||||
pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
|
||||
tracing::info!("forwarding error to user: {error}");
|
||||
self.write_message(&BeMessage::ErrorResponse(error)).await?;
|
||||
self.write_message(&BeMessage::ErrorResponse(error, None))
|
||||
.await?;
|
||||
bail!(error)
|
||||
}
|
||||
|
||||
@@ -124,7 +120,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
|
||||
{
|
||||
let msg = error.to_string_client();
|
||||
tracing::info!("forwarding error to user: {msg}");
|
||||
self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
|
||||
self.write_message(&BeMessage::ErrorResponse(&msg, None))
|
||||
.await?;
|
||||
bail!(error)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,8 +9,8 @@
|
||||
# In vscode, this setting is Rust-analyzer>Check On Save:Command
|
||||
|
||||
|
||||
# Not every feature is supported in macOS builds, e.g. `profiling`,
|
||||
# avoid running regular linting script that checks every feature.
|
||||
# Not every feature is supported in macOS builds. Avoid running regular linting
|
||||
# script that checks every feature.
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
# no extra features to test currently, add more here when needed
|
||||
cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
|
||||
|
||||
@@ -1,10 +1,5 @@
|
||||
[toolchain]
|
||||
# We try to stick to a toolchain version that is widely available on popular distributions, so that most people
|
||||
# can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later
|
||||
# version, we can consider updating.
|
||||
# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package,
|
||||
# we use "unstable" version number as the highest version used in the project by default.
|
||||
channel = "1.62.1"
|
||||
channel = "1.66.1"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
name = "safekeeper"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
async-stream = "0.3"
|
||||
|
||||
@@ -143,6 +143,19 @@ fn main() -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let auth = match args.auth_validation_public_key_path.as_ref() {
|
||||
None => {
|
||||
info!("auth is disabled");
|
||||
None
|
||||
}
|
||||
Some(path) => {
|
||||
info!("loading JWT auth key from {}", path.display());
|
||||
Some(Arc::new(
|
||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
let conf = SafeKeeperConf {
|
||||
workdir,
|
||||
my_id: id,
|
||||
@@ -156,7 +169,7 @@ fn main() -> anyhow::Result<()> {
|
||||
max_offloader_lag_bytes: args.max_offloader_lag,
|
||||
backup_runtime_threads: args.wal_backup_threads,
|
||||
wal_backup_enabled: !args.disable_wal_backup,
|
||||
auth_validation_public_key_path: args.auth_validation_public_key_path,
|
||||
auth,
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
@@ -186,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
e
|
||||
})?;
|
||||
|
||||
let auth = match conf.auth_validation_public_key_path.as_ref() {
|
||||
None => {
|
||||
info!("auth is disabled");
|
||||
None
|
||||
}
|
||||
Some(path) => {
|
||||
info!("loading JWT auth key from {}", path.display());
|
||||
Some(Arc::new(
|
||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
// Register metrics collector for active timelines. It's important to do this
|
||||
// after daemonizing, otherwise process collector will be upset.
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
@@ -212,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let auth_ = auth.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(|| {
|
||||
let router = http::make_router(conf_, auth_);
|
||||
let router = http::make_router(conf_);
|
||||
endpoint::serve_thread_main(
|
||||
router,
|
||||
http_listener,
|
||||
@@ -230,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let conf_cloned = conf.clone();
|
||||
let safekeeper_thread = thread::Builder::new()
|
||||
.name("safekeeper thread".into())
|
||||
.spawn(|| {
|
||||
if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) {
|
||||
info!("safekeeper thread terminated: {e}");
|
||||
}
|
||||
})
|
||||
.spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
|
||||
.unwrap();
|
||||
|
||||
threads.push(safekeeper_thread);
|
||||
@@ -244,7 +239,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
thread::Builder::new()
|
||||
.name("broker thread".into())
|
||||
.spawn(|| {
|
||||
// TODO: add auth?
|
||||
broker::thread_main(conf_);
|
||||
})?,
|
||||
);
|
||||
|
||||
@@ -8,16 +8,16 @@ use crate::receive_wal::ReceiveWalConn;
|
||||
use crate::send_wal::ReplicationConn;
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::Context;
|
||||
|
||||
use postgres_ffi::PG_TLI;
|
||||
use regex::Regex;
|
||||
|
||||
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
use utils::auth::{Claims, JwtAuth, Scope};
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::postgres_backend_async::QueryError;
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -32,7 +32,6 @@ pub struct SafekeeperPostgresHandler {
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub ttid: TenantTimelineId,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
}
|
||||
|
||||
@@ -44,7 +43,7 @@ enum SafekeeperPostgresCommand {
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
}
|
||||
|
||||
fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
|
||||
fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
if cmd.starts_with("START_WAL_PUSH") {
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush)
|
||||
} else if cmd.starts_with("START_REPLICATION") {
|
||||
@@ -64,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
|
||||
cmd: serde_json::from_str(cmd)?,
|
||||
})
|
||||
} else {
|
||||
bail!("unsupported command {}", cmd);
|
||||
anyhow::bail!("unsupported command {cmd}");
|
||||
}
|
||||
}
|
||||
|
||||
impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
// tenant_id and timeline_id are passed in connection string params
|
||||
fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
|
||||
fn startup(
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
if let FeStartupPacket::StartupMessage { params, .. } = sm {
|
||||
if let Some(options) = params.options_raw() {
|
||||
for opt in options {
|
||||
@@ -79,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
// https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
|
||||
match opt.split_once('=') {
|
||||
Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
|
||||
self.tenant_id = Some(value.parse()?);
|
||||
self.tenant_id = Some(value.parse().with_context(|| {
|
||||
format!("Failed to parse {value} as tenant id")
|
||||
})?);
|
||||
}
|
||||
Some(("ztimelineid", value)) | Some(("timeline_id", value)) => {
|
||||
self.timeline_id = Some(value.parse()?);
|
||||
self.timeline_id = Some(value.parse().with_context(|| {
|
||||
format!("Failed to parse {value} as timeline id")
|
||||
})?);
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
@@ -95,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("Safekeeper received unexpected initial message: {:?}", sm);
|
||||
Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Safekeeper received unexpected initial message: {sm:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,20 +112,20 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
_pgb: &mut PostgresBackend,
|
||||
jwt_response: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), QueryError> {
|
||||
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
|
||||
// which requires auth to be present
|
||||
let data = self
|
||||
.conf
|
||||
.auth
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.decode(str::from_utf8(jwt_response)?)?;
|
||||
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
||||
|
||||
if matches!(data.claims.scope, Scope::Tenant) {
|
||||
ensure!(
|
||||
data.claims.tenant_id.is_some(),
|
||||
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"jwt token scope is Tenant, but tenant id is missing"
|
||||
)
|
||||
)));
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -128,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
if query_string
|
||||
.to_ascii_lowercase()
|
||||
.starts_with("set datestyle to ")
|
||||
@@ -149,39 +162,45 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
match cmd {
|
||||
let res = match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self),
|
||||
SafekeeperPostgresCommand::StartReplication { start_lsn } => {
|
||||
ReplicationConn::new(pgb).run(self, pgb, start_lsn)
|
||||
}
|
||||
SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb),
|
||||
SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd),
|
||||
}
|
||||
.context(format!(
|
||||
"Failed to process query for timeline {timeline_id}"
|
||||
))?;
|
||||
};
|
||||
|
||||
Ok(())
|
||||
match res {
|
||||
Ok(()) => Ok(()),
|
||||
Err(QueryError::Disconnected(connection_error)) => {
|
||||
info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}");
|
||||
Err(QueryError::Disconnected(connection_error))
|
||||
}
|
||||
Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!(
|
||||
"Failed to process query for timeline {}",
|
||||
self.ttid
|
||||
)))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
pub fn new(conf: SafeKeeperConf, auth: Option<Arc<JwtAuth>>) -> Self {
|
||||
pub fn new(conf: SafeKeeperConf) -> Self {
|
||||
SafekeeperPostgresHandler {
|
||||
conf,
|
||||
appname: None,
|
||||
tenant_id: None,
|
||||
timeline_id: None,
|
||||
ttid: TenantTimelineId::empty(),
|
||||
auth,
|
||||
claims: None,
|
||||
}
|
||||
}
|
||||
|
||||
// when accessing management api supply None as an argument
|
||||
// when using to authorize tenant pass corresponding tenant id
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
|
||||
if self.auth.is_none() {
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
|
||||
if self.conf.auth.is_none() {
|
||||
// auth is set to Trust, nothing to check so just return ok
|
||||
return Ok(());
|
||||
}
|
||||
@@ -198,7 +217,7 @@ impl SafekeeperPostgresHandler {
|
||||
///
|
||||
/// Handle IDENTIFY_SYSTEM replication command
|
||||
///
|
||||
fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
|
||||
fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> {
|
||||
let tli = GlobalTimelines::get(self.ttid)?;
|
||||
|
||||
let lsn = if self.is_walproposer_recovery() {
|
||||
|
||||
@@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
}
|
||||
|
||||
/// Safekeeper http router.
|
||||
pub fn make_router(
|
||||
conf: SafeKeeperConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
if auth.is_some() {
|
||||
if conf.auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
#[allow(clippy::mutable_key_type)]
|
||||
static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
|
||||
@@ -298,6 +295,7 @@ pub fn make_router(
|
||||
|
||||
// NB: on any changes do not forget to update the OpenAPI spec
|
||||
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
|
||||
let auth = conf.auth.clone();
|
||||
router
|
||||
.data(Arc::new(conf))
|
||||
.data(auth)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user