Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-16 18:02:56 +00:00)

Merge pull request #10053 from neondatabase/rc/release/2024-12-09

Storage release 2024-12-09
@@ -43,7 +43,8 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
[ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else
@@ -23,7 +23,8 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
[ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else
.github/workflows/_create-release-pr.yml (vendored, 2 lines changed)
@@ -21,7 +21,7 @@ defaults:
shell: bash -euo pipefail {0}

jobs:
create-storage-release-branch:
create-release-branch:
runs-on: ubuntu-22.04

permissions:
.github/workflows/benchmarking.yml (vendored, 2 lines changed)
@@ -249,7 +249,7 @@ jobs:

# Post both success and failure to the Slack channel
- name: Post to a Slack channel
if: ${{ github.event.schedule }}
if: ${{ github.event.schedule && !cancelled() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
.github/workflows/build_and_test.yml (vendored, 38 lines changed)
@@ -6,6 +6,7 @@ on:
- main
- release
- release-proxy
- release-compute
pull_request:

defaults:
@@ -70,8 +71,10 @@ jobs:
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
fi
shell: bash
@@ -513,7 +516,7 @@ jobs:
})

trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
needs: [ check-permissions, promote-images, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
@@ -669,7 +672,7 @@ jobs:
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}

- name: Build neon extensions test image
if: matrix.version.pg == 'v16'
if: matrix.version.pg >= 'v16'
uses: docker/build-push-action@v6
with:
context: .
@@ -684,8 +687,7 @@ jobs:
pull: true
file: compute/compute-node.Dockerfile
target: neon-pg-ext-test
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

@@ -708,7 +710,7 @@ jobs:
push: true
pull: true
file: compute/compute-node.Dockerfile
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
@@ -744,7 +746,7 @@ jobs:
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

- name: Create multi-arch neon-test-extensions image
if: matrix.version.pg == 'v16'
if: matrix.version.pg >= 'v16'
run: |
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
@@ -833,6 +835,7 @@ jobs:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
pg_version: [v16, v17]

runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

@@ -871,7 +874,10 @@ jobs:

- name: Verify docker-compose example and test extensions
timeout-minutes: 20
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
env:
TAG: ${{needs.tag.outputs.build-tag}}
TEST_VERSION_ONLY: ${{ matrix.pg_version }}
run: ./docker-compose/docker_compose_test.sh

- name: Print logs and clean up
if: always()
@@ -931,7 +937,7 @@ jobs:
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

- name: Configure AWS-prod credentials
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
@@ -940,12 +946,12 @@ jobs:

- name: Login to prod ECR
uses: docker/login-action@v3
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
with:
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com

- name: Copy all images to prod ECR
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
@@ -965,7 +971,7 @@ jobs:
tenant_id: ${{ vars.AZURE_TENANT_ID }}

push-to-acr-prod:
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ tag, promote-images ]
uses: ./.github/workflows/_push-to-acr.yml
with:
@@ -1053,7 +1059,7 @@ jobs:
deploy:
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()

runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1102,13 +1108,15 @@ jobs:
-f deployProxyAuthBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'"
exit 1
fi

- name: Create git tag
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
.github/workflows/ingest_benchmark.yml (vendored, 1 line changed)
@@ -26,6 +26,7 @@ concurrency:
jobs:
ingest:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
target_project: [new_empty_project, large_existing_project]
permissions:
.github/workflows/release.yml (vendored, 23 lines changed)
@@ -15,6 +15,10 @@ on:
type: boolean
description: 'Create Proxy release PR'
required: false
create-compute-release-branch:
type: boolean
description: 'Create Compute release PR'
required: false

# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
@@ -25,20 +29,20 @@ defaults:

jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}

permissions:
contents: write

uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Storage & Compute'
component-name: 'Storage'
release-branch: 'release'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }}

permissions:
contents: write
@@ -49,3 +53,16 @@ jobs:
release-branch: 'release-proxy'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

create-compute-release-branch:
if: inputs.create-compute-release-branch

permissions:
contents: write

uses: ./.github/workflows/_create-release-pr.yml
with:
component-name: 'Compute'
release-branch: 'release-compute'
secrets:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
.github/workflows/trigger-e2e-tests.yml (vendored, 2 lines changed)
@@ -51,6 +51,8 @@ jobs:
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
CODEOWNERS (32 lines changed)
@@ -1,15 +1,29 @@
/.github/ @neondatabase/developer-productivity
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage

# DevProd
/.github/ @neondatabase/developer-productivity

# Compute
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/vendor/ @neondatabase/compute
/compute/ @neondatabase/compute
/compute_tools/ @neondatabase/compute

# Proxy
/libs/proxy/ @neondatabase/proxy
/proxy/ @neondatabase/proxy

# Storage
/pageserver/ @neondatabase/storage
/safekeeper/ @neondatabase/storage
/storage_controller @neondatabase/storage
/storage_scrubber @neondatabase/storage
/vendor/ @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage

# Shared
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
Cargo.lock (generated, 328 lines changed)
@@ -84,16 +84,16 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.3.2"
|
||||
version = "0.6.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
|
||||
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"anstyle-parse",
|
||||
"anstyle-query",
|
||||
"anstyle-wincon",
|
||||
"colorchoice",
|
||||
"is-terminal",
|
||||
"is_terminal_polyfill",
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
@@ -123,19 +123,19 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "1.0.1"
|
||||
version = "3.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188"
|
||||
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.71"
|
||||
version = "1.0.94"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
|
||||
checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
]
|
||||
@@ -185,7 +185,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
@@ -197,7 +197,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -256,7 +256,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -267,7 +267,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -301,7 +301,7 @@ dependencies = [
|
||||
"aws-smithy-types",
|
||||
"aws-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"fastrand 2.2.0",
|
||||
"hex",
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.30",
|
||||
@@ -341,7 +341,7 @@ dependencies = [
|
||||
"aws-smithy-types",
|
||||
"aws-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"fastrand 2.2.0",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"once_cell",
|
||||
@@ -417,7 +417,7 @@ dependencies = [
|
||||
"aws-smithy-xml",
|
||||
"aws-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"fastrand 2.2.0",
|
||||
"hex",
|
||||
"hmac",
|
||||
"http 0.2.9",
|
||||
@@ -621,7 +621,7 @@ dependencies = [
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"fastrand 2.2.0",
|
||||
"h2 0.3.26",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
@@ -969,7 +969,7 @@ dependencies = [
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1031,9 +1031,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.5.0"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
|
||||
checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -1167,45 +1167,43 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.3.0"
|
||||
version = "4.5.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc"
|
||||
checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.3.0"
|
||||
version = "4.5.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
|
||||
checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"bitflags 1.3.2",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
"strsim 0.11.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.3.0"
|
||||
version = "4.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
dependencies = [
|
||||
"heck 0.4.1",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.5.0"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
|
||||
checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
@@ -1614,8 +1612,8 @@ dependencies = [
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn 2.0.52",
|
||||
"strsim 0.10.0",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1626,7 +1624,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1749,7 +1747,7 @@ dependencies = [
|
||||
"dsl_auto_type",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1769,7 +1767,7 @@ version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25"
|
||||
dependencies = [
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1792,7 +1790,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1812,10 +1810,10 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1947,7 +1945,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1980,7 +1978,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2054,9 +2052,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.0.0"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
|
||||
checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4"
|
||||
|
||||
[[package]]
|
||||
name = "ff"
|
||||
@@ -2234,7 +2232,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2337,7 +2335,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2465,12 +2463,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -2888,6 +2880,12 @@ dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.10.5"
|
||||
@@ -2912,6 +2910,23 @@ version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "jemalloc_pprof"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libc",
|
||||
"mappings",
|
||||
"once_cell",
|
||||
"pprof_util",
|
||||
"tempfile",
|
||||
"tikv-jemalloc-ctl",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.32"
|
||||
@@ -3022,9 +3037,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
version = "0.2.167"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
|
||||
checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
@@ -3044,9 +3059,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.13"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
|
||||
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
@@ -3079,6 +3094,19 @@ dependencies = [
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mappings"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"pprof_util",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.1.0"
|
||||
@@ -3139,10 +3167,10 @@ version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3346,6 +3374,7 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
@@ -3434,6 +3463,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
@@ -3497,9 +3527,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.18.0"
|
||||
version = "1.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
|
||||
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
@@ -3515,9 +3545,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.24.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96"
|
||||
checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
@@ -3529,9 +3559,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.13.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab"
|
||||
checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -3542,9 +3572,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.17.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727"
|
||||
checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-core",
|
||||
@@ -3560,9 +3590,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.7.0"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9"
|
||||
checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
@@ -3572,15 +3602,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.16.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05"
|
||||
checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.24.1"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df"
|
||||
checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-channel",
|
||||
@@ -3954,7 +3984,7 @@ dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4056,7 +4086,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4139,7 +4169,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4152,7 +4182,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -4165,7 +4195,6 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4177,7 +4206,6 @@ dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"hmac",
|
||||
"md-5",
|
||||
"memchr",
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
@@ -4188,7 +4216,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4298,6 +4326,19 @@ dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pprof_util"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"flate2",
|
||||
"num",
|
||||
"paste",
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
@@ -4334,7 +4375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4348,9 +4389,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.78"
|
||||
version = "1.0.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
|
||||
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -4414,7 +4455,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -4424,7 +4465,7 @@ dependencies = [
|
||||
"prost",
|
||||
"prost-types",
|
||||
"regex",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
@@ -4438,7 +4479,7 @@ dependencies = [
|
||||
"itertools 0.12.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4567,6 +4608,7 @@ dependencies = [
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres2",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-tungstenite",
|
||||
@@ -4992,9 +5034,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-middleware"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01"
|
||||
checksum = "d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -5007,13 +5049,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-retry"
|
||||
version = "0.5.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5"
|
||||
checksum = "29c73e4195a6bfbcb174b790d9b3407ab90646976c55de58a6515da25d851178"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"chrono",
|
||||
"futures",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
@@ -5022,6 +5063,7 @@ dependencies = [
|
||||
"reqwest 0.12.4",
|
||||
"reqwest-middleware",
|
||||
"retry-policies",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"wasm-timer",
|
||||
@@ -5029,9 +5071,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.5.3"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45"
|
||||
checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -5047,12 +5089,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "retry-policies"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810"
|
||||
checksum = "5875471e6cab2871bc150ecb8c727db5113c9338cc3354dc5ee3425b6aa40a1c"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
@@ -5176,7 +5216,7 @@ dependencies = [
|
||||
"regex",
|
||||
"relative-path",
|
||||
"rustc_version",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
@@ -5222,14 +5262,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.28"
|
||||
version = "0.38.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
|
||||
checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.4.13",
|
||||
"linux-raw-sys 0.4.14",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
@@ -5684,7 +5724,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5766,7 +5806,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6123,6 +6163,12 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.26.3"
|
||||
@@ -6135,11 +6181,11 @@ version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6190,9 +6236,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.52"
|
||||
version = "2.0.90"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
|
||||
checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -6222,7 +6268,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6253,13 +6299,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.9.0"
|
||||
version = "3.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
|
||||
checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand 2.0.0",
|
||||
"redox_syscall 0.4.1",
|
||||
"fastrand 2.2.0",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
@@ -6300,27 +6346,27 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.57"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b"
|
||||
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.57"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81"
|
||||
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6494,13 +6540,13 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6719,7 +6765,7 @@ dependencies = [
|
||||
"prost-build",
|
||||
"prost-types",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6756,9 +6802,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.40"
|
||||
version = "0.1.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
|
||||
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
|
||||
dependencies = [
|
||||
"log",
|
||||
"pin-project-lite",
|
||||
@@ -6779,20 +6825,20 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.27"
|
||||
version = "0.1.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
|
||||
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.32"
|
||||
version = "0.1.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
||||
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
@@ -6821,9 +6867,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.25.0"
|
||||
version = "0.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b"
|
||||
checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
@@ -6839,9 +6885,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-serde"
|
||||
version = "0.1.3"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
|
||||
checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"tracing-core",
|
||||
@@ -6849,9 +6895,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.18"
|
||||
version = "0.3.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
|
||||
checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"once_cell",
|
||||
@@ -7060,6 +7106,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"jemalloc_pprof",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -7258,7 +7305,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
@@ -7292,7 +7339,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
@@ -7646,8 +7693,12 @@ dependencies = [
|
||||
"memchr",
|
||||
"nix 0.26.4",
|
||||
"nom",
|
||||
"num",
|
||||
"num-bigint",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
"num-rational",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"parquet",
|
||||
@@ -7669,8 +7720,9 @@ dependencies = [
|
||||
"smallvec",
|
||||
"spki 0.7.3",
|
||||
"subtle",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
"sync_wrapper 0.1.2",
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemalloc-sys",
|
||||
"time",
|
||||
"time-macros",
|
||||
@@ -7769,7 +7821,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7790,7 +7842,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
Cargo.toml (21 lines changed)
@@ -74,7 +74,7 @@ bindgen = "0.70"
bit_field = "0.10.2"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
bytes = "1.9"
camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
@@ -115,6 +115,7 @@ indoc = "2"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jemalloc_pprof = "0.6"
jsonwebtoken = "9"
lasso = "0.7"
libc = "0.2"
@@ -127,10 +128,10 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.24"
opentelemetry_sdk = "0.24"
opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.16"
opentelemetry = "0.26"
opentelemetry_sdk = "0.26"
opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.26"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -144,9 +145,9 @@ rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
reqwest-middleware = "0.3.0"
reqwest-retry = "0.5"
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
@@ -175,7 +176,7 @@ sync_wrapper = "0.1.2"
tar = "0.4"
test-context = "0.3"
thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["stats"] }
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
@@ -191,7 +192,7 @@ tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2"
tracing-opentelemetry = "0.25"
tracing-opentelemetry = "0.27"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }
@@ -115,7 +115,7 @@ RUN set -e \

# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.13.1
ENV SQL_EXPORTER_VERSION=0.16.0
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \

@@ -358,10 +358,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
#
# vector 0.7.4 supports v17
# last release v0.7.4 - Aug 5, 2024
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \
echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \
# vector >0.7.4 supports v17
# last release v0.8.0 - Oct 30, 2024
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter

# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter

#########################################################################################
#

@@ -1367,15 +1367,12 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute

FROM neon-pg-ext-build AS neon-pg-ext-test
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
mkdir /ext-src
RUN mkdir /ext-src

#COPY --from=postgis-build /postgis.tar.gz /ext-src/
#COPY --from=postgis-build /sfcgal/* /usr
COPY --from=plv8-build /plv8.tar.gz /ext-src/
COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
COPY --from=vector-pg-build /pgvector.patch /ext-src/

@@ -1395,7 +1392,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY compute/patches/pg_hint_plan.patch /ext-src
COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY compute/patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src

@@ -1405,38 +1402,23 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
#pg_anon is not supported yet for pg v17 so, don't fail if nothing found
COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src
COPY compute/patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/ && for f in *.tar.gz; \
RUN cd /ext-src/ && for f in *.tar.gz; \
do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/rum-src && patch -p1 <../rum.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
patch -p1 </ext-src/pg_anon.patch
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
patch -p1 </ext-src/pg_cron.patch
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
ENV PGPORT=55433
@@ -6,6 +6,7 @@
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
import 'sql_exporter/compute_current_lsn.libsonnet',
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
import 'sql_exporter/compute_max_connections.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',
import 'sql_exporter/compute_subscriptions_count.libsonnet',
@@ -1,5 +1,9 @@
[databases]
*=host=localhost port=5432 auth_user=cloud_admin
;; pgbouncer propagates application_name (if it's specified) to the server, but some
;; clients don't set it. We set default application_name=pgbouncer to make it
;; easier to identify pgbouncer connections in Postgres. If client sets
;; application_name, it will be used instead.
*=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer
[pgbouncer]
listen_port=6432
listen_addr=0.0.0.0

@@ -0,0 +1,7 @@
SELECT
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
@@ -0,0 +1,17 @@
local neon = import 'neon.libsonnet';

local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';

{
metric_name: 'compute_logical_snapshots_bytes',
type: 'gauge',
help: 'Size of the pg_logical/snapshots directory, not including temporary files',
key_labels: [
'timeline_id',
],
values: [
'logical_snapshots_bytes',
],
query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
}
@@ -0,0 +1,9 @@
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;
compute/patches/pg_hint_plan_v17.patch (new file, 174 lines)
@@ -0,0 +1,174 @@
|
||||
diff --git a/expected/ut-A.out b/expected/ut-A.out
|
||||
index e7d68a1..65a056c 100644
|
||||
--- a/expected/ut-A.out
|
||||
+++ b/expected/ut-A.out
|
||||
@@ -9,13 +9,16 @@ SET search_path TO public;
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
-- No.A-1-2-3
|
||||
DROP EXTENSION pg_hint_plan;
|
||||
-- No.A-1-1-4
|
||||
CREATE SCHEMA other_schema;
|
||||
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
diff --git a/expected/ut-J.out b/expected/ut-J.out
|
||||
index 2fa3c70..314e929 100644
|
||||
--- a/expected/ut-J.out
|
||||
+++ b/expected/ut-J.out
|
||||
@@ -789,38 +789,6 @@ NestLoop(st1 st2)
|
||||
MergeJoin(t1 t2)
|
||||
not used hint:
|
||||
duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-NestLoop(st1 st2)
|
||||
-MergeJoin(t1 t2)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-NestLoop(st1 st2)
|
||||
-MergeJoin(t1 t2)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-NestLoop(st1 st2)
|
||||
-MergeJoin(t1 t2)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-NestLoop(st1 st2)
|
||||
-MergeJoin(t1 t2)
|
||||
-duplication hint:
|
||||
error hint:
|
||||
|
||||
explain_filter
|
||||
diff --git a/expected/ut-S.out b/expected/ut-S.out
|
||||
index 0bfcfb8..e75f581 100644
|
||||
--- a/expected/ut-S.out
|
||||
+++ b/expected/ut-S.out
|
||||
@@ -4415,34 +4415,6 @@ used hint:
|
||||
IndexScan(ti1 ti1_pred)
|
||||
not used hint:
|
||||
duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(ti1 ti1_pred)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(ti1 ti1_pred)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(ti1 ti1_pred)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(ti1 ti1_pred)
|
||||
-duplication hint:
|
||||
error hint:
|
||||
|
||||
explain_filter
|
||||
diff --git a/expected/ut-W.out b/expected/ut-W.out
|
||||
index a09bd34..0ad227c 100644
|
||||
--- a/expected/ut-W.out
|
||||
+++ b/expected/ut-W.out
|
||||
@@ -1341,54 +1341,6 @@ IndexScan(ft1)
|
||||
IndexScan(t)
|
||||
Parallel(s1 3 hard)
|
||||
duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(*VALUES*)
|
||||
-SeqScan(cte1)
|
||||
-IndexScan(ft1)
|
||||
-IndexScan(t)
|
||||
-Parallel(p1 5 hard)
|
||||
-Parallel(s1 3 hard)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(*VALUES*)
|
||||
-SeqScan(cte1)
|
||||
-IndexScan(ft1)
|
||||
-IndexScan(t)
|
||||
-Parallel(p1 5 hard)
|
||||
-Parallel(s1 3 hard)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(*VALUES*)
|
||||
-SeqScan(cte1)
|
||||
-IndexScan(ft1)
|
||||
-IndexScan(t)
|
||||
-Parallel(p1 5 hard)
|
||||
-Parallel(s1 3 hard)
|
||||
-duplication hint:
|
||||
-error hint:
|
||||
-
|
||||
-LOG: pg_hint_plan:
|
||||
-used hint:
|
||||
-not used hint:
|
||||
-IndexScan(*VALUES*)
|
||||
-SeqScan(cte1)
|
||||
-IndexScan(ft1)
|
||||
-IndexScan(t)
|
||||
-Parallel(p1 5 hard)
|
||||
-Parallel(s1 3 hard)
|
||||
-duplication hint:
|
||||
error hint:
|
||||
|
||||
explain_filter
|
||||
diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
|
||||
index 017fa4b..98d989b 100644
|
||||
--- a/expected/ut-fdw.out
|
||||
+++ b/expected/ut-fdw.out
|
||||
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
CREATE EXTENSION file_fdw;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
@@ -335,6 +335,7 @@ fn wait_spec(
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
pgversion: get_pg_version_string(pgbin),
|
||||
http_port,
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
@@ -389,7 +390,6 @@ fn wait_spec(
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
|
||||
})
|
||||
@@ -397,8 +397,6 @@ fn wait_spec(
|
||||
|
||||
struct WaitSpecResult {
|
||||
compute: Arc<ComputeNode>,
|
||||
// passed through from ProcessCliResult
|
||||
http_port: u16,
|
||||
resize_swap_on_bind: bool,
|
||||
set_disk_quota_for_fs: Option<String>,
|
||||
}
|
||||
@@ -408,7 +406,6 @@ fn start_postgres(
|
||||
#[allow(unused_variables)] matches: &clap::ArgMatches,
|
||||
WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
}: WaitSpecResult,
|
||||
@@ -481,12 +478,10 @@ fn start_postgres(
|
||||
}
|
||||
}
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
// Start Postgres
|
||||
let mut pg = None;
|
||||
if !prestartup_failed {
|
||||
pg = match compute.start_compute(extension_server_port) {
|
||||
pg = match compute.start_compute() {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
|
||||
@@ -79,6 +79,8 @@ pub struct ComputeNode {
/// - we push spec and it does configuration
/// - but then it is restarted without any spec again
pub live_config_allowed: bool,
/// The port that the compute's HTTP server listens on
pub http_port: u16,
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
/// To allow the HTTP API server to serve status requests while configuration
/// is in progress, lock should be held only for short periods of time to do
@@ -611,11 +613,7 @@ impl ComputeNode {
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
#[instrument(skip_all)]
|
||||
pub fn prepare_pgdata(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
extension_server_port: u16,
|
||||
) -> Result<()> {
|
||||
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
@@ -625,7 +623,7 @@ impl ComputeNode {
|
||||
config::write_postgres_conf(
|
||||
&pgdata_path.join("postgresql.conf"),
|
||||
&pspec.spec,
|
||||
Some(extension_server_port),
|
||||
self.http_port,
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
@@ -1243,14 +1241,9 @@ impl ComputeNode {
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
|
||||
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
|
||||
|
||||
// TODO(ololobus): We need a concurrency during reconfiguration as well,
|
||||
// but DB is already running and used by user. We can easily get out of
|
||||
// `max_connections` limit, and the current code won't handle that.
|
||||
// let compute_state = self.state.lock().unwrap().clone();
|
||||
// let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
|
||||
let max_concurrent_connections = 1;
|
||||
let max_concurrent_connections = spec.reconfigure_concurrency;
|
||||
|
||||
// Temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
||||
@@ -1284,10 +1277,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(
|
||||
&self,
|
||||
extension_server_port: u16,
|
||||
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -1362,7 +1352,7 @@ impl ComputeNode {
|
||||
info!("{:?}", remote_ext_metrics);
|
||||
}
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
@@ -37,7 +37,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
pub fn write_postgres_conf(
path: &Path,
spec: &ComputeSpec,
extension_server_port: Option<u16>,
extension_server_port: u16,
) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -127,9 +127,7 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}

if let Some(port) = extension_server_port {
writeln!(file, "neon.extension_server_port={}", port)?;
}
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;

// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.

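A minimal sketch of the new contract, not the actual compute_ctl code: the extension server port is now always known (it is the compute's own HTTP port, which shows up as localhost:3080 in the pg_hint_plan expected-output patch above), so the `neon.extension_server_port` GUC is written unconditionally instead of behind an `if let Some(..)`:

use std::io::Write;

// Sketch only: stands in for the relevant part of write_postgres_conf().
fn write_extension_server_port(out: &mut impl Write, extension_server_port: u16) -> std::io::Result<()> {
    writeln!(out, "neon.extension_server_port={}", extension_server_port)
}

fn main() -> std::io::Result<()> {
    let mut buf = Vec::new();
    write_extension_server_port(&mut buf, 3080)?;
    assert_eq!(buf, b"neon.extension_server_port=3080\n".to_vec());
    Ok(())
}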
@@ -53,6 +53,7 @@ use compute_api::spec::Role;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
@@ -310,6 +311,10 @@ impl Endpoint {
|
||||
conf.append("wal_log_hints", "off");
|
||||
conf.append("max_replication_slots", "10");
|
||||
conf.append("hot_standby", "on");
|
||||
// Set to 1MB to both exercise getPage requests/LFC, and still have enough room for
|
||||
// Postgres to operate. Everything smaller might be not enough for Postgres under load,
|
||||
// and can cause errors like 'no unpinned buffers available', see
|
||||
// <https://github.com/neondatabase/neon/issues/9956>
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
@@ -614,6 +619,7 @@ impl Endpoint {
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: 1,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -813,6 +819,7 @@ impl Endpoint {
|
||||
self.http_address.ip(),
|
||||
self.http_address.port()
|
||||
))
|
||||
.header(CONTENT_TYPE.as_str(), "application/json")
|
||||
.body(format!(
|
||||
"{{\"spec\":{}}}",
|
||||
serde_json::to_string_pretty(&spec)?
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::error::Error as _;
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
@@ -26,7 +27,7 @@ use crate::{
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
#[error("Reqwest error: {0}")]
|
||||
#[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
|
||||
Transport(#[from] reqwest::Error),
|
||||
|
||||
#[error("Error: {0}")]
|
||||
|
||||
@@ -560,14 +560,26 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
Command::TenantDescribe { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
let TenantDescribeResponse {
|
||||
tenant_id,
|
||||
shards,
|
||||
stripe_size,
|
||||
policy,
|
||||
config,
|
||||
} = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = describe_response.shards;
|
||||
println!("Tenant {tenant_id}");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.add_row(["Policy", &format!("{:?}", policy)]);
|
||||
table.add_row(["Stripe size", &format!("{:?}", stripe_size)]);
|
||||
table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]);
|
||||
println!("{table}");
|
||||
println!("Shards:");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||
for shard in shards {
|
||||
|
||||
@@ -4,14 +4,16 @@ ARG TAG=latest
|
||||
|
||||
FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
|
||||
|
||||
ARG COMPUTE_IMAGE
|
||||
|
||||
USER root
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl \
|
||||
jq \
|
||||
python3-pip \
|
||||
netcat
|
||||
netcat-openbsd
|
||||
#Faker is required for the pg_anon test
|
||||
RUN pip3 install Faker
|
||||
RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker
|
||||
#This is required for the pg_hint_plan test
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src
|
||||
|
||||
|
||||
@@ -30,10 +30,17 @@ cleanup() {
|
||||
docker compose --profile test-extensions -f $COMPOSE_FILE down
|
||||
}
|
||||
|
||||
for pg_version in 14 15 16; do
|
||||
for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
pg_version=${pg_version/v/}
|
||||
echo "clean up containers if exists"
|
||||
cleanup
|
||||
PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
|
||||
PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
|
||||
# Support for pg_anon has not yet been added to PG17, so we have to remove the corresponding option
|
||||
if [ $pg_version -eq 17 ]; then
|
||||
SPEC_PATH="compute_wrapper/var/db/postgres/specs"
|
||||
mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak
|
||||
jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json
|
||||
fi
|
||||
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
|
||||
|
||||
echo "wait until the compute is ready. timeout after 60s. "
|
||||
@@ -54,8 +61,7 @@ for pg_version in 14 15 16; do
|
||||
fi
|
||||
done
|
||||
|
||||
if [ $pg_version -ge 16 ]
|
||||
then
|
||||
if [ $pg_version -ge 16 ]; then
|
||||
echo Enabling trust connection
|
||||
docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
|
||||
echo Adding postgres role
|
||||
@@ -68,10 +74,13 @@ for pg_version in 14 15 16; do
|
||||
# The test assumes that it is running on the same host as the postgres engine.
# In our case it isn't, which is why we copy the files to the compute node
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
|
||||
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
|
||||
# Add support for pg_anon for pg_v16
|
||||
if [ $pg_version -ne 17 ]; then
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
|
||||
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
|
||||
rm -rf $TMPDIR
|
||||
fi
|
||||
TMPDIR=$(mktemp -d)
|
||||
# The following block does the same for the pg_hint_plan test
|
||||
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
|
||||
@@ -97,4 +106,8 @@ for pg_version in 14 15 16; do
|
||||
fi
|
||||
fi
|
||||
cleanup
|
||||
# Support for pg_anon has not yet been added to PG17, so we have to restore the option we removed above
|
||||
if [ $pg_version -eq 17 ]; then
|
||||
mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -19,6 +19,10 @@ pub type PgIdent = String;
|
||||
/// String type alias representing Postgres extension version
|
||||
pub type ExtVersion = String;
|
||||
|
||||
fn default_reconfigure_concurrency() -> usize {
|
||||
1
|
||||
}
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
@@ -67,7 +71,7 @@ pub struct ComputeSpec {
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
|
||||
/// An optinal hint that can be passed to speed up startup time if we know
|
||||
/// An optional hint that can be passed to speed up startup time if we know
|
||||
/// that no pg catalog mutations (like role creation, database creation,
|
||||
/// extension creation) need to be done on the actual database to start.
|
||||
#[serde(default)] // Default false
|
||||
@@ -86,9 +90,7 @@ pub struct ComputeSpec {
|
||||
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
pub tenant_id: Option<TenantId>,
|
||||
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
#[serde(default)]
|
||||
@@ -113,6 +115,20 @@ pub struct ComputeSpec {
|
||||
/// Local Proxy configuration used for JWT authentication
|
||||
#[serde(default)]
|
||||
pub local_proxy_config: Option<LocalProxySpec>,
|
||||
|
||||
/// Number of concurrent connections during the parallel RunInEachDatabase
/// phase of the apply config process.
///
/// We need a higher concurrency during reconfiguration in case of many DBs,
/// but the instance is already running and in use by the client, so we can
/// easily exceed the `max_connections` limit, and the current code won't handle that.
///
/// Default is 1, but we also allow the control plane to override this value for
/// specific projects. It's also recommended to bump `superuser_reserved_connections` +=
/// `reconfigure_concurrency` for such projects to ensure that we always have
/// enough spare connections for the reconfiguration process to succeed.
#[serde(default = "default_reconfigure_concurrency")]
pub reconfigure_concurrency: usize,
}

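A hedged illustration of that serde default, assuming serde (with derive) and serde_json are available; MiniSpec is a hypothetical trimmed-down stand-in, not the real ComputeSpec. Specs serialized before the field existed still deserialize and pick up the default of 1:

use serde::Deserialize;

fn default_reconfigure_concurrency() -> usize {
    1
}

// Hypothetical mini-spec used only to demonstrate the default.
#[derive(Deserialize, Debug)]
struct MiniSpec {
    #[serde(default = "default_reconfigure_concurrency")]
    reconfigure_concurrency: usize,
}

fn main() {
    // A spec written before the field existed: the default kicks in.
    let old: MiniSpec = serde_json::from_str("{}").unwrap();
    assert_eq!(old.reconfigure_concurrency, 1);

    // A spec where the control plane overrides the value.
    let new: MiniSpec = serde_json::from_str(r#"{"reconfigure_concurrency": 4}"#).unwrap();
    assert_eq!(new.reconfigure_concurrency, 4);
}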
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -315,6 +331,9 @@ mod tests {
|
||||
|
||||
// Features list defaults to empty vector.
|
||||
assert!(spec.features.is_empty());
|
||||
|
||||
// Reconfigure concurrency defaults to 1.
|
||||
assert_eq!(spec.reconfigure_concurrency, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -442,7 +442,14 @@ impl Default for ConfigToml {
|
||||
tenant_config: TenantConfigToml::default(),
|
||||
no_sync: None,
|
||||
wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
|
||||
page_service_pipelining: PageServicePipeliningConfig::Serial,
|
||||
page_service_pipelining: if !cfg!(test) {
|
||||
PageServicePipeliningConfig::Serial
|
||||
} else {
|
||||
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size: NonZeroUsize::new(32).unwrap(),
|
||||
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
|
||||
})
|
||||
},
|
||||
}
|
||||
}
|
||||
}
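A tiny hedged sketch of the cfg!(test) pattern used above: cfg!(test) is a compile-time boolean, so the same Default impl picks the pipelined variant only when the crate is built with its own test harness, while production builds keep the serial default.

fn default_mode() -> &'static str {
    // Unit tests of this crate see "pipelined"; a normal build sees "serial".
    if cfg!(test) {
        "pipelined (max_batch_size = 32, concurrent futures)"
    } else {
        "serial"
    }
}

fn main() {
    println!("default page service pipelining: {}", default_mode());
}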
|
||||
|
||||
@@ -48,7 +48,7 @@ pub struct TenantCreateResponse {
|
||||
pub shards: Vec<TenantCreateResponseShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct NodeRegisterRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
@@ -75,7 +75,7 @@ pub struct TenantPolicyRequest {
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct AvailabilityZone(pub String);
|
||||
|
||||
impl Display for AvailabilityZone {
|
||||
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Scheduling policy enables us to selectively disable some automatic actions that the
|
||||
/// controller performs on a tenant shard. This is only set to a non-default value by
|
||||
/// human intervention, and it is reset to the default value (Active) when the tenant's
|
||||
/// placement policy is modified away from Attached.
|
||||
///
|
||||
/// The typical use of a non-Active scheduling policy is one of:
|
||||
/// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
|
||||
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
|
||||
///
|
||||
/// If you're not sure which policy to use to pin a shard to its current location, you probably
|
||||
/// want Pause.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum ShardSchedulingPolicy {
|
||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||
|
||||
@@ -770,6 +770,11 @@ impl Key {
|
||||
&& self.field6 == 1
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_aux_file_key(&self) -> bool {
|
||||
self.field1 == AUX_KEY_PREFIX
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
|
||||
@@ -501,7 +501,9 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
pub struct ThrottleConfig {
|
||||
pub task_kinds: Vec<String>, // TaskKind
|
||||
/// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`.
|
||||
#[serde(rename = "task_kinds")]
|
||||
pub enabled: ThrottleConfigTaskKinds,
|
||||
pub initial: u32,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub refill_interval: Duration,
|
||||
@@ -509,10 +511,38 @@ pub struct ThrottleConfig {
|
||||
pub max: u32,
|
||||
}
|
||||
|
||||
/// Before <https://github.com/neondatabase/neon/pull/9962>
/// the throttle was applied per `Timeline::get`/`Timeline::get_vectored` call.
/// The `task_kinds` field controlled which Pageserver "Task Kind"s
/// were subject to the throttle.
///
/// After that PR, the throttle is applied at pagestream request level
/// and the `task_kinds` field does not apply, since the only task kind
/// that is subject to the throttle is that of the page service.
///
/// However, we don't want to make a breaking config change right now
/// because it means we have to migrate all the tenant configs.
/// This will be done in a future PR.
///
/// In the meantime, we use emptiness / non-emptiness of the `task_kinds`
/// field to determine if the throttle is enabled or not.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(transparent)]
pub struct ThrottleConfigTaskKinds(Vec<String>);
|
||||
|
||||
impl ThrottleConfigTaskKinds {
|
||||
pub fn disabled() -> Self {
|
||||
Self(vec![])
|
||||
}
|
||||
pub fn is_enabled(&self) -> bool {
|
||||
!self.0.is_empty()
|
||||
}
|
||||
}
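A small complementary sketch, assuming serde/serde_json; TaskKinds here is a hypothetical stand-in for ThrottleConfigTaskKinds. Because of #[serde(transparent)], the wrapper deserializes straight from the JSON array, so an empty `task_kinds` list reads back as a disabled throttle, mirroring the enabled case exercised by the tests below:

use serde::Deserialize;

// Hypothetical stand-in for ThrottleConfigTaskKinds.
#[derive(Deserialize)]
#[serde(transparent)]
struct TaskKinds(Vec<String>);

fn main() {
    let kinds: TaskKinds = serde_json::from_str("[]").unwrap();
    // An empty list means the throttle is disabled (is_enabled() would return false).
    assert!(kinds.0.is_empty());
}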
|
||||
|
||||
impl ThrottleConfig {
|
||||
pub fn disabled() -> Self {
|
||||
Self {
|
||||
task_kinds: vec![], // effectively disables the throttle
|
||||
enabled: ThrottleConfigTaskKinds::disabled(),
|
||||
// other values don't matter with empty `task_kinds`.
|
||||
initial: 0,
|
||||
refill_interval: Duration::from_millis(1),
|
||||
@@ -526,6 +556,30 @@ impl ThrottleConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod throttle_config_tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_disabled_is_disabled() {
|
||||
let config = ThrottleConfig::disabled();
|
||||
assert!(!config.enabled.is_enabled());
|
||||
}
|
||||
#[test]
|
||||
fn test_enabled_backwards_compat() {
|
||||
let input = serde_json::json!({
|
||||
"task_kinds": ["PageRequestHandler"],
|
||||
"initial": 40000,
|
||||
"refill_interval": "50ms",
|
||||
"refill_amount": 1000,
|
||||
"max": 40000,
|
||||
"fair": true
|
||||
});
|
||||
let config: ThrottleConfig = serde_json::from_value(input).unwrap();
|
||||
assert!(config.enabled.is_enabled());
|
||||
}
|
||||
}
|
||||
|
||||
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
|
||||
/// lists out all possible states (and the virtual "Detached" state)
|
||||
/// in a flat form rather than using rust-style enums.
|
||||
|
||||
@@ -158,7 +158,8 @@ impl ShardIdentity {
|
||||
key_to_shard_number(self.count, self.stripe_size, key)
|
||||
}
|
||||
|
||||
/// Return true if the key should be ingested by this shard
|
||||
/// Return true if the key is stored only on this shard. This does not include
|
||||
/// global keys, see is_key_global().
|
||||
///
|
||||
/// Shards must ingest _at least_ keys which return true from this check.
|
||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||
@@ -170,19 +171,37 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the key should be stored on all shards, not just one.
|
||||
pub fn is_key_global(&self, key: &Key) -> bool {
|
||||
if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
|
||||
// Special keys that are only stored on shard 0
|
||||
false
|
||||
} else if key.is_rel_block_key() {
|
||||
// Ordinary relation blocks are distributed across shards
|
||||
false
|
||||
} else if key.is_rel_size_key() {
|
||||
// All shards maintain rel size keys (although only shard 0 is responsible for
|
||||
// keeping it strictly accurate, other shards just reflect the highest block they've ingested)
|
||||
true
|
||||
} else {
|
||||
// For everything else, we assume it must be kept everywhere, because ingest code
|
||||
// might assume this -- this covers functionality where the ingest code has
|
||||
// not (yet) been made fully shard aware.
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split.
|
||||
///
|
||||
/// Shards _may_ drop keys which return false here, but are not obliged to.
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A1: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
// A2: because key_is_shard0 also covers relation size keys, which are written
|
||||
// on all shards even though they're only maintained accurately on shard 0.
|
||||
if self.count < ShardCount(2) {
|
||||
// Fast path: unsharded tenant doesn't dispose of anything
|
||||
return false;
|
||||
}
|
||||
|
||||
if self.is_key_global(key) {
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
|
||||
@@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder {
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct StartupMessageParams {
|
||||
params: Bytes,
|
||||
pub params: Bytes,
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
@@ -565,6 +565,8 @@ pub enum BeMessage<'a> {
|
||||
/// Batch of interpreted, shard filtered WAL records,
|
||||
/// ready for the pageserver to ingest
|
||||
InterpretedWalRecords(InterpretedWalRecordsBody<'a>),
|
||||
|
||||
Raw(u8, &'a [u8]),
|
||||
}
|
||||
|
||||
/// Common shorthands.
|
||||
@@ -754,6 +756,10 @@ impl BeMessage<'_> {
|
||||
/// one more buffer.
|
||||
pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> {
|
||||
match message {
|
||||
BeMessage::Raw(code, data) => {
|
||||
buf.put_u8(*code);
|
||||
write_body(buf, |b| b.put_slice(data))
|
||||
}
|
||||
BeMessage::AuthenticationOk => {
|
||||
buf.put_u8(b'R');
|
||||
write_body(buf, |buf| {
|
||||
|
||||
@@ -10,7 +10,6 @@ byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
hmac.workspace = true
|
||||
md-5 = "0.10"
|
||||
memchr = "2.0"
|
||||
rand.workspace = true
|
||||
sha2.workspace = true
|
||||
|
||||
@@ -1,37 +1,2 @@
|
||||
//! Authentication protocol support.
|
||||
use md5::{Digest, Md5};
|
||||
|
||||
pub mod sasl;
|
||||
|
||||
/// Hashes authentication information in a way suitable for use in response
|
||||
/// to an `AuthenticationMd5Password` message.
|
||||
///
|
||||
/// The resulting string should be sent back to the database in a
|
||||
/// `PasswordMessage` message.
|
||||
#[inline]
|
||||
pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String {
|
||||
let mut md5 = Md5::new();
|
||||
md5.update(password);
|
||||
md5.update(username);
|
||||
let output = md5.finalize_reset();
|
||||
md5.update(format!("{:x}", output));
|
||||
md5.update(salt);
|
||||
format!("md5{:x}", md5.finalize())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn md5() {
|
||||
let username = b"md5_user";
|
||||
let password = b"password";
|
||||
let salt = [0x2a, 0x3d, 0x8f, 0xe0];
|
||||
|
||||
assert_eq!(
|
||||
md5_hash(username, password, salt),
|
||||
"md562af4dd09bbb41884907a838a3233294"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,7 +117,7 @@ enum Credentials<const N: usize> {
|
||||
/// A regular password as a vector of bytes.
|
||||
Password(Vec<u8>),
|
||||
/// A precomputed pair of keys.
|
||||
Keys(Box<ScramKeys<N>>),
|
||||
Keys(ScramKeys<N>),
|
||||
}
|
||||
|
||||
enum State {
|
||||
@@ -176,7 +176,7 @@ impl ScramSha256 {
|
||||
|
||||
/// Constructs a new instance which will use the provided key pair for authentication.
|
||||
pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 {
|
||||
let password = Credentials::Keys(keys.into());
|
||||
let password = Credentials::Keys(keys);
|
||||
ScramSha256::new_inner(password, channel_binding, nonce())
|
||||
}
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ pub enum Message {
|
||||
AuthenticationCleartextPassword,
|
||||
AuthenticationGss,
|
||||
AuthenticationKerberosV5,
|
||||
AuthenticationMd5Password(AuthenticationMd5PasswordBody),
|
||||
AuthenticationMd5Password,
|
||||
AuthenticationOk,
|
||||
AuthenticationScmCredential,
|
||||
AuthenticationSspi,
|
||||
@@ -191,11 +191,7 @@ impl Message {
|
||||
0 => Message::AuthenticationOk,
|
||||
2 => Message::AuthenticationKerberosV5,
|
||||
3 => Message::AuthenticationCleartextPassword,
|
||||
5 => {
|
||||
let mut salt = [0; 4];
|
||||
buf.read_exact(&mut salt)?;
|
||||
Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt })
|
||||
}
|
||||
5 => Message::AuthenticationMd5Password,
|
||||
6 => Message::AuthenticationScmCredential,
|
||||
7 => Message::AuthenticationGss,
|
||||
8 => Message::AuthenticationGssContinue,
|
||||
@@ -541,6 +537,10 @@ impl NoticeResponseBody {
|
||||
pub fn fields(&self) -> ErrorFields<'_> {
|
||||
ErrorFields { buf: &self.storage }
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.storage
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NotificationResponseBody {
|
||||
|
||||
@@ -255,22 +255,34 @@ pub fn ssl_request(buf: &mut BytesMut) {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = (&'a str, &'a str)>,
|
||||
{
|
||||
pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> {
|
||||
write_body(buf, |buf| {
|
||||
// postgres protocol version 3.0 (196608) in big-endian
|
||||
buf.put_i32(0x00_03_00_00);
|
||||
for (key, value) in parameters {
|
||||
write_cstr(key.as_bytes(), buf)?;
|
||||
write_cstr(value.as_bytes(), buf)?;
|
||||
}
|
||||
buf.put_slice(¶meters.params);
|
||||
buf.put_u8(0);
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
||||
pub struct StartupMessageParams {
|
||||
pub params: BytesMut,
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
/// Set a parameter's value by its name.
|
||||
pub fn insert(&mut self, name: &str, value: &str) {
|
||||
if name.contains('\0') || value.contains('\0') {
|
||||
panic!("startup parameter name or value contained a null")
|
||||
}
|
||||
self.params.put_slice(name.as_bytes());
|
||||
self.params.put_u8(0);
|
||||
self.params.put_slice(value.as_bytes());
|
||||
self.params.put_u8(0);
|
||||
}
|
||||
}
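A hedged sketch of the wire layout this builds, assuming the bytes crate; the parameter names/values are illustrative. Each insert appends "<name>\0<value>\0", and startup_message() above copies the buffer verbatim into the startup packet body before the final terminating NUL:

use bytes::{BufMut, BytesMut};

fn main() {
    // Mirrors what StartupMessageParams::insert does for two parameters.
    let mut params = BytesMut::new();
    for (name, value) in [("user", "postgres"), ("database", "neondb")] {
        params.put_slice(name.as_bytes());
        params.put_u8(0);
        params.put_slice(value.as_bytes());
        params.put_u8(0);
    }
    assert_eq!(&params[..], &b"user\0postgres\0database\0neondb\0"[..]);
}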
|
||||
|
||||
#[inline]
|
||||
pub fn sync(buf: &mut BytesMut) {
|
||||
buf.put_u8(b'S');
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
use crate::authentication::sasl;
|
||||
use hmac::{Hmac, Mac};
|
||||
use md5::Md5;
|
||||
use rand::RngCore;
|
||||
use sha2::digest::FixedOutput;
|
||||
use sha2::{Digest, Sha256};
|
||||
@@ -88,20 +87,3 @@ pub(crate) async fn scram_sha_256_salt(
|
||||
base64::encode(server_key)
|
||||
)
|
||||
}
|
||||
|
||||
/// **Not recommended, as MD5 is not considered to be secure.**
|
||||
///
|
||||
/// Hash password using MD5 with the username as the salt.
|
||||
///
|
||||
/// The client may assume the returned string doesn't contain any
|
||||
/// special characters that would require escaping.
|
||||
pub fn md5(password: &[u8], username: &str) -> String {
|
||||
// salt password with username
|
||||
let mut salted_password = Vec::from(password);
|
||||
salted_password.extend_from_slice(username.as_bytes());
|
||||
|
||||
let mut hash = Md5::new();
|
||||
hash.update(&salted_password);
|
||||
let digest = hash.finalize();
|
||||
format!("md5{:x}", digest)
|
||||
}
|
||||
|
||||
@@ -9,11 +9,3 @@ async fn test_encrypt_scram_sha_256() {
|
||||
"SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA="
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encrypt_md5() {
|
||||
assert_eq!(
|
||||
password::md5(b"secret", "foo"),
|
||||
"md54ab2c5d00339c4b2a4e921d2dc4edec7"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -10,10 +10,10 @@ use tokio::net::TcpStream;
|
||||
/// connection.
|
||||
#[derive(Clone)]
|
||||
pub struct CancelToken {
|
||||
pub(crate) socket_config: Option<SocketConfig>,
|
||||
pub(crate) ssl_mode: SslMode,
|
||||
pub(crate) process_id: i32,
|
||||
pub(crate) secret_key: i32,
|
||||
pub socket_config: Option<SocketConfig>,
|
||||
pub ssl_mode: SslMode,
|
||||
pub process_id: i32,
|
||||
pub secret_key: i32,
|
||||
}
|
||||
|
||||
impl CancelToken {
|
||||
|
||||
@@ -138,7 +138,7 @@ impl InnerClient {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct SocketConfig {
|
||||
pub struct SocketConfig {
|
||||
pub host: Host,
|
||||
pub port: u16,
|
||||
pub connect_timeout: Option<Duration>,
|
||||
@@ -152,7 +152,7 @@ pub(crate) struct SocketConfig {
|
||||
pub struct Client {
|
||||
inner: Arc<InnerClient>,
|
||||
|
||||
socket_config: Option<SocketConfig>,
|
||||
socket_config: SocketConfig,
|
||||
ssl_mode: SslMode,
|
||||
process_id: i32,
|
||||
secret_key: i32,
|
||||
@@ -161,6 +161,7 @@ pub struct Client {
|
||||
impl Client {
|
||||
pub(crate) fn new(
|
||||
sender: mpsc::UnboundedSender<Request>,
|
||||
socket_config: SocketConfig,
|
||||
ssl_mode: SslMode,
|
||||
process_id: i32,
|
||||
secret_key: i32,
|
||||
@@ -172,7 +173,7 @@ impl Client {
|
||||
buffer: Default::default(),
|
||||
}),
|
||||
|
||||
socket_config: None,
|
||||
socket_config,
|
||||
ssl_mode,
|
||||
process_id,
|
||||
secret_key,
|
||||
@@ -188,10 +189,6 @@ impl Client {
|
||||
&self.inner
|
||||
}
|
||||
|
||||
pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) {
|
||||
self.socket_config = Some(socket_config);
|
||||
}
|
||||
|
||||
/// Creates a new prepared statement.
|
||||
///
|
||||
/// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
|
||||
@@ -412,7 +409,7 @@ impl Client {
|
||||
/// connection associated with this client.
|
||||
pub fn cancel_token(&self) -> CancelToken {
|
||||
CancelToken {
|
||||
socket_config: self.socket_config.clone(),
|
||||
socket_config: Some(self.socket_config.clone()),
|
||||
ssl_mode: self.ssl_mode,
|
||||
process_id: self.process_id,
|
||||
secret_key: self.secret_key,
|
||||
|
||||
@@ -35,9 +35,7 @@ impl FallibleIterator for BackendMessages {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresCodec {
|
||||
pub max_message_size: Option<usize>,
|
||||
}
|
||||
pub struct PostgresCodec;
|
||||
|
||||
impl Encoder<FrontendMessage> for PostgresCodec {
|
||||
type Error = io::Error;
|
||||
@@ -66,15 +64,6 @@ impl Decoder for PostgresCodec {
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(max) = self.max_message_size {
|
||||
if len > max {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"message too large",
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
match header.tag() {
|
||||
backend::NOTICE_RESPONSE_TAG
|
||||
| backend::NOTIFICATION_RESPONSE_TAG
|
||||
|
||||
@@ -2,29 +2,19 @@
|
||||
|
||||
use crate::connect::connect;
|
||||
use crate::connect_raw::connect_raw;
|
||||
use crate::connect_raw::RawConnection;
|
||||
use crate::tls::MakeTlsConnect;
|
||||
use crate::tls::TlsConnect;
|
||||
use crate::{Client, Connection, Error};
|
||||
use std::borrow::Cow;
|
||||
use postgres_protocol2::message::frontend::StartupMessageParams;
|
||||
use std::fmt;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use std::{error, fmt, iter, mem};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
|
||||
pub use postgres_protocol2::authentication::sasl::ScramKeys;
|
||||
use tokio::net::TcpStream;
|
||||
|
||||
/// Properties required of a session.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
#[non_exhaustive]
|
||||
pub enum TargetSessionAttrs {
|
||||
/// No special properties are required.
|
||||
Any,
|
||||
/// The session must allow writes.
|
||||
ReadWrite,
|
||||
}
|
||||
|
||||
/// TLS configuration.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
#[non_exhaustive]
|
||||
@@ -74,119 +64,37 @@ pub enum AuthKeys {
|
||||
}
|
||||
|
||||
/// Connection configuration.
|
||||
///
|
||||
/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats:
|
||||
///
|
||||
/// # Key-Value
|
||||
///
|
||||
/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain
|
||||
/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped.
|
||||
///
|
||||
/// ## Keys
|
||||
///
|
||||
/// * `user` - The username to authenticate with. Required.
|
||||
/// * `password` - The password to authenticate with.
|
||||
/// * `dbname` - The name of the database to connect to. Defaults to the username.
|
||||
/// * `options` - Command line options used to configure the server.
|
||||
/// * `application_name` - Sets the `application_name` parameter on the server.
|
||||
/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used
|
||||
/// if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`.
|
||||
/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the
|
||||
/// path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts
|
||||
/// can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting
|
||||
/// with the `connect` method.
|
||||
/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be
|
||||
/// either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if
|
||||
/// omitted or the empty string.
|
||||
/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames
|
||||
/// can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout.
|
||||
/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that
|
||||
/// the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server
|
||||
/// in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`.
|
||||
/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel
|
||||
/// binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise.
|
||||
/// If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`.
|
||||
///
|
||||
/// ## Examples
|
||||
///
|
||||
/// ```not_rust
|
||||
/// host=localhost user=postgres connect_timeout=10 keepalives=0
|
||||
/// ```
|
||||
///
|
||||
/// ```not_rust
|
||||
/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces'
|
||||
/// ```
|
||||
///
|
||||
/// ```not_rust
|
||||
/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write
|
||||
/// ```
|
||||
///
|
||||
/// # Url
|
||||
///
|
||||
/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional,
|
||||
/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple
|
||||
/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded,
|
||||
/// as the path component of the URL specifies the database name.
|
||||
///
|
||||
/// ## Examples
|
||||
///
|
||||
/// ```not_rust
|
||||
/// postgresql://user@localhost
|
||||
/// ```
|
||||
///
|
||||
/// ```not_rust
|
||||
/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10
|
||||
/// ```
|
||||
///
|
||||
/// ```not_rust
|
||||
/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write
|
||||
/// ```
|
||||
///
|
||||
/// ```not_rust
|
||||
/// postgresql:///mydb?user=user&host=/var/lib/postgresql
|
||||
/// ```
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct Config {
|
||||
pub(crate) user: Option<String>,
|
||||
pub(crate) host: Host,
|
||||
pub(crate) port: u16,
|
||||
|
||||
pub(crate) password: Option<Vec<u8>>,
|
||||
pub(crate) auth_keys: Option<Box<AuthKeys>>,
|
||||
pub(crate) dbname: Option<String>,
|
||||
pub(crate) options: Option<String>,
|
||||
pub(crate) application_name: Option<String>,
|
||||
pub(crate) ssl_mode: SslMode,
|
||||
pub(crate) host: Vec<Host>,
|
||||
pub(crate) port: Vec<u16>,
|
||||
pub(crate) connect_timeout: Option<Duration>,
|
||||
pub(crate) target_session_attrs: TargetSessionAttrs,
|
||||
pub(crate) channel_binding: ChannelBinding,
|
||||
pub(crate) replication_mode: Option<ReplicationMode>,
|
||||
pub(crate) max_backend_message_size: Option<usize>,
|
||||
}
|
||||
pub(crate) server_params: StartupMessageParams,
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Config {
|
||||
Config::new()
|
||||
}
|
||||
database: bool,
|
||||
username: bool,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Creates a new configuration.
|
||||
pub fn new() -> Config {
|
||||
pub fn new(host: String, port: u16) -> Config {
|
||||
Config {
|
||||
user: None,
|
||||
host: Host::Tcp(host),
|
||||
port,
|
||||
password: None,
|
||||
auth_keys: None,
|
||||
dbname: None,
|
||||
options: None,
|
||||
application_name: None,
|
||||
ssl_mode: SslMode::Prefer,
|
||||
host: vec![],
|
||||
port: vec![],
|
||||
connect_timeout: None,
|
||||
target_session_attrs: TargetSessionAttrs::Any,
|
||||
channel_binding: ChannelBinding::Prefer,
|
||||
replication_mode: None,
|
||||
max_backend_message_size: None,
|
||||
server_params: StartupMessageParams::default(),
|
||||
|
||||
database: false,
|
||||
username: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,14 +102,13 @@ impl Config {
|
||||
///
|
||||
/// Required.
|
||||
pub fn user(&mut self, user: &str) -> &mut Config {
|
||||
self.user = Some(user.to_string());
|
||||
self
|
||||
self.set_param("user", user)
|
||||
}
|
||||
|
||||
/// Gets the user to authenticate with, if one has been configured with
|
||||
/// the `user` method.
|
||||
pub fn get_user(&self) -> Option<&str> {
|
||||
self.user.as_deref()
|
||||
pub fn user_is_set(&self) -> bool {
|
||||
self.username
|
||||
}
|
||||
|
||||
/// Sets the password to authenticate with.
|
||||
@@ -237,40 +144,26 @@ impl Config {
|
||||
///
|
||||
/// Defaults to the user.
|
||||
pub fn dbname(&mut self, dbname: &str) -> &mut Config {
|
||||
self.dbname = Some(dbname.to_string());
|
||||
self
|
||||
self.set_param("database", dbname)
|
||||
}
|
||||
|
||||
/// Gets the name of the database to connect to, if one has been configured
|
||||
/// with the `dbname` method.
|
||||
pub fn get_dbname(&self) -> Option<&str> {
|
||||
self.dbname.as_deref()
|
||||
pub fn db_is_set(&self) -> bool {
|
||||
self.database
|
||||
}
|
||||
|
||||
/// Sets command line options used to configure the server.
|
||||
pub fn options(&mut self, options: &str) -> &mut Config {
|
||||
self.options = Some(options.to_string());
|
||||
pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config {
|
||||
if name == "database" {
|
||||
self.database = true;
|
||||
} else if name == "user" {
|
||||
self.username = true;
|
||||
}
|
||||
|
||||
self.server_params.insert(name, value);
|
||||
self
|
||||
}
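A hedged, self-contained sketch of the bookkeeping pattern introduced here; MiniConfig is hypothetical, not the crate's Config. Every option becomes a startup parameter in one buffer, and the struct only remembers whether the two parameters it cares about, user and database, were ever set:

// Hypothetical MiniConfig mirroring the set_param() bookkeeping above.
#[derive(Default)]
struct MiniConfig {
    server_params: Vec<(String, String)>,
    database: bool,
    username: bool,
}

impl MiniConfig {
    fn set_param(&mut self, name: &str, value: &str) -> &mut Self {
        if name == "database" {
            self.database = true;
        } else if name == "user" {
            self.username = true;
        }
        self.server_params.push((name.to_string(), value.to_string()));
        self
    }
}

fn main() {
    let mut config = MiniConfig::default();
    config
        .set_param("user", "postgres")
        .set_param("application_name", "proxy");
    assert!(config.username);
    assert!(!config.database);
    assert_eq!(config.server_params.len(), 2);
}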
|
||||
|
||||
/// Gets the command line options used to configure the server, if the
|
||||
/// options have been set with the `options` method.
|
||||
pub fn get_options(&self) -> Option<&str> {
|
||||
self.options.as_deref()
|
||||
}
|
||||
|
||||
/// Sets the value of the `application_name` runtime parameter.
|
||||
pub fn application_name(&mut self, application_name: &str) -> &mut Config {
|
||||
self.application_name = Some(application_name.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Gets the value of the `application_name` runtime parameter, if it has
|
||||
/// been set with the `application_name` method.
|
||||
pub fn get_application_name(&self) -> Option<&str> {
|
||||
self.application_name.as_deref()
|
||||
}
|
||||
|
||||
/// Sets the SSL configuration.
|
||||
///
|
||||
/// Defaults to `prefer`.
|
||||
@@ -284,32 +177,14 @@ impl Config {
|
||||
self.ssl_mode
|
||||
}
|
||||
|
||||
/// Adds a host to the configuration.
|
||||
///
|
||||
/// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order.
|
||||
pub fn host(&mut self, host: &str) -> &mut Config {
|
||||
self.host.push(Host::Tcp(host.to_string()));
|
||||
self
|
||||
}
|
||||
|
||||
/// Gets the hosts that have been added to the configuration with `host`.
|
||||
pub fn get_hosts(&self) -> &[Host] {
|
||||
pub fn get_host(&self) -> &Host {
|
||||
&self.host
|
||||
}
|
||||
|
||||
/// Adds a port to the configuration.
|
||||
///
|
||||
/// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which
|
||||
/// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports
|
||||
/// as hosts.
|
||||
pub fn port(&mut self, port: u16) -> &mut Config {
|
||||
self.port.push(port);
|
||||
self
|
||||
}
|
||||
|
||||
/// Gets the ports that have been added to the configuration with `port`.
|
||||
pub fn get_ports(&self) -> &[u16] {
|
||||
&self.port
|
||||
pub fn get_port(&self) -> u16 {
|
||||
self.port
|
||||
}
|
||||
|
||||
/// Sets the timeout applied to socket-level connection attempts.
|
||||
@@ -327,23 +202,6 @@ impl Config {
|
||||
self.connect_timeout.as_ref()
|
||||
}
|
||||
|
||||
/// Sets the requirements of the session.
|
||||
///
|
||||
/// This can be used to connect to the primary server in a clustered database rather than one of the read-only
|
||||
/// secondary servers. Defaults to `Any`.
|
||||
pub fn target_session_attrs(
|
||||
&mut self,
|
||||
target_session_attrs: TargetSessionAttrs,
|
||||
) -> &mut Config {
|
||||
self.target_session_attrs = target_session_attrs;
|
||||
self
|
||||
}
|
||||
|
||||
/// Gets the requirements of the session.
|
||||
pub fn get_target_session_attrs(&self) -> TargetSessionAttrs {
|
||||
self.target_session_attrs
|
||||
}
|
||||
|
||||
/// Sets the channel binding behavior.
|
||||
///
|
||||
/// Defaults to `prefer`.
|
||||
@@ -357,121 +215,6 @@ impl Config {
|
||||
self.channel_binding
|
||||
}
|
||||
|
||||
/// Set replication mode.
|
||||
pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config {
|
||||
self.replication_mode = Some(replication_mode);
|
||||
self
|
||||
}
|
||||
|
||||
/// Get replication mode.
|
||||
pub fn get_replication_mode(&self) -> Option<ReplicationMode> {
|
||||
self.replication_mode
|
||||
}
|
||||
|
||||
/// Set limit for backend messages size.
|
||||
pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config {
|
||||
self.max_backend_message_size = Some(max_backend_message_size);
|
||||
self
|
||||
}
|
||||
|
||||
/// Get limit for backend messages size.
|
||||
pub fn get_max_backend_message_size(&self) -> Option<usize> {
|
||||
self.max_backend_message_size
|
||||
}
|
||||
|
||||
fn param(&mut self, key: &str, value: &str) -> Result<(), Error> {
|
||||
match key {
|
||||
"user" => {
|
||||
self.user(value);
|
||||
}
|
||||
"password" => {
|
||||
self.password(value);
|
||||
}
|
||||
"dbname" => {
|
||||
self.dbname(value);
|
||||
}
|
||||
"options" => {
|
||||
self.options(value);
|
||||
}
|
||||
"application_name" => {
|
||||
self.application_name(value);
|
||||
}
|
||||
"sslmode" => {
|
||||
let mode = match value {
|
||||
"disable" => SslMode::Disable,
|
||||
"prefer" => SslMode::Prefer,
|
||||
"require" => SslMode::Require,
|
||||
_ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))),
|
||||
};
|
||||
self.ssl_mode(mode);
|
||||
}
|
||||
"host" => {
|
||||
for host in value.split(',') {
|
||||
self.host(host);
|
||||
}
|
||||
}
|
||||
"port" => {
|
||||
for port in value.split(',') {
|
||||
let port = if port.is_empty() {
|
||||
5432
|
||||
} else {
|
||||
port.parse()
|
||||
.map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))?
|
||||
};
|
||||
self.port(port);
|
||||
}
|
||||
}
|
||||
"connect_timeout" => {
|
||||
let timeout = value
|
||||
.parse::<i64>()
|
||||
.map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?;
|
||||
if timeout > 0 {
|
||||
self.connect_timeout(Duration::from_secs(timeout as u64));
|
||||
}
|
||||
}
|
||||
"target_session_attrs" => {
|
||||
let target_session_attrs = match value {
|
||||
"any" => TargetSessionAttrs::Any,
|
||||
"read-write" => TargetSessionAttrs::ReadWrite,
|
||||
_ => {
|
||||
return Err(Error::config_parse(Box::new(InvalidValue(
|
||||
"target_session_attrs",
|
||||
))));
|
||||
}
|
||||
};
|
||||
self.target_session_attrs(target_session_attrs);
|
||||
}
|
||||
"channel_binding" => {
|
||||
let channel_binding = match value {
|
||||
"disable" => ChannelBinding::Disable,
|
||||
"prefer" => ChannelBinding::Prefer,
|
||||
"require" => ChannelBinding::Require,
|
||||
_ => {
|
||||
return Err(Error::config_parse(Box::new(InvalidValue(
|
||||
"channel_binding",
|
||||
))))
|
||||
}
|
||||
};
|
||||
self.channel_binding(channel_binding);
|
||||
}
|
||||
"max_backend_message_size" => {
|
||||
let limit = value.parse::<usize>().map_err(|_| {
|
||||
Error::config_parse(Box::new(InvalidValue("max_backend_message_size")))
|
||||
})?;
|
||||
if limit > 0 {
|
||||
self.max_backend_message_size(limit);
|
||||
}
|
||||
}
|
||||
key => {
|
||||
return Err(Error::config_parse(Box::new(UnknownOption(
|
||||
key.to_string(),
|
||||
))));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Opens a connection to a PostgreSQL database.
|
||||
///
|
||||
/// Requires the `runtime` Cargo feature (enabled by default).
|
||||
@@ -485,14 +228,11 @@ impl Config {
|
||||
connect(tls, self).await
|
||||
}
|
||||
|
||||
/// Connects to a PostgreSQL database over an arbitrary stream.
|
||||
///
|
||||
/// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored.
|
||||
pub async fn connect_raw<S, T>(
|
||||
&self,
|
||||
stream: S,
|
||||
tls: T,
|
||||
) -> Result<(Client, Connection<S, T::Stream>), Error>
|
||||
) -> Result<RawConnection<S, T::Stream>, Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsConnect<S>,
|
||||
@@ -501,17 +241,6 @@ impl Config {
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for Config {
    type Err = Error;

    fn from_str(s: &str) -> Result<Config, Error> {
        match UrlParser::parse(s)? {
            Some(config) => Ok(config),
            None => Parser::parse(s),
        }
    }
}
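// A minimal usage sketch of the parsing paths above (illustrative only; the test-module
// layout and the example DSNs are assumptions, not part of this file's API surface).
#[cfg(test)]
mod config_parse_examples {
    use super::Config;

    #[test]
    fn parses_key_value_and_url_forms() {
        // Key/value form, handled by `Parser::parse`; `sslmode` accepts disable/prefer/require
        // and `port` falls back to 5432 when empty.
        let kv: Config = "host=localhost user=postgres dbname=neondb sslmode=prefer"
            .parse()
            .unwrap();

        // URL form, handled by `UrlParser::parse`; user info and query parameters are
        // percent-decoded before being applied via `Config::param`.
        let url: Config = "postgresql://postgres:secret%21@localhost:5432/neondb?application_name=demo"
            .parse()
            .unwrap();

        // The redacting `Debug` impl below still exposes non-secret fields such as `dbname`.
        assert!(format!("{kv:?}").contains("neondb"));
        assert!(format!("{url:?}").contains("neondb"));
    }
}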
|
||||
|
||||
// Omit password from debug output
|
||||
impl fmt::Debug for Config {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
@@ -523,375 +252,13 @@ impl fmt::Debug for Config {
|
||||
}
|
||||
|
||||
f.debug_struct("Config")
|
||||
.field("user", &self.user)
|
||||
.field("password", &self.password.as_ref().map(|_| Redaction {}))
|
||||
.field("dbname", &self.dbname)
|
||||
.field("options", &self.options)
|
||||
.field("application_name", &self.application_name)
|
||||
.field("ssl_mode", &self.ssl_mode)
|
||||
.field("host", &self.host)
|
||||
.field("port", &self.port)
|
||||
.field("connect_timeout", &self.connect_timeout)
|
||||
.field("target_session_attrs", &self.target_session_attrs)
|
||||
.field("channel_binding", &self.channel_binding)
|
||||
.field("replication", &self.replication_mode)
|
||||
.field("server_params", &self.server_params)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct UnknownOption(String);
|
||||
|
||||
impl fmt::Display for UnknownOption {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(fmt, "unknown option `{}`", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for UnknownOption {}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct InvalidValue(&'static str);
|
||||
|
||||
impl fmt::Display for InvalidValue {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(fmt, "invalid value for option `{}`", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for InvalidValue {}
|
||||
|
||||
struct Parser<'a> {
|
||||
s: &'a str,
|
||||
it: iter::Peekable<str::CharIndices<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> Parser<'a> {
|
||||
fn parse(s: &'a str) -> Result<Config, Error> {
|
||||
let mut parser = Parser {
|
||||
s,
|
||||
it: s.char_indices().peekable(),
|
||||
};
|
||||
|
||||
let mut config = Config::new();
|
||||
|
||||
while let Some((key, value)) = parser.parameter()? {
|
||||
config.param(key, &value)?;
|
||||
}
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
fn skip_ws(&mut self) {
|
||||
self.take_while(char::is_whitespace);
|
||||
}
|
||||
|
||||
fn take_while<F>(&mut self, f: F) -> &'a str
|
||||
where
|
||||
F: Fn(char) -> bool,
|
||||
{
|
||||
let start = match self.it.peek() {
|
||||
Some(&(i, _)) => i,
|
||||
None => return "",
|
||||
};
|
||||
|
||||
loop {
|
||||
match self.it.peek() {
|
||||
Some(&(_, c)) if f(c) => {
|
||||
self.it.next();
|
||||
}
|
||||
Some(&(i, _)) => return &self.s[start..i],
|
||||
None => return &self.s[start..],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn eat(&mut self, target: char) -> Result<(), Error> {
|
||||
match self.it.next() {
|
||||
Some((_, c)) if c == target => Ok(()),
|
||||
Some((i, c)) => {
|
||||
let m = format!(
|
||||
"unexpected character at byte {}: expected `{}` but got `{}`",
|
||||
i, target, c
|
||||
);
|
||||
Err(Error::config_parse(m.into()))
|
||||
}
|
||||
None => Err(Error::config_parse("unexpected EOF".into())),
|
||||
}
|
||||
}
|
||||
|
||||
fn eat_if(&mut self, target: char) -> bool {
|
||||
match self.it.peek() {
|
||||
Some(&(_, c)) if c == target => {
|
||||
self.it.next();
|
||||
true
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn keyword(&mut self) -> Option<&'a str> {
|
||||
let s = self.take_while(|c| match c {
|
||||
c if c.is_whitespace() => false,
|
||||
'=' => false,
|
||||
_ => true,
|
||||
});
|
||||
|
||||
if s.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(s)
|
||||
}
|
||||
}
|
||||
|
||||
fn value(&mut self) -> Result<String, Error> {
|
||||
let value = if self.eat_if('\'') {
|
||||
let value = self.quoted_value()?;
|
||||
self.eat('\'')?;
|
||||
value
|
||||
} else {
|
||||
self.simple_value()?
|
||||
};
|
||||
|
||||
Ok(value)
|
||||
}
|
||||
|
||||
fn simple_value(&mut self) -> Result<String, Error> {
|
||||
let mut value = String::new();
|
||||
|
||||
while let Some(&(_, c)) = self.it.peek() {
|
||||
if c.is_whitespace() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.it.next();
|
||||
if c == '\\' {
|
||||
if let Some((_, c2)) = self.it.next() {
|
||||
value.push(c2);
|
||||
}
|
||||
} else {
|
||||
value.push(c);
|
||||
}
|
||||
}
|
||||
|
||||
if value.is_empty() {
|
||||
return Err(Error::config_parse("unexpected EOF".into()));
|
||||
}
|
||||
|
||||
Ok(value)
|
||||
}
|
||||
|
||||
fn quoted_value(&mut self) -> Result<String, Error> {
|
||||
let mut value = String::new();
|
||||
|
||||
while let Some(&(_, c)) = self.it.peek() {
|
||||
if c == '\'' {
|
||||
return Ok(value);
|
||||
}
|
||||
|
||||
self.it.next();
|
||||
if c == '\\' {
|
||||
if let Some((_, c2)) = self.it.next() {
|
||||
value.push(c2);
|
||||
}
|
||||
} else {
|
||||
value.push(c);
|
||||
}
|
||||
}
|
||||
|
||||
Err(Error::config_parse(
|
||||
"unterminated quoted connection parameter value".into(),
|
||||
))
|
||||
}
|
||||
|
||||
fn parameter(&mut self) -> Result<Option<(&'a str, String)>, Error> {
|
||||
self.skip_ws();
|
||||
let keyword = match self.keyword() {
|
||||
Some(keyword) => keyword,
|
||||
None => return Ok(None),
|
||||
};
|
||||
self.skip_ws();
|
||||
self.eat('=')?;
|
||||
self.skip_ws();
|
||||
let value = self.value()?;
|
||||
|
||||
Ok(Some((keyword, value)))
|
||||
}
|
||||
}
|
||||
|
||||
// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict
|
||||
struct UrlParser<'a> {
|
||||
s: &'a str,
|
||||
config: Config,
|
||||
}
|
||||
|
||||
impl<'a> UrlParser<'a> {
|
||||
fn parse(s: &'a str) -> Result<Option<Config>, Error> {
|
||||
let s = match Self::remove_url_prefix(s) {
|
||||
Some(s) => s,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let mut parser = UrlParser {
|
||||
s,
|
||||
config: Config::new(),
|
||||
};
|
||||
|
||||
parser.parse_credentials()?;
|
||||
parser.parse_host()?;
|
||||
parser.parse_path()?;
|
||||
parser.parse_params()?;
|
||||
|
||||
Ok(Some(parser.config))
|
||||
}
|
||||
|
||||
fn remove_url_prefix(s: &str) -> Option<&str> {
|
||||
for prefix in &["postgres://", "postgresql://"] {
|
||||
if let Some(stripped) = s.strip_prefix(prefix) {
|
||||
return Some(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn take_until(&mut self, end: &[char]) -> Option<&'a str> {
|
||||
match self.s.find(end) {
|
||||
Some(pos) => {
|
||||
let (head, tail) = self.s.split_at(pos);
|
||||
self.s = tail;
|
||||
Some(head)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn take_all(&mut self) -> &'a str {
|
||||
mem::take(&mut self.s)
|
||||
}
|
||||
|
||||
fn eat_byte(&mut self) {
|
||||
self.s = &self.s[1..];
|
||||
}
|
||||
|
||||
fn parse_credentials(&mut self) -> Result<(), Error> {
|
||||
let creds = match self.take_until(&['@']) {
|
||||
Some(creds) => creds,
|
||||
None => return Ok(()),
|
||||
};
|
||||
self.eat_byte();
|
||||
|
||||
let mut it = creds.splitn(2, ':');
|
||||
let user = self.decode(it.next().unwrap())?;
|
||||
self.config.user(&user);
|
||||
|
||||
if let Some(password) = it.next() {
|
||||
let password = Cow::from(percent_encoding::percent_decode(password.as_bytes()));
|
||||
self.config.password(password);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_host(&mut self) -> Result<(), Error> {
|
||||
let host = match self.take_until(&['/', '?']) {
|
||||
Some(host) => host,
|
||||
None => self.take_all(),
|
||||
};
|
||||
|
||||
if host.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for chunk in host.split(',') {
|
||||
let (host, port) = if chunk.starts_with('[') {
|
||||
let idx = match chunk.find(']') {
|
||||
Some(idx) => idx,
|
||||
None => return Err(Error::config_parse(InvalidValue("host").into())),
|
||||
};
|
||||
|
||||
let host = &chunk[1..idx];
|
||||
let remaining = &chunk[idx + 1..];
|
||||
let port = if let Some(port) = remaining.strip_prefix(':') {
|
||||
Some(port)
|
||||
} else if remaining.is_empty() {
|
||||
None
|
||||
} else {
|
||||
return Err(Error::config_parse(InvalidValue("host").into()));
|
||||
};
|
||||
|
||||
(host, port)
|
||||
} else {
|
||||
let mut it = chunk.splitn(2, ':');
|
||||
(it.next().unwrap(), it.next())
|
||||
};
|
||||
|
||||
self.host_param(host)?;
|
||||
let port = self.decode(port.unwrap_or("5432"))?;
|
||||
self.config.param("port", &port)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_path(&mut self) -> Result<(), Error> {
|
||||
if !self.s.starts_with('/') {
|
||||
return Ok(());
|
||||
}
|
||||
self.eat_byte();
|
||||
|
||||
let dbname = match self.take_until(&['?']) {
|
||||
Some(dbname) => dbname,
|
||||
None => self.take_all(),
|
||||
};
|
||||
|
||||
if !dbname.is_empty() {
|
||||
self.config.dbname(&self.decode(dbname)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_params(&mut self) -> Result<(), Error> {
|
||||
if !self.s.starts_with('?') {
|
||||
return Ok(());
|
||||
}
|
||||
self.eat_byte();
|
||||
|
||||
while !self.s.is_empty() {
|
||||
let key = match self.take_until(&['=']) {
|
||||
Some(key) => self.decode(key)?,
|
||||
None => return Err(Error::config_parse("unterminated parameter".into())),
|
||||
};
|
||||
self.eat_byte();
|
||||
|
||||
let value = match self.take_until(&['&']) {
|
||||
Some(value) => {
|
||||
self.eat_byte();
|
||||
value
|
||||
}
|
||||
None => self.take_all(),
|
||||
};
|
||||
|
||||
if key == "host" {
|
||||
self.host_param(value)?;
|
||||
} else {
|
||||
let value = self.decode(value)?;
|
||||
self.config.param(&key, &value)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn host_param(&mut self, s: &str) -> Result<(), Error> {
|
||||
let s = self.decode(s)?;
|
||||
self.config.param("host", &s)
|
||||
}
|
||||
|
||||
fn decode(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
|
||||
percent_encoding::percent_decode(s.as_bytes())
|
||||
.decode_utf8()
|
||||
.map_err(|e| Error::config_parse(e.into()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use crate::client::SocketConfig;
|
||||
use crate::config::{Host, TargetSessionAttrs};
|
||||
use crate::codec::BackendMessage;
|
||||
use crate::config::Host;
|
||||
use crate::connect_raw::connect_raw;
|
||||
use crate::connect_socket::connect_socket;
|
||||
use crate::tls::{MakeTlsConnect, TlsConnect};
|
||||
use crate::{Client, Config, Connection, Error, SimpleQueryMessage};
|
||||
use futures_util::{future, pin_mut, Future, FutureExt, Stream};
|
||||
use std::io;
|
||||
use std::task::Poll;
|
||||
use crate::{Client, Config, Connection, Error, RawConnection};
|
||||
use postgres_protocol2::message::backend::Message;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
pub async fn connect<T>(
|
||||
mut tls: T,
|
||||
@@ -16,38 +16,18 @@ pub async fn connect<T>(
|
||||
where
|
||||
T: MakeTlsConnect<TcpStream>,
|
||||
{
|
||||
if config.host.is_empty() {
|
||||
return Err(Error::config("host missing".into()));
|
||||
let hostname = match &config.host {
|
||||
Host::Tcp(host) => host.as_str(),
|
||||
};
|
||||
|
||||
let tls = tls
|
||||
.make_tls_connect(hostname)
|
||||
.map_err(|e| Error::tls(e.into()))?;
|
||||
|
||||
match connect_once(&config.host, config.port, tls, config).await {
|
||||
Ok((client, connection)) => Ok((client, connection)),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
|
||||
if config.port.len() > 1 && config.port.len() != config.host.len() {
|
||||
return Err(Error::config("invalid number of ports".into()));
|
||||
}
|
||||
|
||||
let mut error = None;
|
||||
for (i, host) in config.host.iter().enumerate() {
|
||||
let port = config
|
||||
.port
|
||||
.get(i)
|
||||
.or_else(|| config.port.first())
|
||||
.copied()
|
||||
.unwrap_or(5432);
|
||||
|
||||
let hostname = match host {
|
||||
Host::Tcp(host) => host.as_str(),
|
||||
};
|
||||
|
||||
let tls = tls
|
||||
.make_tls_connect(hostname)
|
||||
.map_err(|e| Error::tls(e.into()))?;
|
||||
|
||||
match connect_once(host, port, tls, config).await {
|
||||
Ok((client, connection)) => return Ok((client, connection)),
|
||||
Err(e) => error = Some(e),
|
||||
}
|
||||
}
|
||||
|
||||
Err(error.unwrap())
|
||||
}
|
||||
|
||||
async fn connect_once<T>(
|
||||
@@ -60,53 +40,36 @@ where
|
||||
T: TlsConnect<TcpStream>,
|
||||
{
|
||||
let socket = connect_socket(host, port, config.connect_timeout).await?;
|
||||
let (mut client, mut connection) = connect_raw(socket, tls, config).await?;
|
||||
let RawConnection {
|
||||
stream,
|
||||
parameters,
|
||||
delayed_notice,
|
||||
process_id,
|
||||
secret_key,
|
||||
} = connect_raw(socket, tls, config).await?;
|
||||
|
||||
if let TargetSessionAttrs::ReadWrite = config.target_session_attrs {
|
||||
let rows = client.simple_query_raw("SHOW transaction_read_only");
|
||||
pin_mut!(rows);
|
||||
|
||||
let rows = future::poll_fn(|cx| {
|
||||
if connection.poll_unpin(cx)?.is_ready() {
|
||||
return Poll::Ready(Err(Error::closed()));
|
||||
}
|
||||
|
||||
rows.as_mut().poll(cx)
|
||||
})
|
||||
.await?;
|
||||
pin_mut!(rows);
|
||||
|
||||
loop {
|
||||
let next = future::poll_fn(|cx| {
|
||||
if connection.poll_unpin(cx)?.is_ready() {
|
||||
return Poll::Ready(Some(Err(Error::closed())));
|
||||
}
|
||||
|
||||
rows.as_mut().poll_next(cx)
|
||||
});
|
||||
|
||||
match next.await.transpose()? {
|
||||
Some(SimpleQueryMessage::Row(row)) => {
|
||||
if row.try_get(0)? == Some("on") {
|
||||
return Err(Error::connect(io::Error::new(
|
||||
io::ErrorKind::PermissionDenied,
|
||||
"database does not allow writes",
|
||||
)));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Some(_) => {}
|
||||
None => return Err(Error::unexpected_message()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
client.set_socket_config(SocketConfig {
|
||||
let socket_config = SocketConfig {
|
||||
host: host.clone(),
|
||||
port,
|
||||
connect_timeout: config.connect_timeout,
|
||||
});
|
||||
};
|
||||
|
||||
let (sender, receiver) = mpsc::unbounded_channel();
|
||||
let client = Client::new(
|
||||
sender,
|
||||
socket_config,
|
||||
config.ssl_mode,
|
||||
process_id,
|
||||
secret_key,
|
||||
);
|
||||
|
||||
// delayed notices are always sent as "Async" messages.
|
||||
let delayed = delayed_notice
|
||||
.into_iter()
|
||||
.map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
|
||||
.collect();
|
||||
|
||||
let connection = Connection::new(stream, delayed, parameters, receiver);
|
||||
|
||||
Ok((client, connection))
|
||||
}
|
||||
|
||||
@@ -1,29 +1,27 @@
|
||||
use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
|
||||
use crate::config::{self, AuthKeys, Config, ReplicationMode};
|
||||
use crate::config::{self, AuthKeys, Config};
|
||||
use crate::connect_tls::connect_tls;
|
||||
use crate::maybe_tls_stream::MaybeTlsStream;
|
||||
use crate::tls::{TlsConnect, TlsStream};
|
||||
use crate::{Client, Connection, Error};
|
||||
use crate::Error;
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt};
|
||||
use postgres_protocol2::authentication;
|
||||
use postgres_protocol2::authentication::sasl;
|
||||
use postgres_protocol2::authentication::sasl::ScramSha256;
|
||||
use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message};
|
||||
use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody};
|
||||
use postgres_protocol2::message::frontend;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_util::codec::Framed;
|
||||
|
||||
pub struct StartupStream<S, T> {
|
||||
inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
buf: BackendMessages,
|
||||
delayed: VecDeque<BackendMessage>,
|
||||
delayed_notice: Vec<NoticeResponseBody>,
|
||||
}
|
||||
|
||||
impl<S, T> Sink<FrontendMessage> for StartupStream<S, T>
|
||||
@@ -78,11 +76,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RawConnection<S, T> {
|
||||
pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
pub parameters: HashMap<String, String>,
|
||||
pub delayed_notice: Vec<NoticeResponseBody>,
|
||||
pub process_id: i32,
|
||||
pub secret_key: i32,
|
||||
}
|
||||
|
||||
pub async fn connect_raw<S, T>(
|
||||
stream: S,
|
||||
tls: T,
|
||||
config: &Config,
|
||||
) -> Result<(Client, Connection<S, T::Stream>), Error>
|
||||
) -> Result<RawConnection<S, T::Stream>, Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsConnect<S>,
|
||||
@@ -90,25 +96,22 @@ where
|
||||
let stream = connect_tls(stream, config.ssl_mode, tls).await?;
|
||||
|
||||
let mut stream = StartupStream {
|
||||
inner: Framed::new(
|
||||
stream,
|
||||
PostgresCodec {
|
||||
max_message_size: config.max_backend_message_size,
|
||||
},
|
||||
),
|
||||
inner: Framed::new(stream, PostgresCodec),
|
||||
buf: BackendMessages::empty(),
|
||||
delayed: VecDeque::new(),
|
||||
delayed_notice: Vec::new(),
|
||||
};
|
||||
|
||||
startup(&mut stream, config).await?;
|
||||
authenticate(&mut stream, config).await?;
|
||||
let (process_id, secret_key, parameters) = read_info(&mut stream).await?;
|
||||
|
||||
let (sender, receiver) = mpsc::unbounded_channel();
|
||||
let client = Client::new(sender, config.ssl_mode, process_id, secret_key);
|
||||
let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver);
|
||||
|
||||
Ok((client, connection))
|
||||
Ok(RawConnection {
|
||||
stream: stream.inner,
|
||||
parameters,
|
||||
delayed_notice: stream.delayed_notice,
|
||||
process_id,
|
||||
secret_key,
|
||||
})
|
||||
}
|
||||
|
||||
async fn startup<S, T>(stream: &mut StartupStream<S, T>, config: &Config) -> Result<(), Error>
|
||||
@@ -116,28 +119,8 @@ where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
let mut params = vec![("client_encoding", "UTF8")];
|
||||
if let Some(user) = &config.user {
|
||||
params.push(("user", &**user));
|
||||
}
|
||||
if let Some(dbname) = &config.dbname {
|
||||
params.push(("database", &**dbname));
|
||||
}
|
||||
if let Some(options) = &config.options {
|
||||
params.push(("options", &**options));
|
||||
}
|
||||
if let Some(application_name) = &config.application_name {
|
||||
params.push(("application_name", &**application_name));
|
||||
}
|
||||
if let Some(replication_mode) = &config.replication_mode {
|
||||
match replication_mode {
|
||||
ReplicationMode::Physical => params.push(("replication", "true")),
|
||||
ReplicationMode::Logical => params.push(("replication", "database")),
|
||||
}
|
||||
}
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
frontend::startup_message(params, &mut buf).map_err(Error::encode)?;
|
||||
frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?;
|
||||
|
||||
stream
|
||||
.send(FrontendMessage::Raw(buf.freeze()))
|
||||
@@ -165,25 +148,11 @@ where
|
||||
|
||||
authenticate_password(stream, pass).await?;
|
||||
}
|
||||
Some(Message::AuthenticationMd5Password(body)) => {
|
||||
can_skip_channel_binding(config)?;
|
||||
|
||||
let user = config
|
||||
.user
|
||||
.as_ref()
|
||||
.ok_or_else(|| Error::config("user missing".into()))?;
|
||||
let pass = config
|
||||
.password
|
||||
.as_ref()
|
||||
.ok_or_else(|| Error::config("password missing".into()))?;
|
||||
|
||||
let output = authentication::md5_hash(user.as_bytes(), pass, body.salt());
|
||||
authenticate_password(stream, output.as_bytes()).await?;
|
||||
}
|
||||
Some(Message::AuthenticationSasl(body)) => {
|
||||
authenticate_sasl(stream, body, config).await?;
|
||||
}
|
||||
Some(Message::AuthenticationKerberosV5)
|
||||
Some(Message::AuthenticationMd5Password)
|
||||
| Some(Message::AuthenticationKerberosV5)
|
||||
| Some(Message::AuthenticationScmCredential)
|
||||
| Some(Message::AuthenticationGss)
|
||||
| Some(Message::AuthenticationSspi) => {
|
||||
@@ -347,9 +316,7 @@ where
|
||||
body.value().map_err(Error::parse)?.to_string(),
|
||||
);
|
||||
}
|
||||
Some(msg @ Message::NoticeResponse(_)) => {
|
||||
stream.delayed.push_back(BackendMessage::Async(msg))
|
||||
}
|
||||
Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body),
|
||||
Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)),
|
||||
Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
|
||||
Some(_) => return Err(Error::unexpected_message()),
|
||||
|
||||
@@ -349,7 +349,6 @@ enum Kind {
|
||||
Parse,
|
||||
Encode,
|
||||
Authentication,
|
||||
ConfigParse,
|
||||
Config,
|
||||
Connect,
|
||||
Timeout,
|
||||
@@ -386,7 +385,6 @@ impl fmt::Display for Error {
|
||||
Kind::Parse => fmt.write_str("error parsing response from server")?,
|
||||
Kind::Encode => fmt.write_str("error encoding message to server")?,
|
||||
Kind::Authentication => fmt.write_str("authentication error")?,
|
||||
Kind::ConfigParse => fmt.write_str("invalid connection string")?,
|
||||
Kind::Config => fmt.write_str("invalid configuration")?,
|
||||
Kind::Connect => fmt.write_str("error connecting to server")?,
|
||||
Kind::Timeout => fmt.write_str("timeout waiting for server")?,
|
||||
@@ -482,10 +480,6 @@ impl Error {
|
||||
Error::new(Kind::Authentication, Some(e))
|
||||
}
|
||||
|
||||
pub(crate) fn config_parse(e: Box<dyn error::Error + Sync + Send>) -> Error {
|
||||
Error::new(Kind::ConfigParse, Some(e))
|
||||
}
|
||||
|
||||
pub(crate) fn config(e: Box<dyn error::Error + Sync + Send>) -> Error {
|
||||
Error::new(Kind::Config, Some(e))
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
//! An asynchronous, pipelined, PostgreSQL client.
|
||||
#![warn(rust_2018_idioms, clippy::all, missing_docs)]
|
||||
#![warn(rust_2018_idioms, clippy::all)]
|
||||
|
||||
pub use crate::cancel_token::CancelToken;
|
||||
pub use crate::client::Client;
|
||||
pub use crate::client::{Client, SocketConfig};
|
||||
pub use crate::config::Config;
|
||||
pub use crate::connect_raw::RawConnection;
|
||||
pub use crate::connection::Connection;
|
||||
use crate::error::DbError;
|
||||
pub use crate::error::Error;
|
||||
@@ -12,14 +13,12 @@ pub use crate::query::RowStream;
|
||||
pub use crate::row::{Row, SimpleQueryRow};
|
||||
pub use crate::simple_query::SimpleQueryStream;
|
||||
pub use crate::statement::{Column, Statement};
|
||||
use crate::tls::MakeTlsConnect;
|
||||
pub use crate::tls::NoTls;
|
||||
pub use crate::to_statement::ToStatement;
|
||||
pub use crate::transaction::Transaction;
|
||||
pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
|
||||
use crate::types::ToSql;
|
||||
use postgres_protocol2::message::backend::ReadyForQueryBody;
|
||||
use tokio::net::TcpStream;
|
||||
|
||||
/// After executing a query, the connection will be in one of these states
|
||||
#[derive(Clone, Copy, Debug, PartialEq)]
|
||||
@@ -71,24 +70,6 @@ mod transaction;
|
||||
mod transaction_builder;
|
||||
pub mod types;
|
||||
|
||||
/// A convenience function which parses a connection string and connects to the database.
|
||||
///
|
||||
/// See the documentation for [`Config`] for details on the connection string format.
|
||||
///
|
||||
/// Requires the `runtime` Cargo feature (enabled by default).
|
||||
///
|
||||
/// [`Config`]: config/struct.Config.html
|
||||
pub async fn connect<T>(
|
||||
config: &str,
|
||||
tls: T,
|
||||
) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
|
||||
where
|
||||
T: MakeTlsConnect<TcpStream>,
|
||||
{
|
||||
let config = config.parse::<Config>()?;
|
||||
config.connect(tls).await
|
||||
}
|
||||
|
||||
/// An asynchronous notification.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Notification {
|
||||
|
||||
@@ -26,6 +26,7 @@ humantime.workspace = true
|
||||
hyper0 = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jemalloc_pprof.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
@@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates heap profiles.
///
/// This only works with jemalloc on Linux.
pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    enum Format {
        Jemalloc,
        Pprof,
    }

    // Parameters.
    let format = match get_query_param(&req, "format")?.as_deref() {
        None => Format::Pprof,
        Some("jemalloc") => Format::Jemalloc,
        Some("pprof") => Format::Pprof,
        Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
    };

    // Obtain profiler handle.
    let mut prof_ctl = jemalloc_pprof::PROF_CTL
        .as_ref()
        .ok_or(ApiError::InternalServerError(anyhow!(
            "heap profiling not enabled"
        )))?
        .lock()
        .await;
    if !prof_ctl.activated() {
        return Err(ApiError::InternalServerError(anyhow!(
            "heap profiling not enabled"
        )));
    }

    // Take and return the profile.
    match format {
        Format::Jemalloc => {
            // NB: file is an open handle to a tempfile that's already deleted.
            let file = tokio::task::spawn_blocking(move || prof_ctl.dump())
                .await
                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
                .map_err(ApiError::InternalServerError)?;
            let stream = ReaderStream::new(tokio::fs::File::from_std(file));
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "application/octet-stream")
                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"")
                .body(Body::wrap_stream(stream))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }

        Format::Pprof => {
            let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
                .await
                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
                .map_err(ApiError::InternalServerError)?;
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "application/octet-stream")
                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
                .body(Body::from(data))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }
    }
}

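// Hypothetical client-side sketch of fetching the profile served by the handler above.
// The listen address/port, the `reqwest` dependency, and the output path are assumptions,
// not part of this patch; `format` accepts "pprof" (default) or "jemalloc" as parsed above.
async fn fetch_heap_profile() -> anyhow::Result<()> {
    let url = "http://127.0.0.1:9898/profile/heap?format=pprof";
    let bytes = reqwest::get(url).await?.error_for_status()?.bytes().await?;
    // The pprof output is a binary protobuf, suitable for `go tool pprof` or similar viewers.
    tokio::fs::write("heap.pb", &bytes).await?;
    Ok(())
}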
|
||||
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
|
||||
@@ -164,6 +164,12 @@ impl TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardNumber {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardSlug<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
|
||||
@@ -1,5 +1,6 @@
pub mod heavier_once_cell;

pub mod duplex;
pub mod gate;

pub mod spsc_fold;

1
libs/utils/src/sync/duplex.rs
Normal file
@@ -0,0 +1 @@
pub mod mpsc;
36
libs/utils/src/sync/duplex/mpsc.rs
Normal file
@@ -0,0 +1,36 @@
use tokio::sync::mpsc;

/// A bi-directional channel.
pub struct Duplex<S, R> {
    pub tx: mpsc::Sender<S>,
    pub rx: mpsc::Receiver<R>,
}

/// Creates a bi-directional channel.
///
/// The channel will buffer up to the provided number of messages. Once the buffer is full,
/// attempts to send new messages will wait until a message is received from the channel.
/// The provided buffer capacity must be at least 1.
pub fn channel<A: Send, B: Send>(buffer: usize) -> (Duplex<A, B>, Duplex<B, A>) {
    let (tx_a, rx_a) = mpsc::channel::<A>(buffer);
    let (tx_b, rx_b) = mpsc::channel::<B>(buffer);

    (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a })
}

impl<S: Send, R: Send> Duplex<S, R> {
    /// Sends a value, waiting until there is capacity.
    ///
    /// A successful send occurs when it is determined that the other end of the channel has not hung up already.
    pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError<S>> {
        self.tx.send(x).await
    }

    /// Receives the next value for this receiver.
    ///
    /// This method returns `None` if the channel has been closed and there are
    /// no remaining messages in the channel's buffer.
    pub async fn recv(&mut self) -> Option<R> {
        self.rx.recv().await
    }
}
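// A minimal usage sketch for the Duplex channel above (illustrative only; assumes a tokio
// runtime with test macros enabled). Side A sends u32 requests, side B replies with Strings.
#[cfg(test)]
mod duplex_example {
    #[tokio::test]
    async fn request_response_roundtrip() {
        let (mut a, mut b) = super::channel::<u32, String>(1);

        tokio::spawn(async move {
            // Echo each request back as a formatted response.
            while let Some(n) = b.recv().await {
                b.send(format!("got {n}")).await.unwrap();
            }
        });

        a.send(42).await.unwrap();
        assert_eq!(a.recv().await.as_deref(), Some("got 42"));
    }
}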
@@ -112,30 +112,38 @@ impl MetadataRecord {
|
||||
};
|
||||
|
||||
// Next, filter the metadata record by shard.
|
||||
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
if let Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
|
||||
) = metadata_record
|
||||
{
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
clear_vm_bits.old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
clear_vm_bits.new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() {
|
||||
metadata_record = None
|
||||
match metadata_record {
|
||||
Some(
|
||||
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
|
||||
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
|
||||
) => {
|
||||
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
|
||||
// of the main relation. These are sharded and managed just like regular relation pages.
|
||||
// See: https://github.com/neondatabase/neon/issues/9855
|
||||
let is_local_vm_page = |heap_blk| {
|
||||
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
|
||||
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
|
||||
};
|
||||
// Send the old and new VM page updates to their respective shards.
|
||||
clear_vm_bits.old_heap_blkno = clear_vm_bits
|
||||
.old_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
clear_vm_bits.new_heap_blkno = clear_vm_bits
|
||||
.new_heap_blkno
|
||||
.filter(|&blkno| is_local_vm_page(blkno));
|
||||
// If neither VM page belongs to this shard, discard the record.
|
||||
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
|
||||
{
|
||||
metadata_record = None
|
||||
}
|
||||
}
|
||||
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
|
||||
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
|
||||
if !shard.is_shard_zero() {
|
||||
metadata_record = None;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(metadata_record)
|
||||
|
||||
@@ -62,10 +62,8 @@ async fn ingest(
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||
let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
||||
let data_ser_size = data.serialized_size().unwrap() as usize;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::{collections::HashMap, error::Error as _};
|
||||
|
||||
use bytes::Bytes;
|
||||
use detach_ancestor::AncestorDetached;
|
||||
@@ -25,10 +25,10 @@ pub struct Client {
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("send request: {0}")]
|
||||
#[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
|
||||
SendRequest(reqwest::Error),
|
||||
|
||||
#[error("receive body: {0}")]
|
||||
#[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
|
||||
ReceiveBody(reqwest::Error),
|
||||
|
||||
#[error("receive error body: {0}")]
|
||||
|
||||
@@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG);
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
const FEATURES: &[&str] = &[
|
||||
@@ -127,6 +132,7 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
|
||||
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
|
||||
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||
@@ -302,7 +308,7 @@ fn start_pageserver(
|
||||
pageserver::metrics::tokio_epoll_uring::Collector::new(),
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
pageserver::preinitialize_metrics(conf);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
@@ -630,45 +636,59 @@ fn start_pageserver(
|
||||
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
|
||||
});
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
let signal_token = CancellationToken::new();
|
||||
let signal_cancel = signal_token.child_token();
|
||||
|
||||
{
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
// Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
|
||||
// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
|
||||
// https://github.com/neondatabase/neon/issues/9740.
|
||||
tokio::spawn(async move {
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
|
||||
std::process::exit(111);
|
||||
|
||||
loop {
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
|
||||
std::process::exit(111);
|
||||
}
|
||||
_ = sigint.recv() => "SIGINT",
|
||||
_ = sigterm.recv() => "SIGTERM",
|
||||
};
|
||||
|
||||
if !signal_token.is_cancelled() {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
|
||||
signal_token.cancel();
|
||||
} else {
|
||||
info!("Got signal {signal}. Already shutting down.");
|
||||
}
|
||||
_ = sigint.recv() => { "SIGINT" },
|
||||
_ = sigterm.recv() => { "SIGTERM" },
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||
// Wait for cancellation signal and shut down the pageserver.
|
||||
//
|
||||
// This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't
|
||||
// reach very far, and `task_mgr` is used instead. The plan is to change that over time.
|
||||
signal_cancel.cancelled().await;
|
||||
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
page_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
background_purges,
|
||||
deletion_queue.clone(),
|
||||
secondary_controller_tasks,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
unreachable!()
|
||||
})
|
||||
}
|
||||
shutdown_pageserver.cancel();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
page_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
background_purges,
|
||||
deletion_queue.clone(),
|
||||
secondary_controller_tasks,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
unreachable!();
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_remote_storage_client(
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::error::Error as _;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
@@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError {
|
||||
|
||||
match self {
|
||||
Rejected(code) => write!(f, "server rejected the metrics with {code}"),
|
||||
Reqwest(e) => write!(f, "request failed: {e}"),
|
||||
Reqwest(e) => write!(
|
||||
f,
|
||||
"request failed: {e}{}",
|
||||
e.source().map(|e| format!(": {e}")).unwrap_or_default()
|
||||
),
|
||||
Cancelled => write!(f, "cancelled"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,8 +91,6 @@
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
pub(crate) mod optional_counter;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Debug)]
|
||||
pub struct RequestContext {
|
||||
@@ -100,7 +98,6 @@ pub struct RequestContext {
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
page_content_kind: PageContentKind,
|
||||
pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
|
||||
}
|
||||
|
||||
/// The kind of access to the page cache.
|
||||
@@ -158,7 +155,6 @@ impl RequestContextBuilder {
|
||||
download_behavior: DownloadBehavior::Download,
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
page_content_kind: PageContentKind::Unknown,
|
||||
micros_spent_throttled: Default::default(),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -172,7 +168,6 @@ impl RequestContextBuilder {
|
||||
download_behavior: original.download_behavior,
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
page_content_kind: original.page_content_kind,
|
||||
micros_spent_throttled: Default::default(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
use std::{
|
||||
sync::atomic::{AtomicU32, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CounterU32 {
|
||||
inner: AtomicU32,
|
||||
}
|
||||
impl Default for CounterU32 {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: AtomicU32::new(u32::MAX),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl CounterU32 {
|
||||
pub fn open(&self) -> Result<(), &'static str> {
|
||||
match self
|
||||
.inner
|
||||
.compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
|
||||
{
|
||||
Ok(_) => Ok(()),
|
||||
Err(_) => Err("open() called on clsoed state"),
|
||||
}
|
||||
}
|
||||
pub fn close(&self) -> Result<u32, &'static str> {
|
||||
match self.inner.swap(u32::MAX, Ordering::Relaxed) {
|
||||
u32::MAX => Err("close() called on closed state"),
|
||||
x => Ok(x),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(&self, count: u32) -> Result<(), &'static str> {
|
||||
if count == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
let mut had_err = None;
|
||||
self.inner
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
|
||||
u32::MAX => {
|
||||
had_err = Some("add() called on closed state");
|
||||
None
|
||||
}
|
||||
x => {
|
||||
let (new, overflowed) = x.overflowing_add(count);
|
||||
if new == u32::MAX || overflowed {
|
||||
had_err = Some("add() overflowed the counter");
|
||||
None
|
||||
} else {
|
||||
Some(new)
|
||||
}
|
||||
}
|
||||
})
|
||||
.map_err(|_| had_err.expect("we set it whenever the function returns None"))
|
||||
.map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
pub struct MicroSecondsCounterU32 {
|
||||
inner: CounterU32,
|
||||
}
|
||||
|
||||
impl MicroSecondsCounterU32 {
|
||||
pub fn open(&self) -> Result<(), &'static str> {
|
||||
self.inner.open()
|
||||
}
|
||||
pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
|
||||
match duration.as_micros().try_into() {
|
||||
Ok(x) => self.inner.add(x),
|
||||
Err(_) => Err("add(): duration conversion error"),
|
||||
}
|
||||
}
|
||||
pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
|
||||
let val = self.inner.close()?;
|
||||
let val = Duration::from_micros(val as u64);
|
||||
let subbed = match from.checked_sub(val) {
|
||||
Some(v) => v,
|
||||
None => return Err("Duration::checked_sub"),
|
||||
};
|
||||
Ok(subbed)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_basic() {
|
||||
let counter = MicroSecondsCounterU32::default();
|
||||
counter.open().unwrap();
|
||||
counter.add(Duration::from_micros(23)).unwrap();
|
||||
let res = counter
|
||||
.close_and_checked_sub_from(Duration::from_micros(42))
|
||||
.unwrap();
|
||||
assert_eq!(res, Duration::from_micros(42 - 23));
|
||||
}
|
||||
}
|
||||
@@ -115,6 +115,10 @@ impl ControllerUpcallClient {
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub(crate) fn base_url(&self) -> &Url {
|
||||
&self.base_url
|
||||
}
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
@@ -191,13 +195,15 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
|
||||
let request = ReAttachRequest {
|
||||
node_id: self.node_id,
|
||||
register,
|
||||
register: register.clone(),
|
||||
};
|
||||
|
||||
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
response.tenants.len()
|
||||
"Received re-attach response with {} tenants (node {}, register: {:?})",
|
||||
response.tenants.len(),
|
||||
self.node_id,
|
||||
register,
|
||||
);
|
||||
|
||||
failpoint_support::sleep_millis_async!("control-plane-client-re-attach");
|
||||
|
||||
@@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::profile_cpu_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::endpoint::{
|
||||
profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
|
||||
};
|
||||
use utils::http::request::must_parse_query_param;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
@@ -87,7 +87,7 @@ use crate::tenant::timeline::offload::offload_timeline;
|
||||
use crate::tenant::timeline::offload::OffloadError;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactOptions;
|
||||
use crate::tenant::timeline::CompactRange;
|
||||
use crate::tenant::timeline::CompactRequest;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
@@ -155,6 +155,7 @@ impl State {
|
||||
"/swagger.yml",
|
||||
"/metrics",
|
||||
"/profile/cpu",
|
||||
"/profile/heap",
|
||||
];
|
||||
Ok(Self {
|
||||
conf,
|
||||
@@ -278,7 +279,10 @@ impl From<TenantStateError> for ApiError {
|
||||
impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()),
|
||||
GetTenantError::ShardNotFound(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {tid}").into())
|
||||
}
|
||||
GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
@@ -386,6 +390,16 @@ impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::secondary::SecondaryTenantError> for ApiError {
|
||||
fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError {
|
||||
use crate::tenant::secondary::SecondaryTenantError;
|
||||
match ste {
|
||||
SecondaryTenantError::GetTenant(gte) => gte.into(),
|
||||
SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
@@ -1046,9 +1060,11 @@ async fn timeline_delete_handler(
|
||||
match e {
|
||||
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
||||
// want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
|
||||
GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
|
||||
"Requested tenant is missing".to_string().into_boxed_str(),
|
||||
),
|
||||
GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => {
|
||||
ApiError::PreconditionFailed(
|
||||
"Requested tenant is missing".to_string().into_boxed_str(),
|
||||
)
|
||||
}
|
||||
e => e.into(),
|
||||
}
|
||||
})?;
|
||||
@@ -1962,6 +1978,26 @@ async fn timeline_gc_handler(
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
}
|
||||
|
||||
// Cancel scheduled compaction tasks
|
||||
async fn timeline_cancel_compact_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.cancel_scheduled_compaction(timeline_id);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
// Run compaction immediately on given timeline.
|
||||
async fn timeline_compact_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -1971,7 +2007,7 @@ async fn timeline_compact_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
|
||||
let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
@@ -1996,22 +2032,50 @@ async fn timeline_compact_handler(
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
let wait_until_scheduled_compaction_done =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
|
||||
.unwrap_or(false);
|
||||
|
||||
let sub_compaction = compact_request
|
||||
.as_ref()
|
||||
.map(|r| r.sub_compaction)
|
||||
.unwrap_or(false);
|
||||
let options = CompactOptions {
|
||||
compact_range,
|
||||
compact_range: compact_request
|
||||
.as_ref()
|
||||
.and_then(|r| r.compact_range.clone()),
|
||||
compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
|
||||
flags,
|
||||
sub_compaction,
|
||||
};
|
||||
|
||||
let scheduled = compact_request
|
||||
.as_ref()
|
||||
.map(|r| r.scheduled)
|
||||
.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
timeline
|
||||
.compact_with_options(&cancel, options, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await
|
||||
// XXX map to correct ApiError for the cases where it's due to shutdown
|
||||
.context("wait completion").map_err(ApiError::InternalServerError)?;
|
||||
if scheduled {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let rx = tenant.schedule_compaction(timeline_id, options).await;
|
||||
if wait_until_scheduled_compaction_done {
|
||||
// It is possible that this will take a long time; dropping the HTTP request will not cancel the compaction.
|
||||
rx.await.ok();
|
||||
}
|
||||
} else {
|
||||
timeline
|
||||
.compact_with_options(&cancel, options, &ctx)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
if wait_until_uploaded {
|
||||
timeline.remote_client.wait_completion().await
|
||||
// XXX map to correct ApiError for the cases where it's due to shutdown
|
||||
.context("wait completion").map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
}
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -2092,16 +2156,20 @@ async fn timeline_checkpoint_handler(
|
||||
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
|
||||
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
|
||||
|
||||
let wait_until_flushed: bool =
|
||||
parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
|
||||
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if wait_until_flushed {
|
||||
timeline.freeze_and_flush().await
|
||||
} else {
|
||||
timeline.freeze().await.and(Ok(()))
|
||||
}.map_err(|e| {
|
||||
match e {
|
||||
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
|
||||
other => ApiError::InternalServerError(other.into()),
|
||||
@@ -2461,8 +2529,7 @@ async fn secondary_upload_handler(
|
||||
state
|
||||
.secondary_controller
|
||||
.upload_tenant(tenant_shard_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -2577,7 +2644,7 @@ async fn secondary_download_handler(
|
||||
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||
// was detached between our check above and executing the download job.
|
||||
Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
|
||||
Ok(Err(e)) => return Err(e.into()),
|
||||
// A timeout is not an error: we have started the download, we're just not done
|
||||
// yet. The caller will get a response body indicating status.
|
||||
Err(_) => StatusCode::ACCEPTED,
|
||||
@@ -3203,6 +3270,7 @@ pub fn make_router(
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
.get("/profile/heap", |r| request_span(r, profile_heap_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||
@@ -3285,6 +3353,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||
|r| api_handler(r, timeline_compact_handler),
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||
|r| api_handler(r, timeline_cancel_compact_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
|
||||
|r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
|
||||
|
||||
@@ -575,18 +575,24 @@ async fn import_file(
|
||||
} else if file_path.starts_with("pg_xact") {
|
||||
let slru = SlruKind::Clog;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported clog slru");
|
||||
if modification.tline.tenant_shard_id.is_shard_zero() {
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported clog slru");
|
||||
}
|
||||
} else if file_path.starts_with("pg_multixact/offsets") {
|
||||
let slru = SlruKind::MultiXactOffsets;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported multixact offsets slru");
|
||||
if modification.tline.tenant_shard_id.is_shard_zero() {
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported multixact offsets slru");
|
||||
}
|
||||
} else if file_path.starts_with("pg_multixact/members") {
|
||||
let slru = SlruKind::MultiXactMembers;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported multixact members slru");
|
||||
if modification.tline.tenant_shard_id.is_shard_zero() {
|
||||
import_slru(modification, slru, file_path, reader, len, ctx).await?;
|
||||
debug!("imported multixact members slru");
|
||||
}
|
||||
} else if file_path.starts_with("pg_twophase") {
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
|
||||
|
||||
@@ -7,6 +7,10 @@ use metrics::{
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
@@ -213,31 +217,16 @@ impl<'a> ScanLatencyOngoingRecording<'a> {
|
||||
ScanLatencyOngoingRecording { parent, start }
|
||||
}
|
||||
|
||||
pub(crate) fn observe(self, throttled: Option<Duration>) {
|
||||
pub(crate) fn observe(self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = if let Some(throttled) = throttled {
|
||||
elapsed.checked_sub(throttled)
|
||||
} else {
|
||||
Some(elapsed)
|
||||
};
|
||||
if let Some(ex_throttled) = ex_throttled {
|
||||
self.parent.observe(ex_throttled.as_secs_f64());
|
||||
} else {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!("error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
self.parent.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
"Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
|
||||
"Time spent in get_vectored.",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
@@ -260,7 +249,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
|
||||
pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_scan_seconds",
|
||||
"Time spent in scan, excluding time spent in timeline_get_throttle.",
|
||||
"Time spent in scan.",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
@@ -475,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_disk_consistent_lsn",
|
||||
"Disk consistent LSN grouped by timeline",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_projected_remote_consistent_lsn",
|
||||
"Projected remote consistent LSN grouped by timeline",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_pitr_history_size",
|
||||
@@ -1216,28 +1223,62 @@ pub(crate) mod virtual_file_io_engine {
|
||||
});
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
global_latency_histo: &'a Histogram,
|
||||
pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
|
||||
pub(crate) struct SmgrOpTimerInner {
|
||||
global_latency_histo: Histogram,
|
||||
|
||||
// Optional because not all op types are tracked per-timeline
|
||||
per_timeline_latency_histo: Option<&'a Histogram>,
|
||||
per_timeline_latency_histo: Option<Histogram>,
|
||||
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
|
||||
start: Instant,
|
||||
throttled: Duration,
|
||||
op: SmgrQueryType,
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(res) => res,
|
||||
Err(error) => {
|
||||
pub(crate) struct SmgrOpFlushInProgress {
|
||||
base: Instant,
|
||||
global_micros: IntCounter,
|
||||
per_timeline_micros: IntCounter,
|
||||
}
|
||||
|
||||
impl SmgrOpTimer {
|
||||
pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
|
||||
let Some(throttle) = throttle else {
|
||||
return;
|
||||
};
|
||||
let inner = self.0.as_mut().expect("other public methods consume self");
|
||||
inner.throttled += *throttle;
|
||||
}
|
||||
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
|
||||
let (flush_start, inner) = self
|
||||
.smgr_op_end()
|
||||
.expect("this method consume self, and the only other caller is drop handler");
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
SmgrOpFlushInProgress {
|
||||
base: flush_start,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `None` if this method has already been called, `Some` otherwise.
|
||||
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
|
||||
let inner = self.0.take()?;
|
||||
|
||||
let now = Instant::now();
|
||||
let elapsed = now - inner.start;
|
||||
|
||||
let elapsed = match elapsed.checked_sub(inner.throttled) {
|
||||
Some(elapsed) => elapsed,
|
||||
None => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
@@ -1246,19 +1287,62 @@ impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[self.op];
|
||||
let rate_limit = &mut guard[inner.op];
|
||||
rate_limit.call(|| {
|
||||
warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
|
||||
});
|
||||
elapsed
|
||||
elapsed // un-throttled time, more info than just saturating to 0
|
||||
}
|
||||
};
|
||||
|
||||
for _ in 0..self.count {
|
||||
self.global_latency_histo
|
||||
.observe(ex_throttled.as_secs_f64());
|
||||
if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
|
||||
per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
|
||||
let elapsed = elapsed.as_secs_f64();
|
||||
|
||||
inner.global_latency_histo.observe(elapsed);
|
||||
if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
|
||||
per_timeline_getpage_histo.observe(elapsed);
|
||||
}
|
||||
|
||||
Some((now, inner))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for SmgrOpTimer {
|
||||
fn drop(&mut self) {
|
||||
self.smgr_op_end();
|
||||
}
|
||||
}
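// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The SmgrOpTimer introduced above records, on completion or on drop, the
// request wall-clock time minus the time spent throttled. A toy timer with the
// same shape (the real type additionally feeds Prometheus histograms and the
// flush-in-progress counters):
use std::time::{Duration, Instant};

struct ToyOpTimer {
    start: Instant,
    throttled: Duration,
}

impl ToyOpTimer {
    fn new(started_at: Instant) -> Self {
        ToyOpTimer { start: started_at, throttled: Duration::ZERO }
    }

    // Accumulate throttle wait so it can be excluded from the observed latency.
    fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
        if let Some(t) = throttle {
            self.throttled += *t;
        }
    }

    // Latency excluding throttling; falls back to the raw elapsed time if the
    // bookkeeping ever claims more throttle time than actually elapsed.
    fn finish(self) -> Duration {
        let elapsed = self.start.elapsed();
        elapsed.checked_sub(self.throttled).unwrap_or(elapsed)
    }
}

fn toy_timer_usage() {
    let mut timer = ToyOpTimer::new(Instant::now());
    timer.deduct_throttle(&Some(Duration::from_millis(2)));
    let _latency = timer.finish(); // would be fed into a latency histogram
}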
|
||||
|
||||
impl SmgrOpFlushInProgress {
|
||||
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let now = Instant::now();
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let elapsed = now - self.base;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.per_timeline_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.base = now;
|
||||
},
|
||||
|mut observe| {
|
||||
observe();
|
||||
},
|
||||
);
|
||||
|
||||
loop {
|
||||
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
|
||||
Ok(v) => return v,
|
||||
Err(_timeout) => {
|
||||
(*observe_guard)();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
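// --- Illustrative sketch (editor's addition, not part of the diff) ---
// `SmgrOpFlushInProgress::measure` above keeps re-arming a 10-second timeout
// around the flush future so that very slow flushes show up in the counter
// while they are still in progress, rather than only after completion. The
// same pattern with a plain atomic counter standing in for the Prometheus
// counters:
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

async fn measure_in_progress<F, O>(micros: &AtomicU64, fut: F) -> O
where
    F: std::future::Future<Output = O>,
{
    let mut fut = std::pin::pin!(fut);
    let mut base = Instant::now();
    loop {
        match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
            // Future finished: account the final slice and return its output.
            Ok(v) => {
                micros.fetch_add(base.elapsed().as_micros() as u64, Ordering::Relaxed);
                return v;
            }
            // Still flushing: publish the elapsed slice and keep waiting.
            Err(_timeout) => {
                micros.fetch_add(base.elapsed().as_micros() as u64, Ordering::Relaxed);
                base = Instant::now();
            }
        }
    }
}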
|
||||
@@ -1289,6 +1373,10 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
per_timeline_getpage_started: IntCounter,
|
||||
per_timeline_getpage_latency: Histogram,
|
||||
global_batch_size: Histogram,
|
||||
per_timeline_batch_size: Histogram,
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -1381,6 +1469,96 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
|
||||
(1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
|
||||
.map(|v| v.into())
|
||||
.collect()
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_page_service_batch_size_global",
|
||||
"Batch size of pageserver page service requests",
|
||||
PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(|| {
|
||||
let mut buckets = Vec::new();
|
||||
for i in 0.. {
|
||||
let bucket = 1 << i;
|
||||
if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
|
||||
break;
|
||||
}
|
||||
buckets.push(bucket.into());
|
||||
}
|
||||
buckets
|
||||
});
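// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The per-timeline batch-size histogram above uses power-of-two buckets up to
// Timeline::MAX_GET_VECTORED_KEYS to keep per-timeline cardinality cheap,
// while the global histogram gets one bucket per possible batch size. Assuming
// a maximum of 32 keys (a hypothetical value; the actual constant lives in the
// pageserver), the bucket construction loop produces:
fn pow2_buckets(max: u32) -> Vec<f64> {
    let mut buckets = Vec::new();
    for i in 0.. {
        let bucket = 1u32 << i;
        if bucket > max {
            break;
        }
        buckets.push(f64::from(bucket));
    }
    buckets
}

#[test]
fn pow2_buckets_for_32_keys() {
    assert_eq!(pow2_buckets(32), vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0]);
}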
|
||||
|
||||
static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_page_service_batch_size",
|
||||
"Batch size of pageserver page service requests",
|
||||
&["tenant_id", "shard_id", "timeline_id"],
|
||||
PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_page_service_config_max_batch_size",
|
||||
"Configured maximum batch size for the server-side batching functionality of page_service. \
|
||||
Labels expose more of the configuration parameters.",
|
||||
&["mode", "execution"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
|
||||
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
|
||||
let (label_values, value) = match conf {
|
||||
PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
|
||||
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
execution,
|
||||
}) => {
|
||||
let mode = "pipelined";
|
||||
let execution = match execution {
|
||||
PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
|
||||
"concurrent-futures"
|
||||
}
|
||||
PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
|
||||
};
|
||||
([mode, execution], max_batch_size.get())
|
||||
}
|
||||
};
|
||||
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
|
||||
.with_label_values(&label_values)
|
||||
.set(value.try_into().unwrap());
|
||||
}
|
||||
|
||||
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_page_service_pagestream_flush_in_progress_micros",
|
||||
"Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
|
||||
If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
|
||||
easily discoverable in monitoring. \
|
||||
Hence, this is NOT a completion latency histogram.",
|
||||
&["tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_page_service_pagestream_flush_in_progress_micros_global",
|
||||
"Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
@@ -1416,78 +1594,65 @@ impl SmgrQueryTimePerTimeline {
|
||||
])
|
||||
.unwrap();
|
||||
|
||||
let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone();
|
||||
let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let global_flush_in_progress_micros =
|
||||
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
|
||||
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
Self {
|
||||
global_started,
|
||||
global_latency,
|
||||
per_timeline_getpage_latency,
|
||||
per_timeline_getpage_started,
|
||||
global_batch_size,
|
||||
per_timeline_batch_size,
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_timer<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + 'a> {
|
||||
self.start_timer_many(op, 1, ctx)
|
||||
}
|
||||
pub(crate) fn start_timer_many<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
count: usize,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + 'a> {
|
||||
let start = Instant::now();
|
||||
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
|
||||
self.global_started[op as usize].inc();
|
||||
|
||||
// We subtract time spent throttled from the observed latency.
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
|
||||
RateLimit::new(Duration::from_secs(10))
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[op];
|
||||
rate_limit.call(|| {
|
||||
warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
|
||||
self.per_timeline_getpage_started.inc();
|
||||
Some(&self.per_timeline_getpage_latency)
|
||||
Some(self.per_timeline_getpage_latency.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Some(GlobalAndPerTimelineHistogramTimer {
|
||||
global_latency_histo: &self.global_latency[op as usize],
|
||||
SmgrOpTimer(Some(SmgrOpTimerInner {
|
||||
global_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_latency_histo,
|
||||
ctx,
|
||||
start,
|
||||
start: started_at,
|
||||
op,
|
||||
count,
|
||||
})
|
||||
throttled: Duration::ZERO,
|
||||
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
|
||||
per_timeline_flush_in_progress_micros: self
|
||||
.per_timeline_flush_in_progress_micros
|
||||
.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
self.global_batch_size.observe(batch_size as f64);
|
||||
self.per_timeline_batch_size.observe(batch_size as f64);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use std::time::Instant;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
@@ -1531,8 +1696,7 @@ mod smgr_query_time_tests {
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
let timer = metrics.start_timer(*op, &ctx);
|
||||
let timer = metrics.start_smgr_op(*op, Instant::now());
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
@@ -1579,58 +1743,24 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> {
|
||||
parent: &'a BasebackupQueryTime,
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
|
||||
pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
BasebackupQueryTimeOngoingRecording {
|
||||
parent: self,
|
||||
ctx,
|
||||
start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
impl BasebackupQueryTimeOngoingRecording<'_> {
|
||||
pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(ex_throttled) => ex_throttled,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
let elapsed = self.start.elapsed().as_secs_f64();
|
||||
// If you want to change the categorization of a specific error, also change it in `log_query_error`.
|
||||
let metric = match res {
|
||||
Ok(_) => &self.parent.ok,
|
||||
@@ -1641,7 +1771,7 @@ impl BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
}
|
||||
Err(_) => &self.parent.error,
|
||||
};
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
metric.observe(elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2181,6 +2311,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_timeline_wal_records_received",
|
||||
"Number of WAL records received per shard",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
@@ -2389,7 +2528,8 @@ pub(crate) struct TimelineMetrics {
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub last_record_lsn_gauge: IntGauge,
|
||||
pub disk_consistent_lsn_gauge: IntGauge,
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
pub(crate) layer_size_image: UIntGauge,
|
||||
@@ -2407,6 +2547,7 @@ pub(crate) struct TimelineMetrics {
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
/// Number of valid LSN leases.
|
||||
pub valid_lsn_lease_count_gauge: UIntGauge,
|
||||
pub wal_records_received: IntCounter,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -2470,7 +2611,11 @@ impl TimelineMetrics {
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
let last_record_lsn_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
@@ -2560,6 +2705,10 @@ impl TimelineMetrics {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -2573,7 +2722,8 @@ impl TimelineMetrics {
|
||||
garbage_collect_histo,
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
last_record_lsn_gauge,
|
||||
disk_consistent_lsn_gauge,
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
layer_size_image,
|
||||
@@ -2591,6 +2741,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
valid_lsn_lease_count_gauge,
|
||||
wal_records_received,
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
}
|
||||
}
|
||||
@@ -2637,6 +2788,7 @@ impl TimelineMetrics {
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
{
|
||||
@@ -2722,6 +2874,21 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2747,10 +2914,12 @@ use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::Timeline;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||
@@ -2793,6 +2962,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
@@ -2807,6 +2977,10 @@ impl RemoteTimelineClientMetrics {
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
|
||||
.unwrap();
|
||||
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id_str,
|
||||
shard_id: shard_id_str,
|
||||
@@ -2815,6 +2989,7 @@ impl RemoteTimelineClientMetrics {
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge,
|
||||
projected_remote_consistent_lsn_gauge,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3028,6 +3203,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
calls,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
projected_remote_consistent_lsn_gauge,
|
||||
} = self;
|
||||
for ((a, b), _) in calls.get_mut().unwrap().drain() {
|
||||
let mut res = [Ok(()), Ok(())];
|
||||
@@ -3057,6 +3233,14 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
let _ = remote_physical_size_gauge; // used to avoid an 'unused' warning in the destructuring above
|
||||
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
{
|
||||
let _ = projected_remote_consistent_lsn_gauge;
|
||||
let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3307,7 +3491,7 @@ pub(crate) mod tenant_throttling {
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::tenant::{self, throttle::Metric};
|
||||
use crate::tenant::{self};
|
||||
|
||||
struct GlobalAndPerTenantIntCounter {
|
||||
global: IntCounter,
|
||||
@@ -3326,7 +3510,7 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct TimelineGet {
|
||||
pub(crate) struct Metrics<const KIND: usize> {
|
||||
count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
wait_time: GlobalAndPerTenantIntCounter,
|
||||
@@ -3399,40 +3583,41 @@ pub(crate) mod tenant_throttling {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
const KIND: &str = "timeline_get";
|
||||
const KINDS: &[&str] = &["pagestream"];
|
||||
pub type Pagestream = Metrics<0>;
|
||||
|
||||
impl TimelineGet {
|
||||
impl<const KIND: usize> Metrics<KIND> {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
|
||||
let per_tenant_label_values = &[
|
||||
KIND,
|
||||
KINDS[KIND],
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
];
|
||||
TimelineGet {
|
||||
Metrics {
|
||||
count_accounted_start: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
|
||||
global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]),
|
||||
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
count_accounted_finish: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
|
||||
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]),
|
||||
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
wait_time: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: WAIT_USECS.with_label_values(&[KIND]),
|
||||
global: WAIT_USECS.with_label_values(&[KINDS[KIND]]),
|
||||
per_tenant: WAIT_USECS_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
},
|
||||
count_throttled: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: WAIT_COUNT.with_label_values(&[KIND]),
|
||||
global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]),
|
||||
per_tenant: WAIT_COUNT_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
}
|
||||
@@ -3455,15 +3640,17 @@ pub(crate) mod tenant_throttling {
|
||||
&WAIT_USECS_PER_TENANT,
|
||||
&WAIT_COUNT_PER_TENANT,
|
||||
] {
|
||||
let _ = m.remove_label_values(&[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]);
|
||||
for kind in KINDS {
|
||||
let _ = m.remove_label_values(&[
|
||||
kind,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Metric for TimelineGet {
|
||||
impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
|
||||
#[inline(always)]
|
||||
fn accounting_start(&self) {
|
||||
self.count_accounted_start.inc();
|
||||
@@ -3562,7 +3749,9 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
|
||||
.set(u64::try_from(num_threads.get()).unwrap());
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
set_page_service_config_max_batch_size(&conf.page_service_pipelining);
|
||||
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
|
||||
@@ -3630,6 +3819,7 @@ pub fn preinitialize_metrics() {
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
||||
&PAGE_SERVICE_BATCH_SIZE_GLOBAL,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|h| {
|
||||
|
||||
@@ -51,7 +51,7 @@ use crate::auth::check_permission;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{self};
|
||||
use crate::metrics::{self, SmgrOpTimer};
|
||||
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -540,11 +540,13 @@ impl From<WaitLsnError> for QueryError {
|
||||
enum BatchedFeMessage {
|
||||
Exists {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamExistsRequest,
|
||||
},
|
||||
Nblocks {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamNblocksRequest,
|
||||
},
|
||||
@@ -552,15 +554,17 @@ enum BatchedFeMessage {
|
||||
span: Span,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
|
||||
},
|
||||
DbSize {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamDbSizeRequest,
|
||||
},
|
||||
GetSlruSegment {
|
||||
span: Span,
|
||||
timer: SmgrOpTimer,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamGetSlruSegmentRequest,
|
||||
},
|
||||
@@ -570,6 +574,41 @@ enum BatchedFeMessage {
|
||||
},
|
||||
}
|
||||
|
||||
impl BatchedFeMessage {
|
||||
async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
|
||||
let (shard, tokens, timers) = match self {
|
||||
BatchedFeMessage::Exists { shard, timer, .. }
|
||||
| BatchedFeMessage::Nblocks { shard, timer, .. }
|
||||
| BatchedFeMessage::DbSize { shard, timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
|
||||
(
|
||||
shard,
|
||||
// 1 token is probably under-estimating because these
|
||||
// request handlers typically do several Timeline::get calls.
|
||||
1,
|
||||
itertools::Either::Left(std::iter::once(timer)),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetPage { shard, pages, .. } => (
|
||||
shard,
|
||||
pages.len(),
|
||||
itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)),
|
||||
),
|
||||
BatchedFeMessage::RespondError { .. } => return Ok(()),
|
||||
};
|
||||
let throttled = tokio::select! {
|
||||
throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
};
|
||||
for timer in timers {
|
||||
timer.deduct_throttle(&throttled);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
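// --- Illustrative sketch (editor's addition, not part of the diff) ---
// `BatchedFeMessage::throttle` above charges the pagestream throttle once per
// batch (1 token for single-request messages, `pages.len()` tokens for a
// getpage batch) and then deducts the same wait from every per-request timer,
// so each request's latency metric excludes the shared throttle wait. The
// bookkeeping, reduced to plain durations on a toy timer:
use std::time::Duration;

#[derive(Clone)]
struct ToyTimer {
    throttled: Duration,
}

fn deduct_batch_throttle(timers: &mut [ToyTimer], throttled: Option<Duration>) {
    let Some(wait) = throttled else {
        // Not throttled at all: nothing to deduct.
        return;
    };
    // Every request in the batch waited for the same throttle interval.
    for timer in timers.iter_mut() {
        timer.throttled += wait;
    }
}

fn batch_throttle_usage() {
    let mut timers = vec![ToyTimer { throttled: Duration::ZERO }; 4];
    deduct_batch_throttle(&mut timers, Some(Duration::from_millis(3)));
}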
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
@@ -632,6 +671,8 @@ impl PageServerHandler {
|
||||
msg = pgb.read_message() => { msg }
|
||||
};
|
||||
|
||||
let received_at = Instant::now();
|
||||
|
||||
let copy_data_bytes = match msg? {
|
||||
Some(FeMessage::CopyData(bytes)) => bytes,
|
||||
Some(FeMessage::Terminate) => {
|
||||
@@ -660,7 +701,15 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
BatchedFeMessage::Exists { span, shard, req }
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
@@ -668,7 +717,15 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
BatchedFeMessage::Nblocks { span, shard, req }
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
|
||||
BatchedFeMessage::Nblocks {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
@@ -676,7 +733,15 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
BatchedFeMessage::DbSize { span, shard, req }
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
|
||||
BatchedFeMessage::DbSize {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
@@ -684,7 +749,15 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
BatchedFeMessage::GetSlruSegment { span, shard, req }
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
request_lsn,
|
||||
@@ -728,6 +801,14 @@ impl PageServerHandler {
|
||||
return respond_error!(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
// It's important to start the timer before waiting for the LSN
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for LSNs.
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);
|
||||
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
request_lsn,
|
||||
@@ -747,7 +828,7 @@ impl PageServerHandler {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![(rel, blkno)],
|
||||
pages: smallvec::smallvec![(rel, blkno, timer)],
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -832,90 +913,112 @@ impl PageServerHandler {
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
// invoke handler function
|
||||
let (handler_results, span): (Vec<Result<PagestreamBeMessage, PageStreamError>>, _) =
|
||||
match batch {
|
||||
BatchedFeMessage::Exists { span, shard, req } => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
(
|
||||
vec![
|
||||
self.handle_get_rel_exists_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::Nblocks { span, shard, req } => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
(
|
||||
vec![
|
||||
self.handle_get_nblocks_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetPage {
|
||||
let (handler_results, span): (
|
||||
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
|
||||
_,
|
||||
) = match batch {
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_rel_exists_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
(
|
||||
{
|
||||
let npages = pages.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::DbSize { span, shard, req } => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
(
|
||||
vec![
|
||||
self.handle_db_size_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetSlruSegment { span, shard, req } => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
(
|
||||
vec![
|
||||
self.handle_get_slru_segment_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await,
|
||||
],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::RespondError { span, error } => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
(vec![Err(error)], span)
|
||||
}
|
||||
};
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::Nblocks {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_nblocks_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
(
|
||||
{
|
||||
let npages = pages.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::DbSize {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
(
|
||||
vec![self
|
||||
.handle_db_size_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
span,
|
||||
timer,
|
||||
shard,
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
(
|
||||
vec![self
|
||||
.handle_get_slru_segment_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::RespondError { span, error } => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
(vec![Err(error)], span)
|
||||
}
|
||||
};
|
||||
|
||||
// Map handler result to protocol behavior.
|
||||
// Some handler errors cause exit from pagestream protocol.
|
||||
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
|
||||
for handler_result in handler_results {
|
||||
let response_msg = match handler_result {
|
||||
let (response_msg, timer) = match handler_result {
|
||||
Err(e) => match &e {
|
||||
PageStreamError::Shutdown => {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
@@ -939,27 +1042,65 @@ impl PageServerHandler {
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
(
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
}),
|
||||
None, // TODO: measure errors
|
||||
)
|
||||
}
|
||||
},
|
||||
Ok(response_msg) => response_msg,
|
||||
Ok((response_msg, timer)) => (response_msg, Some(timer)),
|
||||
};
|
||||
|
||||
//
|
||||
// marshal & transmit response message
|
||||
//
|
||||
|
||||
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
}
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
res = pgb_writer.flush() => {
|
||||
res?;
|
||||
|
||||
// We purposefully don't count flush time into the timer.
|
||||
//
|
||||
// The reason is that the current compute client will not perform protocol processing
|
||||
// if the postgres backend process is doing things other than `->smgr_read()`.
|
||||
// This is especially the case for prefetch.
|
||||
//
|
||||
// If the compute doesn't read from the connection, eventually TCP will backpressure
|
||||
// all the way into our flush call below.
|
||||
//
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer =
|
||||
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
|
||||
|
||||
// what we want to do
|
||||
let flush_fut = pgb_writer.flush();
|
||||
// metric for how long flushing takes
|
||||
let flush_fut = match flushing_timer {
|
||||
Some(flushing_timer) => {
|
||||
futures::future::Either::Left(flushing_timer.measure(flush_fut))
|
||||
}
|
||||
None => futures::future::Either::Right(flush_fut),
|
||||
};
|
||||
// do it while respecting cancellation
|
||||
let _: () = async move {
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
res = flush_fut => {
|
||||
res?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
// and log the info! line inside the request span
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
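// --- Illustrative sketch (editor's addition, not part of the diff) ---
// The flush path above conditionally wraps the flush future in a measuring
// wrapper; `futures::future::Either` lets the "measured" and "plain" branches
// unify into one future type without boxing. A minimal version, with a
// hypothetical `record_micros` callback standing in for the flush-in-progress
// counters (which the real code updates periodically while the flush runs):
use std::time::Instant;

async fn flush_maybe_measured<F, R>(flush: F, record_micros: Option<R>) -> F::Output
where
    F: std::future::Future,
    R: FnOnce(u64),
{
    let fut = match record_micros {
        Some(record) => futures::future::Either::Left(async move {
            let start = Instant::now();
            let out = flush.await;
            // Report how long the flush took once it completes.
            record(start.elapsed().as_micros() as u64);
            out
        }),
        None => futures::future::Either::Right(flush),
    };
    fut.await
}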
|
||||
@@ -1081,13 +1222,18 @@ impl PageServerHandler {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => break e,
|
||||
};
|
||||
let msg = match msg {
|
||||
let mut msg = match msg {
|
||||
Some(msg) => msg,
|
||||
None => {
|
||||
debug!("pagestream subprotocol end observed");
|
||||
return ((pgb_reader, timeline_handles), Ok(()));
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(cancelled) = msg.throttle(&self.cancel).await {
|
||||
break cancelled;
|
||||
}
|
||||
|
||||
let err = self
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
|
||||
.await;
|
||||
@@ -1245,12 +1391,13 @@ impl PageServerHandler {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let batch = match batch {
|
||||
let mut batch = match batch {
|
||||
Ok(batch) => batch,
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
batch.throttle(&self.cancel).await?;
|
||||
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
@@ -1423,10 +1570,6 @@ impl PageServerHandler {
|
||||
req: &PagestreamExistsRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
@@ -1453,10 +1596,6 @@ impl PageServerHandler {
|
||||
req: &PagestreamNblocksRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
@@ -1483,10 +1622,6 @@ impl PageServerHandler {
|
||||
req: &PagestreamDbSizeRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
@@ -1512,26 +1647,41 @@ impl PageServerHandler {
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
effective_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
|
||||
requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<PagestreamBeMessage, PageStreamError>> {
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let _timer = timeline.query_metrics.start_timer_many(
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
pages.len(),
|
||||
ctx,
|
||||
);
|
||||
|
||||
let pages = timeline
|
||||
.get_rel_page_at_lsn_batched(pages, effective_lsn, ctx)
|
||||
timeline
|
||||
.query_metrics
|
||||
.observe_getpage_batch_start(requests.len());
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)),
|
||||
effective_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(results.len(), requests.len());
|
||||
|
||||
Vec::from_iter(pages.into_iter().map(|page| {
|
||||
page.map(|page| {
|
||||
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
|
||||
})
|
||||
.map_err(PageStreamError::from)
|
||||
}))
|
||||
// TODO: avoid creating the new Vec here
|
||||
Vec::from_iter(
|
||||
requests
|
||||
.into_iter()
|
||||
.zip(results.into_iter())
|
||||
.map(|((_, _, timer), res)| {
|
||||
res.map(|page| {
|
||||
(
|
||||
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
|
||||
page,
|
||||
}),
|
||||
timer,
|
||||
)
|
||||
})
|
||||
.map_err(PageStreamError::from)
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
@@ -1541,10 +1691,6 @@ impl PageServerHandler {
|
||||
req: &PagestreamGetSlruSegmentRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
@@ -2045,7 +2191,7 @@ where
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
|
||||
@@ -203,9 +203,13 @@ impl Timeline {
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
match version {
|
||||
Version::Lsn(effective_lsn) => {
|
||||
let pages = smallvec::smallvec![(tag, blknum)];
|
||||
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
|
||||
let res = self
|
||||
.get_rel_page_at_lsn_batched(pages, effective_lsn, ctx)
|
||||
.get_rel_page_at_lsn_batched(
|
||||
pages.iter().map(|(tag, blknum)| (tag, blknum)),
|
||||
effective_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(res.len(), 1);
|
||||
res.into_iter().next().unwrap()
|
||||
@@ -240,7 +244,7 @@ impl Timeline {
|
||||
/// The ordering of the returned vec corresponds to the ordering of `pages`.
|
||||
pub(crate) async fn get_rel_page_at_lsn_batched(
|
||||
&self,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
|
||||
effective_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<Bytes, PageReconstructError>> {
|
||||
@@ -254,7 +258,7 @@ impl Timeline {
|
||||
let result_slots = result.spare_capacity_mut();
|
||||
|
||||
let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[usize; 1]>> = BTreeMap::default();
|
||||
for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() {
|
||||
for (response_slot_idx, (tag, blknum)) in pages.enumerate() {
|
||||
if tag.relnode == 0 {
|
||||
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
@@ -265,7 +269,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let nblocks = match self
|
||||
.get_rel_size(tag, Version::Lsn(effective_lsn), ctx)
|
||||
.get_rel_size(*tag, Version::Lsn(effective_lsn), ctx)
|
||||
.await
|
||||
{
|
||||
Ok(nblocks) => nblocks,
|
||||
@@ -276,7 +280,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
if blknum >= nblocks {
|
||||
if *blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, effective_lsn, nblocks
|
||||
@@ -286,7 +290,7 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
let key = rel_block_to_key(*tag, *blknum);
|
||||
|
||||
let key_slots = keys_slots.entry(key).or_default();
|
||||
key_slots.push(response_slot_idx);
|
||||
@@ -526,6 +530,7 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
let n_blocks = self
|
||||
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
@@ -548,6 +553,7 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn, ctx).await
|
||||
}
|
||||
@@ -560,6 +566,7 @@ impl Timeline {
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
Ok(buf.get_u32_le())
|
||||
@@ -573,6 +580,7 @@ impl Timeline {
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
@@ -1043,26 +1051,28 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Iterate SLRUs next
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactMembers,
|
||||
SlruKind::MultiXactOffsets,
|
||||
] {
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
result.add_key(slrudir_key);
|
||||
let buf = self.get(slrudir_key, lsn, ctx).await?;
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||
segments.sort_unstable();
|
||||
for segno in segments {
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get(segsize_key, lsn, ctx).await?;
|
||||
let segsize = buf.get_u32_le();
|
||||
if self.tenant_shard_id.is_shard_zero() {
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactMembers,
|
||||
SlruKind::MultiXactOffsets,
|
||||
] {
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
result.add_key(slrudir_key);
|
||||
let buf = self.get(slrudir_key, lsn, ctx).await?;
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||
segments.sort_unstable();
|
||||
for segno in segments {
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get(segsize_key, lsn, ctx).await?;
|
||||
let segsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(
|
||||
slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize),
|
||||
);
|
||||
result.add_key(segsize_key);
|
||||
result.add_range(
|
||||
slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize),
|
||||
);
|
||||
result.add_key(segsize_key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1464,6 +1474,10 @@ impl<'a> DatadirModification<'a> {
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
if !self.tline.tenant_shard_id.is_shard_zero() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.put(
|
||||
slru_block_to_key(kind, segno, blknum),
|
||||
Value::WalRecord(rec),
|
||||
@@ -1497,6 +1511,8 @@ impl<'a> DatadirModification<'a> {
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
@@ -1538,6 +1554,7 @@ impl<'a> DatadirModification<'a> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
@@ -1849,6 +1866,8 @@ impl<'a> DatadirModification<'a> {
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Add it to the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -1881,6 +1900,8 @@ impl<'a> DatadirModification<'a> {
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Put size
|
||||
let size_key = slru_segment_size_to_key(kind, segno);
|
||||
let buf = nblocks.to_le_bytes();
|
||||
|
||||
@@ -37,14 +37,19 @@ use remote_timeline_client::manifest::{
|
||||
};
|
||||
use remote_timeline_client::UploadQueueNotReadyError;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::ScheduledCompactionTask;
|
||||
use timeline::import_pgdata;
|
||||
use timeline::offload::offload_timeline;
|
||||
use timeline::CompactFlags;
|
||||
use timeline::CompactOptions;
|
||||
use timeline::CompactionError;
|
||||
use timeline::ShutdownMode;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
@@ -339,6 +344,11 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
    /// a manual gc-compaction from the manual compaction API.
    scheduled_compaction_tasks:
        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,

    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
    /// background warmup.
@@ -357,8 +367,8 @@ pub struct Tenant {

    /// Throttle applied at the top of [`Timeline::get`].
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) timeline_get_throttle:
        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
    pub(crate) pagestream_throttle:
        Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,

    /// An ongoing timeline detach concurrency limiter.
    ///
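As a reading aid only, a minimal standalone sketch of the queue discipline the `scheduled_compaction_tasks` field implies (simplified stand-in types, not the pageserver ones); the real push/pop sites appear in the compaction loop and in `schedule_compaction` further down in this patch:

// Minimal sketch: one FIFO of scheduled compaction jobs per timeline, behind a std Mutex.
use std::collections::{HashMap, VecDeque};
use std::sync::Mutex;

type TimelineId = u64; // stand-in for the real TimelineId
struct ScheduledJob;   // stand-in for ScheduledCompactionTask

struct Queues(Mutex<HashMap<TimelineId, VecDeque<ScheduledJob>>>);

impl Queues {
    fn schedule(&self, timeline: TimelineId, job: ScheduledJob) {
        self.0.lock().unwrap().entry(timeline).or_default().push_back(job);
    }
    // The compaction iteration pops at most one job per pass, front first.
    fn next(&self, timeline: TimelineId) -> Option<ScheduledJob> {
        self.0.lock().unwrap().get_mut(&timeline)?.pop_front()
    }
}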
|
||||
@@ -1678,7 +1688,7 @@ impl Tenant {
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
timeline_get_throttle: self.timeline_get_throttle.clone(),
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
LoadTimelineCause::Attach,
|
||||
@@ -2953,27 +2963,100 @@ impl Tenant {
|
||||
|
||||
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
|
||||
{
|
||||
// pending_task_left == None: cannot compact, maybe still pending tasks
|
||||
// pending_task_left == Some(true): compaction task left
|
||||
// pending_task_left == Some(false): no compaction task left
|
||||
let pending_task_left = if *can_compact {
|
||||
Some(
|
||||
timeline
|
||||
.compact(cancel, EnumSet::empty(), ctx)
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.await
|
||||
.inspect_err(|e| match e {
|
||||
timeline::CompactionError::ShuttingDown => (),
|
||||
timeline::CompactionError::Offload(_) => {
|
||||
// Failures to offload timelines do not trip the circuit breaker, because
|
||||
// they do not do lots of writes the way compaction itself does: it is cheap
|
||||
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
|
||||
let has_pending_l0_compaction_task = timeline
|
||||
.compact(cancel, EnumSet::empty(), ctx)
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.await
|
||||
.inspect_err(|e| match e {
|
||||
timeline::CompactionError::ShuttingDown => (),
|
||||
timeline::CompactionError::Offload(_) => {
|
||||
// Failures to offload timelines do not trip the circuit breaker, because
|
||||
// they do not do lots of writes the way compaction itself does: it is cheap
|
||||
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
|
||||
}
|
||||
timeline::CompactionError::Other(e) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
|
||||
}
|
||||
})?;
|
||||
if has_pending_l0_compaction_task {
|
||||
Some(true)
|
||||
} else {
|
||||
let mut has_pending_scheduled_compaction_task;
|
||||
let next_scheduled_compaction_task = {
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
|
||||
if !tline_pending_tasks.is_empty() {
|
||||
info!(
|
||||
"{} tasks left in the compaction schedule queue",
|
||||
tline_pending_tasks.len()
|
||||
);
|
||||
}
|
||||
timeline::CompactionError::Other(e) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
|
||||
let next_task = tline_pending_tasks.pop_front();
|
||||
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
|
||||
next_task
|
||||
} else {
|
||||
has_pending_scheduled_compaction_task = false;
|
||||
None
|
||||
}
|
||||
};
|
||||
if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
|
||||
{
|
||||
if !next_scheduled_compaction_task
|
||||
.options
|
||||
.flags
|
||||
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
|
||||
{
|
||||
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
|
||||
} else if next_scheduled_compaction_task.options.sub_compaction {
|
||||
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs = timeline
|
||||
.gc_compaction_split_jobs(next_scheduled_compaction_task.options)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
if jobs.is_empty() {
|
||||
info!("no jobs to run, skipping scheduled compaction task");
|
||||
} else {
|
||||
has_pending_scheduled_compaction_task = true;
|
||||
let jobs_len = jobs.len();
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
tline_pending_tasks.push_back(ScheduledCompactionTask {
|
||||
options: job,
|
||||
result_tx: if idx == jobs_len - 1 {
|
||||
// The last compaction job sends the completion signal
|
||||
next_scheduled_compaction_task.result_tx.take()
|
||||
} else {
|
||||
None
|
||||
},
|
||||
});
|
||||
}
|
||||
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
|
||||
}
|
||||
})?,
|
||||
)
|
||||
} else {
|
||||
let _ = timeline
|
||||
.compact_with_options(
|
||||
cancel,
|
||||
next_scheduled_compaction_task.options,
|
||||
ctx,
|
||||
)
|
||||
.instrument(info_span!("scheduled_compact_timeline", %timeline_id))
|
||||
.await?;
|
||||
if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
|
||||
// TODO: we can send compaction statistics in the future
|
||||
tx.send(()).ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(has_pending_scheduled_compaction_task)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -2993,6 +3076,36 @@ impl Tenant {
        Ok(has_pending_task)
    }

    /// Cancel scheduled compaction tasks
    pub(crate) fn cancel_scheduled_compaction(
        &self,
        timeline_id: TimelineId,
    ) -> Vec<ScheduledCompactionTask> {
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
            current_tline_pending_tasks.into_iter().collect()
        } else {
            Vec::new()
        }
    }

    /// Schedule a compaction task for a timeline.
    pub(crate) async fn schedule_compaction(
        &self,
        timeline_id: TimelineId,
        options: CompactOptions,
    ) -> tokio::sync::oneshot::Receiver<()> {
        let (tx, rx) = tokio::sync::oneshot::channel();
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
        let tline_pending_tasks = guard.entry(timeline_id).or_default();
        tline_pending_tasks.push_back(ScheduledCompactionTask {
            options,
            result_tx: Some(tx),
        });
        rx
    }

    // Call through to all timelines to freeze ephemeral layers if needed. Usually
    // this happens during ingest: this background housekeeping is for freezing layers
    // that are open but haven't been written to for some time.
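A hedged sketch of how a caller might drive `cancel_scheduled_compaction`/`schedule_compaction` above; the handler shape and the choice of flags are assumptions, only `schedule_compaction`, `CompactOptions`, and `CompactFlags::EnhancedGcBottomMostCompaction` come from this patch:

// Hypothetical caller (e.g. a manual-compaction HTTP handler), not code from the patch.
async fn trigger_scheduled_gc_compaction(
    tenant: &Tenant,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    let mut flags = EnumSet::new();
    // The compaction loop only accepts scheduled tasks carrying this flag.
    flags.insert(CompactFlags::EnhancedGcBottomMostCompaction);
    let options = CompactOptions {
        flags,
        compact_range: None,
        compact_below_lsn: None,
        sub_compaction: true,
    };
    // The receiver resolves once the last job of the scheduled work sends ().
    let done = tenant.schedule_compaction(timeline_id, options).await;
    done.await?;
    Ok(())
}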
|
||||
@@ -3422,7 +3535,7 @@ impl Tenant {
|
||||
r.map_err(
|
||||
|_e: tokio::sync::watch::error::RecvError|
|
||||
// Tenant existed but was dropped: report it as non-existent
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
|
||||
GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id))
|
||||
)?
|
||||
}
|
||||
Err(TimeoutCancellableError::Cancelled) => {
|
||||
@@ -3835,7 +3948,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_timeline_get_throttle_config(
|
||||
fn get_pagestream_throttle_config(
|
||||
psconf: &'static PageServerConf,
|
||||
overrides: &TenantConfOpt,
|
||||
) -> throttle::Config {
|
||||
@@ -3846,8 +3959,8 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
|
||||
self.timeline_get_throttle.reconfigure(conf)
|
||||
let conf = Self::get_pagestream_throttle_config(self.conf, new_conf);
|
||||
self.pagestream_throttle.reconfigure(conf)
|
||||
}
|
||||
|
||||
/// Helper function to create a new Timeline struct.
|
||||
@@ -4005,13 +4118,14 @@ impl Tenant {
|
||||
// use an extremely long backoff.
|
||||
Some(Duration::from_secs(3600 * 24)),
|
||||
)),
|
||||
scheduled_compaction_tasks: Mutex::new(Default::default()),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
|
||||
cancel: CancellationToken::default(),
|
||||
gate: Gate::default(),
|
||||
timeline_get_throttle: Arc::new(throttle::Throttle::new(
|
||||
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
|
||||
pagestream_throttle: Arc::new(throttle::Throttle::new(
|
||||
Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
|
||||
)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
@@ -4909,7 +5023,7 @@ impl Tenant {
|
||||
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
|
||||
TimelineResources {
|
||||
remote_client: self.build_timeline_remote_client(timeline_id),
|
||||
timeline_get_throttle: self.timeline_get_throttle.clone(),
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
}
|
||||
}
|
||||
@@ -9163,6 +9277,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: dryrun_flags,
|
||||
compact_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -9399,6 +9514,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: dryrun_flags,
|
||||
compact_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -9885,7 +10001,15 @@ mod tests {
|
||||
|
||||
// Do a partial compaction on key range 0..2
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(0)..get_key(2)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
@@ -9924,7 +10048,15 @@ mod tests {
|
||||
|
||||
// Do a partial compaction on key range 2..4
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(2)..get_key(4)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
@@ -9968,7 +10100,15 @@ mod tests {
|
||||
|
||||
// Do a partial compaction on key range 4..9
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(4)..get_key(9)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
@@ -10011,7 +10151,15 @@ mod tests {
|
||||
|
||||
// Do a partial compaction on key range 9..10
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(9)..get_key(10)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
@@ -10059,7 +10207,15 @@ mod tests {
|
||||
|
||||
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(0)..get_key(10)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
|
||||
@@ -8,10 +8,8 @@ use crate::page_cache;
|
||||
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
|
||||
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::owned_buffers_io::write::Buffer;
|
||||
use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use num_traits::Num;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -20,6 +18,7 @@ use tracing::error;
|
||||
|
||||
use std::io;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::Arc;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
pub struct EphemeralFile {
@@ -27,10 +26,7 @@ pub struct EphemeralFile {
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
    bytes_written: u64,
    buffered_writer: owned_buffers_io::write::BufferedWriter<
        BytesMut,
        size_tracking_writer::Writer<VirtualFile>,
    >,
    buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
    _gate_guard: utils::sync::gate::GateGuard,
}
|
||||
@@ -42,9 +38,9 @@ impl EphemeralFile {
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
) -> anyhow::Result<EphemeralFile> {
|
||||
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
|
||||
let filename_disambiguator =
|
||||
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
@@ -55,15 +51,17 @@ impl EphemeralFile {
|
||||
"ephemeral-{filename_disambiguator}"
|
||||
)));
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
virtual_file::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let file = Arc::new(
|
||||
VirtualFile::open_with_options_v2(
|
||||
&filename,
|
||||
virtual_file::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
|
||||
|
||||
@@ -73,10 +71,12 @@ impl EphemeralFile {
|
||||
page_cache_file_id,
|
||||
bytes_written: 0,
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
|
||||
size_tracking_writer::Writer::new(file),
|
||||
BytesMut::with_capacity(TAIL_SZ),
|
||||
file,
|
||||
|| IoBufferMut::with_capacity(TAIL_SZ),
|
||||
gate.enter()?,
|
||||
ctx,
|
||||
),
|
||||
_gate_guard: gate_guard,
|
||||
_gate_guard: gate.enter()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -85,7 +85,7 @@ impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let path = self.buffered_writer.as_inner().as_inner().path();
|
||||
let path = self.buffered_writer.as_inner().path();
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
@@ -132,6 +132,18 @@ impl EphemeralFile {
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<u64> {
|
||||
let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?;
|
||||
if let Some(control) = control {
|
||||
control.release().await;
|
||||
}
|
||||
Ok(pos)
|
||||
}
|
||||
|
||||
async fn write_raw_controlled(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(u64, Option<owned_buffers_io::write::FlushControl>)> {
|
||||
let pos = self.bytes_written;
|
||||
|
||||
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
|
||||
@@ -145,9 +157,9 @@ impl EphemeralFile {
|
||||
})?;
|
||||
|
||||
// Write the payload
|
||||
let nwritten = self
|
||||
let (nwritten, control) = self
|
||||
.buffered_writer
|
||||
.write_buffered_borrowed(srcbuf, ctx)
|
||||
.write_buffered_borrowed_controlled(srcbuf, ctx)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
nwritten,
|
||||
@@ -157,7 +169,7 @@ impl EphemeralFile {
|
||||
|
||||
self.bytes_written = new_bytes_written;
|
||||
|
||||
Ok(pos)
|
||||
Ok((pos, control))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
|
||||
dst: tokio_epoll_uring::Slice<B>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
|
||||
let file_size_tracking_writer = self.buffered_writer.as_inner();
|
||||
let flushed_offset = file_size_tracking_writer.bytes_written();
|
||||
let submitted_offset = self.buffered_writer.bytes_submitted();
|
||||
|
||||
let buffer = self.buffered_writer.inspect_buffer();
|
||||
let buffered = &buffer[0..buffer.pending()];
|
||||
let mutable = self.buffered_writer.inspect_mutable();
|
||||
let mutable = &mutable[0..mutable.pending()];
|
||||
|
||||
let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
|
||||
|
||||
let dst_cap = dst.bytes_total().into_u64();
|
||||
let end = {
|
||||
@@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
|
||||
}
|
||||
}
|
||||
}
|
||||
let written_range = Range(start, std::cmp::min(end, flushed_offset));
|
||||
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
|
||||
|
||||
let (written_range, maybe_flushed_range) = {
|
||||
if maybe_flushed.is_some() {
|
||||
// [ written ][ maybe_flushed ][ mutable ]
|
||||
// <- TAIL_SZ -><- TAIL_SZ ->
|
||||
// ^
|
||||
// `submitted_offset`
|
||||
// <++++++ on disk +++++++????????????????>
|
||||
(
|
||||
Range(
|
||||
start,
|
||||
std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)),
|
||||
),
|
||||
Range(
|
||||
std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)),
|
||||
std::cmp::min(end, submitted_offset),
|
||||
),
|
||||
)
|
||||
} else {
|
||||
// [ written ][ mutable ]
|
||||
// <- TAIL_SZ ->
|
||||
// ^
|
||||
// `submitted_offset`
|
||||
// <++++++ on disk +++++++++++++++++++++++>
|
||||
(
|
||||
Range(start, std::cmp::min(end, submitted_offset)),
|
||||
// zero len
|
||||
Range(submitted_offset, u64::MIN),
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let mutable_range = Range(std::cmp::max(start, submitted_offset), end);
|
||||
|
||||
let dst = if written_range.len() > 0 {
|
||||
let file: &VirtualFile = file_size_tracking_writer.as_inner();
|
||||
let file: &VirtualFile = self.buffered_writer.as_inner();
|
||||
let bounds = dst.bounds();
|
||||
let slice = file
|
||||
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
|
||||
@@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
|
||||
dst
|
||||
};
|
||||
|
||||
let dst = if buffered_range.len() > 0 {
|
||||
let offset_in_buffer = buffered_range
|
||||
let dst = if maybe_flushed_range.len() > 0 {
|
||||
let offset_in_buffer = maybe_flushed_range
|
||||
.0
|
||||
.checked_sub(flushed_offset)
|
||||
.checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64))
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let to_copy =
|
||||
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
|
||||
// Checked previously the buffer is Some.
|
||||
let maybe_flushed = maybe_flushed.unwrap();
|
||||
let to_copy = &maybe_flushed
|
||||
[offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())];
|
||||
let bounds = dst.bounds();
|
||||
let mut view = dst.slice({
|
||||
let start = written_range.len().into_usize();
|
||||
let end = start
|
||||
.checked_add(buffered_range.len().into_usize())
|
||||
.checked_add(maybe_flushed_range.len().into_usize())
|
||||
.unwrap();
|
||||
start..end
|
||||
});
|
||||
@@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
|
||||
dst
|
||||
};
|
||||
|
||||
let dst = if mutable_range.len() > 0 {
|
||||
let offset_in_buffer = mutable_range
|
||||
.0
|
||||
.checked_sub(submitted_offset)
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let to_copy =
|
||||
&mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
|
||||
let bounds = dst.bounds();
|
||||
let mut view = dst.slice({
|
||||
let start =
|
||||
written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
|
||||
let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
|
||||
start..end
|
||||
});
|
||||
view.as_mut_rust_slice_full_zeroed()
|
||||
.copy_from_slice(to_copy);
|
||||
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
|
||||
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
|
||||
|
||||
Ok((dst, (end - start).into_usize()))
|
||||
@@ -295,7 +363,7 @@ mod tests {
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -326,14 +394,15 @@ mod tests {
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
let mutable = file.buffered_writer.inspect_mutable();
|
||||
let cap = mutable.capacity();
|
||||
let align = mutable.align();
|
||||
|
||||
let write_nbytes = cap + cap / 2;
|
||||
let write_nbytes = cap * 2 + cap / 2;
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
@@ -341,30 +410,39 @@ mod tests {
|
||||
.collect();
|
||||
|
||||
let mut value_offsets = Vec::new();
|
||||
for i in 0..write_nbytes {
|
||||
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
|
||||
for range in (0..write_nbytes)
|
||||
.step_by(align)
|
||||
.map(|start| start..(start + align).min(write_nbytes))
|
||||
{
|
||||
let off = file.write_raw(&content[range], &ctx).await.unwrap();
|
||||
value_offsets.push(off);
|
||||
}
|
||||
|
||||
assert!(file.len() as usize == write_nbytes);
|
||||
for i in 0..write_nbytes {
|
||||
assert_eq!(value_offsets[i], i.into_u64());
|
||||
let buf = IoBufferMut::with_capacity(1);
|
||||
assert_eq!(file.len() as usize, write_nbytes);
|
||||
for (i, range) in (0..write_nbytes)
|
||||
.step_by(align)
|
||||
.map(|start| start..(start + align).min(write_nbytes))
|
||||
.enumerate()
|
||||
{
|
||||
assert_eq!(value_offsets[i], range.start.into_u64());
|
||||
let buf = IoBufferMut::with_capacity(range.len());
|
||||
let (buf_slice, nread) = file
|
||||
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
|
||||
.read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let buf = buf_slice.into_inner();
|
||||
assert_eq!(nread, 1);
|
||||
assert_eq!(&buf, &content[i..i + 1]);
|
||||
assert_eq!(nread, range.len());
|
||||
assert_eq!(&buf, &content[range]);
|
||||
}
|
||||
|
||||
let file_contents =
|
||||
std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
|
||||
assert_eq!(file_contents, &content[0..cap]);
|
||||
let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
|
||||
assert!(file_contents == content[0..cap * 2]);
|
||||
|
||||
let buffer_contents = file.buffered_writer.inspect_buffer();
|
||||
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
|
||||
let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
|
||||
assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);
|
||||
|
||||
let mutable_buffer_contents = file.buffered_writer.inspect_mutable();
|
||||
assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -373,16 +451,16 @@ mod tests {
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
// mutable buffer and maybe_flushed buffer each has `cap` bytes.
|
||||
let cap = file.buffered_writer.inspect_mutable().capacity();
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap + cap / 2)
|
||||
.take(cap * 2 + cap / 2)
|
||||
.collect();
|
||||
|
||||
file.write_raw(&content, &ctx).await.unwrap();
|
||||
@@ -390,23 +468,21 @@ mod tests {
|
||||
// assert the state is as this test expects it to be
|
||||
assert_eq!(
|
||||
&file.load_to_io_buf(&ctx).await.unwrap(),
|
||||
&content[0..cap + cap / 2]
|
||||
&content[0..cap * 2 + cap / 2]
|
||||
);
|
||||
let md = file
|
||||
.buffered_writer
|
||||
.as_inner()
|
||||
.as_inner()
|
||||
.path()
|
||||
.metadata()
|
||||
.unwrap();
|
||||
let md = file.buffered_writer.as_inner().path().metadata().unwrap();
|
||||
assert_eq!(
|
||||
md.len(),
|
||||
cap.into_u64(),
|
||||
"buffered writer does one write if we write 1.5x buffer capacity"
|
||||
2 * cap.into_u64(),
|
||||
"buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
|
||||
);
|
||||
assert_eq!(
|
||||
&file.buffered_writer.inspect_buffer()[0..cap / 2],
|
||||
&content[cap..cap + cap / 2]
|
||||
&file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
|
||||
&content[cap..cap * 2]
|
||||
);
|
||||
assert_eq!(
|
||||
&file.buffered_writer.inspect_mutable()[0..cap / 2],
|
||||
&content[cap * 2..cap * 2 + cap / 2]
|
||||
);
|
||||
}
|
||||
|
||||
@@ -422,19 +498,19 @@ mod tests {
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mutable = file.buffered_writer.inspect_mutable();
|
||||
let cap = mutable.capacity();
|
||||
let align = mutable.align();
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap + cap / 2)
|
||||
.take(cap * 2 + cap / 2)
|
||||
.collect();
|
||||
|
||||
file.write_raw(&content, &ctx).await.unwrap();
|
||||
let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap();
|
||||
|
||||
let test_read = |start: usize, len: usize| {
|
||||
let file = &file;
|
||||
@@ -454,16 +530,38 @@ mod tests {
|
||||
}
|
||||
};
|
||||
|
||||
let test_read_all_offset_combinations = || {
|
||||
async move {
|
||||
test_read(align, align).await;
|
||||
// border onto edge of file
|
||||
test_read(cap - align, align).await;
|
||||
// read across file and buffer
|
||||
test_read(cap - align, 2 * align).await;
|
||||
// stay from start of maybe flushed buffer
|
||||
test_read(cap, align).await;
|
||||
// completely within maybe flushed buffer
|
||||
test_read(cap + align, align).await;
|
||||
// border onto edge of maybe flushed buffer.
|
||||
test_read(cap * 2 - align, align).await;
|
||||
// read across maybe flushed and mutable buffer
|
||||
test_read(cap * 2 - align, 2 * align).await;
|
||||
// read across three segments
|
||||
test_read(cap - align, cap + 2 * align).await;
|
||||
// completely within mutable buffer
|
||||
test_read(cap * 2 + align, align).await;
|
||||
}
|
||||
};
|
||||
|
||||
// completely within the file range
|
||||
assert!(20 < cap, "test assumption");
|
||||
test_read(10, 10).await;
|
||||
// border onto edge of file
|
||||
test_read(cap - 10, 10).await;
|
||||
// read across file and buffer
|
||||
test_read(cap - 10, 20).await;
|
||||
// stay from start of buffer
|
||||
test_read(cap, 10).await;
|
||||
// completely within buffer
|
||||
test_read(cap + 10, 10).await;
|
||||
assert!(align < cap, "test assumption");
|
||||
assert!(cap % align == 0);
|
||||
|
||||
// test reads at different flush stages.
|
||||
let not_started = control.unwrap().into_not_started();
|
||||
test_read_all_offset_combinations().await;
|
||||
let in_progress = not_started.ready_to_flush();
|
||||
test_read_all_offset_combinations().await;
|
||||
in_progress.wait_until_flush_is_done().await;
|
||||
test_read_all_offset_combinations().await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -347,7 +347,7 @@ async fn init_load_generations(
|
||||
);
|
||||
emergency_generations(tenant_confs)
|
||||
} else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
info!("Calling {} API to re-attach tenants", client.base_url());
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
Ok(tenants) => tenants
|
||||
@@ -894,7 +894,7 @@ impl TenantManager {
|
||||
Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
Err(GetTenantError::ShardNotFound(tenant_shard_id))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError {
    #[error("Tenant {0} not found")]
    NotFound(TenantId),

    #[error("Tenant {0} not found")]
    ShardNotFound(TenantShardId),

    #[error("Tenant {0} is not active")]
    NotActive(TenantShardId),
|
||||
|
||||
|
||||
@@ -681,6 +681,7 @@ impl RemoteTimelineClient {
|
||||
layer_file_name: &LayerName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
@@ -700,6 +701,7 @@ impl RemoteTimelineClient {
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
local_path,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
@@ -2190,6 +2192,9 @@ impl RemoteTimelineClient {
|
||||
upload_queue.clean.1 = Some(task.task_id);
|
||||
|
||||
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
|
||||
self.metrics
|
||||
.projected_remote_consistent_lsn_gauge
|
||||
.set(lsn.0);
|
||||
|
||||
if self.generation.is_none() {
|
||||
// Legacy mode: skip validating generation
|
||||
@@ -2564,9 +2569,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}

/// Given the key of a tenant manifest, parse out the generation number
pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
    static RE: OnceLock<Regex> = OnceLock::new();
    let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap());
    let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap());
    re.captures(path.get_path().as_str())
        .and_then(|c| c.get(1))
        .and_then(|m| Generation::parse_suffix(m.as_str()))
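The functional change here is only the `.+` to `.*` loosening; a standalone check of what that buys, using the plain `regex` crate with the pageserver types omitted:

// Standalone illustration of the regex change above.
use regex::Regex;

fn main() {
    let old = Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap();
    let new = Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap();

    // A key with a path prefix matches either way.
    assert!(old.is_match("tenants/x/tenant-manifest-00000001.json"));
    assert!(new.is_match("tenants/x/tenant-manifest-00000001.json"));

    // A bare file name only matches after the `.+` -> `.*` change.
    assert!(!old.is_match("tenant-manifest-00000001.json"));
    assert!(new.is_match("tenant-manifest-00000001.json"));
}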
|
||||
|
||||
@@ -26,8 +26,6 @@ use crate::span::{
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::Generation;
|
||||
#[cfg_attr(target_os = "macos", allow(unused_imports))]
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{
|
||||
@@ -60,6 +58,7 @@ pub async fn download_layer_file<'a>(
|
||||
layer_file_name: &'a LayerName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
local_path: &Utf8Path,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
@@ -88,7 +87,9 @@ pub async fn download_layer_file<'a>(
|
||||
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
|
||||
|| async {
|
||||
download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
@@ -148,6 +149,7 @@ async fn download_object<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
|
||||
cancel: &CancellationToken,
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
|
||||
) -> Result<u64, DownloadError> {
|
||||
@@ -205,13 +207,18 @@ async fn download_object<'a>(
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
|
||||
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
|
||||
use bytes::BytesMut;
|
||||
use crate::virtual_file::owned_buffers_io;
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use std::sync::Arc;
|
||||
async {
|
||||
let destination_file = VirtualFile::create(dst_path, ctx)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
let destination_file = Arc::new(
|
||||
VirtualFile::create(dst_path, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("create a destination file for layer '{dst_path}'")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let mut download = storage
|
||||
.download(src_path, &DownloadOpts::default(), cancel)
|
||||
@@ -219,14 +226,16 @@ async fn download_object<'a>(
|
||||
|
||||
pausable_failpoint!("before-downloading-layer-stream-pausable");
|
||||
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
|
||||
destination_file,
|
||||
|| IoBufferMut::with_capacity(super::BUFFER_SIZE),
|
||||
gate.enter().map_err(|_| DownloadError::Cancelled)?,
|
||||
ctx,
|
||||
);
|
||||
|
||||
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
|
||||
// There's chunks_vectored() on the stream.
|
||||
let (bytes_amount, destination_file) = async {
|
||||
let size_tracking = size_tracking_writer::Writer::new(destination_file);
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
|
||||
size_tracking,
|
||||
BytesMut::with_capacity(super::BUFFER_SIZE),
|
||||
);
|
||||
while let Some(res) =
|
||||
futures::StreamExt::next(&mut download.download_stream).await
|
||||
{
|
||||
@@ -234,10 +243,10 @@ async fn download_object<'a>(
|
||||
Ok(chunk) => chunk,
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
buffered.write_buffered(chunk.slice_len(), ctx).await?;
|
||||
buffered.write_buffered_borrowed(&chunk, ctx).await?;
|
||||
}
|
||||
let size_tracking = buffered.flush_and_into_inner(ctx).await?;
|
||||
Ok(size_tracking.into_inner())
|
||||
let inner = buffered.flush_and_into_inner(ctx).await?;
|
||||
Ok(inner)
|
||||
}
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ impl TenantManifest {
|
||||
offloaded_timelines: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
|
||||
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
|
||||
serde_json::from_slice::<Self>(bytes)
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ use super::{
|
||||
mgr::TenantManager,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::LayerName,
|
||||
GetTenantError,
|
||||
};
|
||||
|
||||
use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
|
||||
@@ -66,7 +67,21 @@ struct CommandRequest<T> {
|
||||
}
|
||||
|
||||
struct CommandResponse {
|
||||
result: anyhow::Result<()>,
|
||||
result: Result<(), SecondaryTenantError>,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum SecondaryTenantError {
|
||||
#[error("{0}")]
|
||||
GetTenant(GetTenantError),
|
||||
#[error("shutting down")]
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
impl From<GetTenantError> for SecondaryTenantError {
|
||||
fn from(gte: GetTenantError) -> Self {
|
||||
Self::GetTenant(gte)
|
||||
}
|
||||
}
|
||||
|
||||
// Whereas [`Tenant`] represents an attached tenant, this type represents the work
|
||||
@@ -285,7 +300,7 @@ impl SecondaryController {
|
||||
&self,
|
||||
queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
|
||||
payload: T,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), SecondaryTenantError> {
|
||||
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
queue
|
||||
@@ -294,20 +309,26 @@ impl SecondaryController {
|
||||
response_tx,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
|
||||
.map_err(|_| SecondaryTenantError::ShuttingDown)?;
|
||||
|
||||
let response = response_rx
|
||||
.await
|
||||
.map_err(|_| anyhow::anyhow!("Request dropped"))?;
|
||||
.map_err(|_| SecondaryTenantError::ShuttingDown)?;
|
||||
|
||||
response.result
|
||||
}
|
||||
|
||||
pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
pub(crate) async fn upload_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), SecondaryTenantError> {
|
||||
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
|
||||
.await
|
||||
}
|
||||
pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
pub(crate) async fn download_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), SecondaryTenantError> {
|
||||
self.dispatch(
|
||||
&self.download_req_tx,
|
||||
DownloadCommand::Download(tenant_shard_id),
|
||||
|
||||
@@ -35,7 +35,7 @@ use super::{
|
||||
self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult,
|
||||
TenantBackgroundJobs,
|
||||
},
|
||||
SecondaryTenant,
|
||||
GetTenantError, SecondaryTenant, SecondaryTenantError,
|
||||
};
|
||||
|
||||
use crate::tenant::{
|
||||
@@ -470,15 +470,16 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
result
|
||||
}
|
||||
|
||||
fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
|
||||
fn on_command(
|
||||
&mut self,
|
||||
command: DownloadCommand,
|
||||
) -> Result<PendingDownload, SecondaryTenantError> {
|
||||
let tenant_shard_id = command.get_tenant_shard_id();
|
||||
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(*tenant_shard_id);
|
||||
let Some(tenant) = tenant else {
|
||||
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
|
||||
};
|
||||
.get_secondary_tenant_shard(*tenant_shard_id)
|
||||
.ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?;
|
||||
|
||||
Ok(PendingDownload {
|
||||
target_time: None,
|
||||
@@ -1182,6 +1183,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
&layer.name,
|
||||
&layer.metadata,
|
||||
&local_path,
|
||||
&self.secondary_state.gate,
|
||||
&self.secondary_state.cancel,
|
||||
ctx,
|
||||
)
|
||||
|
||||
@@ -28,7 +28,7 @@ use super::{
|
||||
self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult,
|
||||
TenantBackgroundJobs,
|
||||
},
|
||||
CommandRequest, UploadCommand,
|
||||
CommandRequest, SecondaryTenantError, UploadCommand,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
@@ -279,7 +279,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||
}
|
||||
|
||||
fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
|
||||
fn on_command(
|
||||
&mut self,
|
||||
command: UploadCommand,
|
||||
) -> Result<UploadPending, SecondaryTenantError> {
|
||||
let tenant_shard_id = command.get_tenant_shard_id();
|
||||
|
||||
tracing::info!(
|
||||
@@ -287,8 +290,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
"Starting heatmap write on command");
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(*tenant_shard_id)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
.get_attached_tenant_shard(*tenant_shard_id)?;
|
||||
if !tenant.is_active() {
|
||||
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{completion::Barrier, yielding_loop::yielding_loop};
|
||||
|
||||
use super::{CommandRequest, CommandResponse};
|
||||
use super::{CommandRequest, CommandResponse, SecondaryTenantError};
|
||||
|
||||
/// Scheduling interval is the time between calls to JobGenerator::schedule.
|
||||
/// When we schedule jobs, the job generator may provide a hint of its preferred
|
||||
@@ -112,7 +112,7 @@ where
|
||||
|
||||
/// Called when a command is received. A job will be spawned immediately if the return
|
||||
/// value is Some, ignoring concurrency limits and the pending queue.
|
||||
fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
|
||||
fn on_command(&mut self, cmd: CMD) -> Result<PJ, SecondaryTenantError>;
|
||||
}
|
||||
|
||||
/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
|
||||
|
||||
@@ -555,13 +555,12 @@ impl InMemoryLayer {
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_lsn: Lsn,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file =
|
||||
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?;
|
||||
let key = InMemoryLayerFileId(file.page_cache_file_id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
|
||||
@@ -1149,6 +1149,7 @@ impl LayerInner {
|
||||
&self.desc.layer_name(),
|
||||
&self.metadata(),
|
||||
&self.path,
|
||||
&timeline.gate,
|
||||
&timeline.cancel,
|
||||
ctx,
|
||||
)
|
||||
|
||||
@@ -471,14 +471,14 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
|
||||
|
||||
// TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
|
||||
// Or just spawn another background loop for this throttle, it's not like it's super costly.
|
||||
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
|
||||
let now = Instant::now();
|
||||
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
|
||||
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
|
||||
let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
|
||||
if count_throttled == 0 {
|
||||
return;
|
||||
}
|
||||
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
|
||||
let allowed_rps = tenant.pagestream_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
info!(
|
||||
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
|
||||
|
||||
@@ -1,19 +1,14 @@
|
||||
use std::{
|
||||
str::FromStr,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc, Mutex,
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use enumset::EnumSet;
|
||||
use tracing::{error, warn};
|
||||
use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
|
||||
|
||||
use crate::{context::RequestContext, task_mgr::TaskKind};
|
||||
|
||||
/// Throttle for `async` functions.
|
||||
///
|
||||
/// Runtime reconfigurable.
|
||||
@@ -35,7 +30,7 @@ pub struct Throttle<M: Metric> {
|
||||
}
|
||||
|
||||
pub struct Inner {
|
||||
task_kinds: EnumSet<TaskKind>,
|
||||
enabled: bool,
|
||||
rate_limiter: Arc<RateLimiter>,
|
||||
}
|
||||
|
||||
@@ -79,26 +74,12 @@ where
|
||||
}
|
||||
fn new_inner(config: Config) -> Inner {
|
||||
let Config {
|
||||
task_kinds,
|
||||
enabled,
|
||||
initial,
|
||||
refill_interval,
|
||||
refill_amount,
|
||||
max,
|
||||
} = config;
|
||||
let task_kinds: EnumSet<TaskKind> = task_kinds
|
||||
.iter()
|
||||
.filter_map(|s| match TaskKind::from_str(s) {
|
||||
Ok(v) => Some(v),
|
||||
Err(e) => {
|
||||
// TODO: avoid this failure mode
|
||||
error!(
|
||||
"cannot parse task kind, ignoring for rate limiting {}",
|
||||
utils::error::report_compact_sources(&e)
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// steady rate, we expect `refill_amount` requests per `refill_interval`.
|
||||
// dividing gives us the rps.
|
||||
@@ -112,7 +93,7 @@ where
|
||||
let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens));
|
||||
|
||||
Inner {
|
||||
task_kinds,
|
||||
enabled: enabled.is_enabled(),
|
||||
rate_limiter: Arc::new(rate_limiter),
|
||||
}
|
||||
}
|
||||
@@ -141,11 +122,13 @@ where
|
||||
self.inner.load().rate_limiter.steady_rps()
|
||||
}
|
||||
|
||||
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
|
||||
pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
if !inner.task_kinds.contains(ctx.task_kind()) {
|
||||
|
||||
if !inner.enabled {
|
||||
return None;
|
||||
};
|
||||
}
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
self.metric.accounting_start();
|
||||
@@ -162,19 +145,6 @@ where
|
||||
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
|
||||
let observation = Observation { wait_time };
|
||||
self.metric.observe_throttling(&observation);
|
||||
match ctx.micros_spent_throttled.add(wait_time) {
|
||||
Ok(res) => res,
|
||||
Err(error) => {
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::rate_limit::RateLimit;
|
||||
static WARN_RATE_LIMIT: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut guard = WARN_RATE_LIMIT.lock().unwrap();
|
||||
guard.call(move || {
|
||||
warn!(error, "error adding time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
Some(wait_time)
|
||||
} else {
|
||||
None
|
||||
|
||||
@@ -53,7 +53,7 @@ use utils::{
|
||||
postgres_client::PostgresClientProtocol,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
@@ -208,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub timeline_get_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
pub pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -411,9 +411,9 @@ pub struct Timeline {
|
||||
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
|
||||
gc_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
/// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
|
||||
timeline_get_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
|
||||
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
|
||||
Background,
|
||||
}
|
||||
|
||||
#[derive(enumset::EnumSetType)]
|
||||
#[derive(Debug, enumset::EnumSetType)]
|
||||
pub(crate) enum CompactFlags {
|
||||
ForceRepartition,
|
||||
ForceImageLayerCreation,
|
||||
@@ -777,6 +777,19 @@ pub(crate) enum CompactFlags {
|
||||
DryRun,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRequest {
    pub compact_range: Option<CompactRange>,
    pub compact_below_lsn: Option<Lsn>,
    /// Whether the compaction job should be scheduled.
    #[serde(default)]
    pub scheduled: bool,
    /// Whether the compaction job should be split across key ranges.
    #[serde(default)]
    pub sub_compaction: bool,
}

#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRange {
@@ -786,10 +799,27 @@ pub(crate) struct CompactRange {
    pub end: Key,
}

#[derive(Clone, Default)]
impl From<Range<Key>> for CompactRange {
    fn from(range: Range<Key>) -> Self {
        CompactRange {
            start: range.start,
            end: range.end,
        }
    }
}

#[derive(Debug, Clone, Default)]
pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
    /// If set, the compaction will only compact the key range specified by this option.
    /// This option is only used by GC compaction.
    pub compact_range: Option<CompactRange>,
    /// If set, the compaction will only compact the LSN below this value.
    /// This option is only used by GC compaction.
    pub compact_below_lsn: Option<Lsn>,
    /// Enable sub-compaction (split compaction job across key ranges).
    /// This option is only used by GC compaction.
    pub sub_compaction: bool,
}
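The tests earlier in this patch exercise the `From<Range<Key>>` impl above in exactly this shape; a condensed, non-self-contained sketch (`tline`, `cancel`, `get_key`, and `ctx` belong to that test harness and are assumed here):

// Condensed from the partial-compaction tests above.
let options = CompactOptions {
    flags: EnumSet::new(),
    // Range<Key> -> CompactRange via the From impl above.
    compact_range: Some((get_key(0)..get_key(2)).into()),
    ..Default::default()
};
tline.compact_with_gc(&cancel, options, &ctx).await.unwrap();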
|
||||
|
||||
impl std::fmt::Debug for Timeline {
|
||||
@@ -949,7 +979,7 @@ impl Timeline {
|
||||
/// If a remote layer file is needed, it is downloaded as part of this
|
||||
/// call.
|
||||
///
|
||||
/// This method enforces [`Self::timeline_get_throttle`] internally.
|
||||
/// This method enforces [`Self::pagestream_throttle`] internally.
|
||||
///
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The
|
||||
/// abstraction above this needs to store suitable metadata to track what
|
||||
@@ -977,8 +1007,6 @@ impl Timeline {
|
||||
// page_service.
|
||||
debug_assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: vec![key..key.next()],
|
||||
};
|
||||
@@ -1058,13 +1086,6 @@ impl Timeline {
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(|metric| (metric, Instant::now()));
|
||||
|
||||
// start counting after throttle so that throttle time
|
||||
// is always less than observation time
|
||||
let throttled = self
|
||||
.timeline_get_throttle
|
||||
.throttle(ctx, key_count as usize)
|
||||
.await;
|
||||
|
||||
let res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
@@ -1076,23 +1097,7 @@ impl Timeline {
|
||||
|
||||
if let Some((metric, start)) = start {
|
||||
let elapsed = start.elapsed();
|
||||
let ex_throttled = if let Some(throttled) = throttled {
|
||||
elapsed.checked_sub(throttled)
|
||||
} else {
|
||||
Some(elapsed)
|
||||
};
|
||||
|
||||
if let Some(ex_throttled) = ex_throttled {
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
} else {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!("error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
metric.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
|
||||
res
|
||||
@@ -1137,14 +1142,6 @@ impl Timeline {
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(ScanLatencyOngoingRecording::start_recording);
|
||||
|
||||
// start counting after throttle so that throttle time
|
||||
// is always less than observation time
|
||||
let throttled = self
|
||||
.timeline_get_throttle
|
||||
// assume scan = 1 quota for now until we find a better way to process this
|
||||
.throttle(ctx, 1)
|
||||
.await;
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
@@ -1155,7 +1152,7 @@ impl Timeline {
|
||||
.await;
|
||||
|
||||
if let Some(recording) = start {
|
||||
recording.observe(throttled);
|
||||
recording.observe();
|
||||
}
|
||||
|
||||
vectored_res
|
||||
@@ -1466,23 +1463,31 @@ impl Timeline {
|
||||
Ok(lease)
|
||||
}
|
||||
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
|
||||
/// Returns the flush request ID which can be awaited with wait_flush_completion().
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
|
||||
self.freeze0().await
|
||||
}
|
||||
|
||||
/// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
|
||||
self.freeze_and_flush0().await
|
||||
}
|
||||
|
||||
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
|
||||
/// Returns the flush request ID which can be awaited with wait_flush_completion().
|
||||
pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
|
||||
let mut g = self.write_lock.lock().await;
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn, &mut g).await
|
||||
}
|
||||
|
||||
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
|
||||
// polluting the span hierarchy.
|
||||
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
|
||||
let token = {
|
||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||
// iteration.
|
||||
let mut g = self.write_lock.lock().await;
|
||||
|
||||
let to_lsn = self.get_last_record_lsn();
|
||||
self.freeze_inmem_layer_at(to_lsn, &mut g).await?
|
||||
};
|
||||
let token = self.freeze0().await?;
|
||||
self.wait_flush_completion(token).await
|
||||
}
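
A hedged sketch of why `freeze0` returns a flush request ID instead of waiting itself: the caller can freeze now and await durability later. This assumes an async context inside the crate with a compatible error type; it is not part of the diff.

    // Sketch only: freeze the open in-memory layer now, wait for the flush later,
    // so unrelated work can proceed while the flush loop writes the frozen layer.
    let flush_request_id = timeline.freeze().await?;
    // ... other work ...
    timeline.wait_flush_completion(flush_request_id).await?;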
|
||||
|
||||
@@ -1637,6 +1642,8 @@ impl Timeline {
|
||||
CompactOptions {
|
||||
flags,
|
||||
compact_range: None,
|
||||
compact_below_lsn: None,
|
||||
sub_compaction: false,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
@@ -2371,7 +2378,7 @@ impl Timeline {
|
||||
|
||||
standby_horizon: AtomicLsn::new(0),
|
||||
|
||||
timeline_get_throttle: resources.timeline_get_throttle,
|
||||
pagestream_throttle: resources.pagestream_throttle,
|
||||
|
||||
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
|
||||
|
||||
@@ -2392,7 +2399,7 @@ impl Timeline {
|
||||
|
||||
result
|
||||
.metrics
|
||||
.last_record_gauge
|
||||
.last_record_lsn_gauge
|
||||
.set(disk_consistent_lsn.0 as i64);
|
||||
result
|
||||
})
|
||||
@@ -3488,7 +3495,6 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let gate_guard = self.gate.enter().context("enter gate for inmem layer")?;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
ensure!(
|
||||
@@ -3505,7 +3511,7 @@ impl Timeline {
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
gate_guard,
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -3515,7 +3521,7 @@ impl Timeline {
|
||||
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
|
||||
assert!(new_lsn.is_aligned());
|
||||
|
||||
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
|
||||
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
@@ -3883,6 +3889,10 @@ impl Timeline {
|
||||
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
|
||||
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
|
||||
assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonically at runtime; current {old_value}, offered {new_value}");
|
||||
|
||||
self.metrics
|
||||
.disk_consistent_lsn_gauge
|
||||
.set(new_value.0 as i64);
|
||||
new_value != old_value
|
||||
}
|
||||
|
||||
@@ -5921,6 +5931,23 @@ impl<'a> TimelineWriter<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// In debug builds, assert that we don't write any keys that don't belong to this shard.
|
||||
// We don't assert this in release builds, since key ownership policies may change over
|
||||
// time. Stray keys will be removed during compaction.
|
||||
if cfg!(debug_assertions) {
|
||||
for metadata in &batch.metadata {
|
||||
if let ValueMeta::Serialized(metadata) = metadata {
|
||||
let key = Key::from_compact(metadata.key);
|
||||
assert!(
|
||||
self.shard_identity.is_key_local(&key)
|
||||
|| self.shard_identity.is_key_global(&key),
|
||||
"key {key} does not belong on shard {}",
|
||||
self.shard_identity.shard_index()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let batch_max_lsn = batch.max_lsn;
|
||||
let buf_size: u64 = batch.buffer_size() as u64;
|
||||
|
||||
|
||||
@@ -10,13 +10,12 @@ use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
RecordedDuration, Timeline,
|
||||
CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
|
||||
ImageLayerCreationMode, RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::KEY_SIZE;
|
||||
@@ -30,7 +29,6 @@ use utils::id::TimelineId;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::batch_split_writer::{
|
||||
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
|
||||
@@ -64,6 +62,12 @@ use super::CompactionError;
|
||||
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
|
||||
const COMPACTION_DELTA_THRESHOLD: usize = 5;
|
||||
|
||||
/// A scheduled compaction task.
|
||||
pub struct ScheduledCompactionTask {
|
||||
pub options: CompactOptions,
|
||||
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
}
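
For illustration, a hedged sketch of constructing a scheduled compaction task with a completion notification; the queue it would be pushed onto is not part of this diff and is only assumed here.

    // Sketch: pair the task with a oneshot channel so the scheduler can report completion.
    let (result_tx, result_rx) = tokio::sync::oneshot::channel::<()>();
    let task = ScheduledCompactionTask {
        options: CompactOptions {
            flags: EnumSet::empty(),
            compact_range: None,
            compact_below_lsn: None,
            sub_compaction: false,
        },
        result_tx: Some(result_tx),
    };
    // ... push `task` onto the tenant's gc-compaction queue (not shown in this diff) ...
    // A waiting caller can later `result_rx.await` to learn that the job finished.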
|
||||
|
||||
pub struct GcCompactionJobDescription {
|
||||
/// All layers to read in the compaction job
|
||||
selected_layers: Vec<Layer>,
|
||||
@@ -1174,11 +1178,12 @@ impl Timeline {
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
} else {
|
||||
debug!(
|
||||
"Dropping key {} during compaction (it belongs on shard {:?})",
|
||||
key,
|
||||
self.shard_identity.get_shard_number(&key)
|
||||
);
|
||||
let shard = self.shard_identity.shard_index();
|
||||
let owner = self.shard_identity.get_shard_number(&key);
|
||||
if cfg!(debug_assertions) {
|
||||
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
|
||||
}
|
||||
debug!("dropping key {key} during compaction (it belongs on shard {owner})");
|
||||
}
|
||||
|
||||
if !new_layers.is_empty() {
|
||||
@@ -1746,22 +1751,114 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn compact_with_gc(
|
||||
/// Split a gc-compaction job into multiple compaction jobs. Ideally, this function should return a vector of
|
||||
/// `GcCompactionJobDescription`. But we want to keep it simple on the tenant scheduling side without exposing too much
|
||||
/// ad-hoc information about gc compaction itself.
|
||||
pub(crate) async fn gc_compaction_split_jobs(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.partial_compact_with_gc(
|
||||
options
|
||||
.compact_range
|
||||
.map(|range| range.start..range.end)
|
||||
.unwrap_or_else(|| Key::MIN..Key::MAX),
|
||||
cancel,
|
||||
options.flags,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
) -> anyhow::Result<Vec<CompactOptions>> {
|
||||
if !options.sub_compaction {
|
||||
return Ok(vec![options]);
|
||||
}
|
||||
let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
|
||||
start: Key::MIN,
|
||||
end: Key::MAX,
|
||||
});
|
||||
let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
|
||||
compact_below_lsn
|
||||
} else {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.select_min() // use the real gc cutoff
|
||||
};
|
||||
let mut compact_jobs = Vec::new();
|
||||
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
|
||||
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
|
||||
let Ok(partition) = self.partitioning.try_lock() else {
|
||||
bail!("failed to acquire partition lock");
|
||||
};
|
||||
let ((dense_ks, sparse_ks), _) = &*partition;
|
||||
// Truncate the key range to be within user specified compaction range.
|
||||
fn truncate_to(
|
||||
source_start: &Key,
|
||||
source_end: &Key,
|
||||
target_start: &Key,
|
||||
target_end: &Key,
|
||||
) -> Option<(Key, Key)> {
|
||||
let start = source_start.max(target_start);
|
||||
let end = source_end.min(target_end);
|
||||
if start < end {
|
||||
Some((*start, *end))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
let mut split_key_ranges = Vec::new();
|
||||
let ranges = dense_ks
|
||||
.parts
|
||||
.iter()
|
||||
.map(|partition| partition.ranges.iter())
|
||||
.chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
|
||||
.flatten()
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
for range in ranges.iter() {
|
||||
let Some((start, end)) = truncate_to(
|
||||
&range.start,
|
||||
&range.end,
|
||||
&compact_range.start,
|
||||
&compact_range.end,
|
||||
) else {
|
||||
continue;
|
||||
};
|
||||
split_key_ranges.push((start, end));
|
||||
}
|
||||
split_key_ranges.sort();
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let mut current_start = None;
|
||||
// Split compaction job to about 2GB each
|
||||
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
|
||||
let ranges_num = split_key_ranges.len();
|
||||
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
|
||||
if current_start.is_none() {
|
||||
current_start = Some(start);
|
||||
}
|
||||
let start = current_start.unwrap();
|
||||
if start >= end {
|
||||
// We have already processed this partition.
|
||||
continue;
|
||||
}
|
||||
let res = layer_map.range_search(start..end, compact_below_lsn);
|
||||
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
|
||||
if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
|
||||
let mut compact_options = options.clone();
|
||||
// Try to extend the compaction range so that we include at least one full layer file.
|
||||
let extended_end = res
|
||||
.found
|
||||
.keys()
|
||||
.map(|layer| layer.layer.key_range.end)
|
||||
.min();
|
||||
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
|
||||
// In this case, we simply use the specified key range end.
|
||||
let end = if let Some(extended_end) = extended_end {
|
||||
extended_end.max(end)
|
||||
} else {
|
||||
end
|
||||
};
|
||||
info!(
|
||||
"splitting compaction job: {}..{}, estimated_size={}",
|
||||
start, end, total_size
|
||||
);
|
||||
compact_options.compact_range = Some(CompactRange { start, end });
|
||||
compact_options.compact_below_lsn = Some(compact_below_lsn);
|
||||
compact_options.sub_compaction = false;
|
||||
compact_jobs.push(compact_options);
|
||||
current_start = Some(end);
|
||||
}
|
||||
}
|
||||
drop(guard);
|
||||
Ok(compact_jobs)
|
||||
}
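
The range truncation above is plain interval intersection; a standalone sketch with integer keys (not the real `Key` type) shows the intended behaviour:

    // Simplified stand-in for the `truncate_to` helper, using integers instead of `Key`.
    fn truncate_to(source: std::ops::Range<u64>, target: std::ops::Range<u64>) -> Option<std::ops::Range<u64>> {
        let start = source.start.max(target.start);
        let end = source.end.min(target.end);
        (start < end).then(|| start..end)
    }

    #[cfg(test)]
    #[test]
    fn truncate_to_clips_to_overlap() {
        assert_eq!(truncate_to(0..10, 5..20), Some(5..10)); // overlapping ranges are clipped
        assert_eq!(truncate_to(0..5, 5..20), None);         // disjoint ranges yield no job
    }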
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
@@ -1771,19 +1868,51 @@ impl Timeline {
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
///
|
||||
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
|
||||
/// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
|
||||
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
|
||||
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
|
||||
/// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
|
||||
/// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
|
||||
/// part of the range.
|
||||
pub(crate) async fn partial_compact_with_gc(
|
||||
///
|
||||
/// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
|
||||
/// the LSN. Otherwise, it will use the gc cutoff by default.
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
compaction_key_range: Range<Key>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if options.sub_compaction {
|
||||
info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs = self.gc_compaction_split_jobs(options).await?;
|
||||
let jobs_len = jobs.len();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
info!(
|
||||
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
|
||||
idx + 1,
|
||||
jobs_len
|
||||
);
|
||||
self.compact_with_gc_inner(cancel, job, ctx).await?;
|
||||
}
|
||||
if jobs_len == 0 {
|
||||
info!("no jobs to run, skipping gc bottom-most compaction");
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
self.compact_with_gc_inner(cancel, options, ctx).await
|
||||
}
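
A hedged sketch of invoking the entry point above with sub-compaction enabled, so the splitting path is taken; the `timeline`, cancellation token and request context are assumed to come from the surrounding task and are not defined in this diff.

    // Sketch only: run gc-compaction over the full key range and let
    // gc_compaction_split_jobs() break it into size-bounded jobs.
    let options = CompactOptions {
        flags: EnumSet::empty(),
        compact_range: Some(CompactRange { start: Key::MIN, end: Key::MAX }),
        compact_below_lsn: None, // fall back to the real gc cutoff
        sub_compaction: true,
    };
    timeline.compact_with_gc(&cancel, options, &ctx).await?;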
|
||||
|
||||
async fn compact_with_gc_inner(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(
|
||||
!options.sub_compaction,
|
||||
"sub-compaction should be handled by the outer function"
|
||||
);
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
@@ -1803,6 +1932,12 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let flags = options.flags;
|
||||
let compaction_key_range = options
|
||||
.compact_range
|
||||
.map(|range| range.start..range.end)
|
||||
.unwrap_or_else(|| Key::MIN..Key::MAX);
|
||||
|
||||
let dry_run = flags.contains(CompactFlags::DryRun);
|
||||
|
||||
if compaction_key_range == (Key::MIN..Key::MAX) {
|
||||
@@ -1826,7 +1961,18 @@ impl Timeline {
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
let gc_cutoff = gc_info.cutoffs.select_min();
|
||||
let gc_cutoff = {
|
||||
let real_gc_cutoff = gc_info.cutoffs.select_min();
|
||||
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
|
||||
// each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
|
||||
// the real cutoff.
|
||||
let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
|
||||
if gc_cutoff > real_gc_cutoff {
|
||||
warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
|
||||
gc_cutoff = real_gc_cutoff;
|
||||
}
|
||||
gc_cutoff
|
||||
};
|
||||
for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
|
||||
if lsn < &gc_cutoff {
|
||||
retain_lsns_below_horizon.push(*lsn);
|
||||
@@ -1846,7 +1992,7 @@ impl Timeline {
|
||||
.map(|desc| desc.get_lsn_range().end)
|
||||
.max()
|
||||
else {
|
||||
info!("no layers to compact with gc");
|
||||
info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
|
||||
return Ok(());
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
@@ -1869,7 +2015,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
if selected_layers.is_empty() {
|
||||
info!("no layers to compact with gc");
|
||||
info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
|
||||
return Ok(());
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
@@ -1936,14 +2082,15 @@ impl Timeline {
|
||||
|
||||
// Step 1: construct a k-merge iterator over all layers.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let layer_names = job_desc
|
||||
.selected_layers
|
||||
.iter()
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
|
||||
}
|
||||
// Disable the check for now because we need to adjust it for partial compactions; it will be re-enabled later.
|
||||
// let layer_names = job_desc
|
||||
// .selected_layers
|
||||
// .iter()
|
||||
// .map(|layer| layer.layer_desc().layer_name())
|
||||
// .collect_vec();
|
||||
// if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
// warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
|
||||
// }
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = job_desc
|
||||
.selected_layers
|
||||
@@ -2048,6 +2195,11 @@ impl Timeline {
|
||||
// This is not handled in the filter iterator because shard is determined by hash.
|
||||
// Therefore, it does not give us any performance benefit to do things like skip
|
||||
// a whole layer file as handling key spaces (ranges).
|
||||
if cfg!(debug_assertions) {
|
||||
let shard = self.shard_identity.shard_index();
|
||||
let owner = self.shard_identity.get_shard_number(&key);
|
||||
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if !job_desc.compaction_key_range.contains(&key) {
|
||||
|
||||
@@ -298,7 +298,7 @@ impl DeleteTimelineFlow {
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
|
||||
pagestream_throttle: tenant.pagestream_throttle.clone(),
|
||||
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
|
||||
},
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
|
||||
@@ -129,22 +129,23 @@ impl Flow {
|
||||
}
|
||||
|
||||
// Import SLRUs
|
||||
|
||||
// pg_xact (01:00 keyspace)
|
||||
self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
|
||||
if self.timeline.tenant_shard_id.is_shard_zero() {
|
||||
// pg_xact (01:00 keyspace)
|
||||
self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
|
||||
.await?;
|
||||
// pg_multixact/members (01:01 keyspace)
|
||||
self.import_slru(
|
||||
SlruKind::MultiXactMembers,
|
||||
&self.storage.pgdata().join("pg_multixact/members"),
|
||||
)
|
||||
.await?;
|
||||
// pg_multixact/members (01:01 keyspace)
|
||||
self.import_slru(
|
||||
SlruKind::MultiXactMembers,
|
||||
&self.storage.pgdata().join("pg_multixact/members"),
|
||||
)
|
||||
.await?;
|
||||
// pg_multixact/offsets (01:02 keyspace)
|
||||
self.import_slru(
|
||||
SlruKind::MultiXactOffsets,
|
||||
&self.storage.pgdata().join("pg_multixact/offsets"),
|
||||
)
|
||||
.await?;
|
||||
// pg_multixact/offsets (01:02 keyspace)
|
||||
self.import_slru(
|
||||
SlruKind::MultiXactOffsets,
|
||||
&self.storage.pgdata().join("pg_multixact/offsets"),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Import pg_twophase.
|
||||
// TODO: as empty
|
||||
@@ -302,6 +303,8 @@ impl Flow {
|
||||
}
|
||||
|
||||
async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
|
||||
assert!(self.timeline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let segments = self.storage.listfilesindir(path).await?;
|
||||
let segments: Vec<(String, u32, usize)> = segments
|
||||
.into_iter()
|
||||
@@ -337,7 +340,6 @@ impl Flow {
|
||||
debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment");
|
||||
self.tasks
|
||||
.push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(
|
||||
*self.timeline.get_shard_identity(),
|
||||
start_key..end_key,
|
||||
&p,
|
||||
self.storage.clone(),
|
||||
@@ -631,21 +633,14 @@ impl ImportTask for ImportRelBlocksTask {
|
||||
}
|
||||
|
||||
struct ImportSlruBlocksTask {
|
||||
shard_identity: ShardIdentity,
|
||||
key_range: Range<Key>,
|
||||
path: RemotePath,
|
||||
storage: RemoteStorageWrapper,
|
||||
}
|
||||
|
||||
impl ImportSlruBlocksTask {
|
||||
fn new(
|
||||
shard_identity: ShardIdentity,
|
||||
key_range: Range<Key>,
|
||||
path: &RemotePath,
|
||||
storage: RemoteStorageWrapper,
|
||||
) -> Self {
|
||||
fn new(key_range: Range<Key>, path: &RemotePath, storage: RemoteStorageWrapper) -> Self {
|
||||
ImportSlruBlocksTask {
|
||||
shard_identity,
|
||||
key_range,
|
||||
path: path.clone(),
|
||||
storage,
|
||||
@@ -673,17 +668,13 @@ impl ImportTask for ImportSlruBlocksTask {
|
||||
let mut file_offset = 0;
|
||||
while blknum < end_blk {
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
assert!(
|
||||
!self.shard_identity.is_key_disposable(&key),
|
||||
"SLRU keys need to go into every shard"
|
||||
);
|
||||
let buf = &buf[file_offset..(file_offset + 8192)];
|
||||
file_offset += 8192;
|
||||
layer_writer
|
||||
.put_image(key, Bytes::copy_from_slice(buf), ctx)
|
||||
.await?;
|
||||
blknum += 1;
|
||||
nimages += 1;
|
||||
blknum += 1;
|
||||
}
|
||||
Ok(nimages)
|
||||
}
|
||||
|
||||
@@ -182,7 +182,7 @@ impl OpenLayerManager {
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
ensure!(lsn.is_aligned());
|
||||
@@ -212,15 +212,9 @@ impl OpenLayerManager {
|
||||
lsn
|
||||
);
|
||||
|
||||
let new_layer = InMemoryLayer::create(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_lsn,
|
||||
gate_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let new_layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx)
|
||||
.await?;
|
||||
let layer = Arc::new(new_layer);
|
||||
|
||||
self.layer_map.open_layer = Some(layer.clone());
|
||||
|
||||
@@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// advances it to its end LSN. 0 is just an initialization placeholder.
|
||||
let mut modification = timeline.begin_modification(Lsn(0));
|
||||
|
||||
if !records.is_empty() {
|
||||
timeline
|
||||
.metrics
|
||||
.wal_records_received
|
||||
.inc_by(records.len() as u64);
|
||||
}
|
||||
|
||||
for interpreted in records {
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
@@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
|
||||
// Ingest the records without immediately committing them.
|
||||
timeline.metrics.wal_records_received.inc();
|
||||
let ingested = walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
|
||||
@@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer;
|
||||
use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign};
|
||||
use owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
|
||||
use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
|
||||
use owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io {
|
||||
pub(crate) mod io_buf_ext;
|
||||
pub(crate) mod slice;
|
||||
pub(crate) mod write;
|
||||
pub(crate) mod util {
|
||||
pub(crate) mod size_tracking_writer;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -221,7 +218,7 @@ impl VirtualFile {
|
||||
self.inner.read_exact_at_page(page, offset, ctx).await
|
||||
}
|
||||
|
||||
pub async fn write_all_at<Buf: IoBuf + Send>(
|
||||
pub async fn write_all_at<Buf: IoBufAligned + Send>(
|
||||
&self,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
@@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner {
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for VirtualFile {
|
||||
#[inline(always)]
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
async fn write_all_at<Buf: IoBufAligned + Send>(
|
||||
&self,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
|
||||
res.map(move |v| (v, buf))
|
||||
) -> std::io::Result<FullSlice<Buf>> {
|
||||
let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await;
|
||||
res.map(|_| buf)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1451,7 +1448,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn write_all_at<Buf: IoBuf + Send>(
|
||||
async fn write_all_at<Buf: IoBufAligned + Send>(
|
||||
&self,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
@@ -1594,6 +1591,7 @@ mod tests {
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
file_a
|
||||
.write_all(b"foobar".to_vec().slice_len(), &ctx)
|
||||
.await?;
|
||||
@@ -1652,10 +1650,10 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
file_b
|
||||
.write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
|
||||
.write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx)
|
||||
.await?;
|
||||
file_b
|
||||
.write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
|
||||
.write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx)
|
||||
.await?;
|
||||
|
||||
assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
|
||||
|
||||
@@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static {
|
||||
}
|
||||
|
||||
/// Alignment at compile time.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ConstAlign<const A: usize>;
|
||||
|
||||
impl<const A: usize> Alignment for ConstAlign<A> {
|
||||
@@ -14,7 +14,7 @@ impl<const A: usize> Alignment for ConstAlign<A> {
|
||||
}
|
||||
|
||||
/// Alignment at run time.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RuntimeAlign {
|
||||
align: usize,
|
||||
}
|
||||
|
||||
@@ -3,9 +3,10 @@ use std::{
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use super::{alignment::Alignment, raw::RawAlignedBuffer};
|
||||
use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign};
|
||||
|
||||
/// A shared, immutable aligned buffer type.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AlignedBuffer<A: Alignment> {
|
||||
/// Shared raw buffer.
|
||||
raw: Arc<RawAlignedBuffer<A>>,
|
||||
@@ -86,6 +87,13 @@ impl<A: Alignment> AlignedBuffer<A> {
|
||||
range: begin..end,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the mutable aligned buffer, if the immutable aligned buffer
|
||||
/// has exactly one strong reference. Otherwise returns `None`.
|
||||
pub fn into_mut(self) -> Option<AlignedBufferMut<A>> {
|
||||
let raw = Arc::into_inner(self.raw)?;
|
||||
Some(AlignedBufferMut::from_raw(raw))
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: Alignment> Deref for AlignedBuffer<A> {
|
||||
@@ -108,6 +116,14 @@ impl<A: Alignment> PartialEq<[u8]> for AlignedBuffer<A> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<const A: usize, const N: usize> From<&[u8; N]> for AlignedBuffer<ConstAlign<A>> {
|
||||
fn from(value: &[u8; N]) -> Self {
|
||||
let mut buf = AlignedBufferMut::with_capacity(N);
|
||||
buf.extend_from_slice(value);
|
||||
buf.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
/// SAFETY: the underlying buffer references a stable memory region.
|
||||
unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBuffer<A> {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::{
|
||||
mem::MaybeUninit,
|
||||
ops::{Deref, DerefMut},
|
||||
};
|
||||
|
||||
use super::{
|
||||
alignment::{Alignment, ConstAlign},
|
||||
@@ -46,6 +49,11 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
|
||||
}
|
||||
|
||||
impl<A: Alignment> AlignedBufferMut<A> {
|
||||
/// Constructs a mutable aligned buffer from raw.
|
||||
pub(super) fn from_raw(raw: RawAlignedBuffer<A>) -> Self {
|
||||
AlignedBufferMut { raw }
|
||||
}
|
||||
|
||||
/// Returns the total number of bytes the buffer can hold.
|
||||
#[inline]
|
||||
pub fn capacity(&self) -> usize {
|
||||
@@ -128,6 +136,39 @@ impl<A: Alignment> AlignedBufferMut<A> {
|
||||
let len = self.len();
|
||||
AlignedBuffer::from_raw(self.raw, 0..len)
|
||||
}
|
||||
|
||||
/// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed.
|
||||
#[inline]
|
||||
pub fn extend_from_slice(&mut self, extend: &[u8]) {
|
||||
let cnt = extend.len();
|
||||
self.reserve(cnt);
|
||||
|
||||
// SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy.
|
||||
unsafe {
|
||||
let dst = self.spare_capacity_mut();
|
||||
// Reserved above
|
||||
debug_assert!(dst.len() >= cnt);
|
||||
|
||||
core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt);
|
||||
}
|
||||
// SAFETY: We do have at least `cnt` bytes remaining before advance.
|
||||
unsafe {
|
||||
bytes::BufMut::advance_mut(self, cnt);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit<u8>`.
|
||||
#[inline]
|
||||
fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit<u8>] {
|
||||
// SAFETY: we guarantee that the `Self::capacity()` bytes from
|
||||
// `Self::as_mut_ptr()` are allocated.
|
||||
unsafe {
|
||||
let ptr = self.as_mut_ptr().add(self.len());
|
||||
let len = self.capacity() - self.len();
|
||||
|
||||
core::slice::from_raw_parts_mut(ptr.cast(), len)
|
||||
}
|
||||
}
|
||||
}
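
A hedged sketch of the freeze/recycle round trip enabled by the additions above, using the crate's `IoBufferMut`/`IoBuffer` aliases; the capacity and contents are illustrative only.

    // Sketch: fill a mutable aligned buffer, freeze it into a shared immutable view,
    // then reclaim it for reuse once it is the last strong reference.
    let mut buf = IoBufferMut::with_capacity(8192);
    buf.extend_from_slice(b"some page image bytes");
    let frozen = buf.freeze(); // shared, immutable view for readers / the flush task
    // ... hand clones of `frozen` around; drop them when done ...
    let mut recycled = frozen
        .into_mut()
        .expect("no other strong references remain");
    recycled.clear(); // ready for the next write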
|
||||
|
||||
impl<A: Alignment> Deref for AlignedBufferMut<A> {
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::{IoBuf, IoBufMut};
|
||||
|
||||
use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf};
|
||||
use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf};
|
||||
|
||||
/// A marker trait for a mutable aligned buffer type.
|
||||
pub trait IoBufAlignedMut: IoBufMut {}
|
||||
|
||||
/// A marker trait for an aligned buffer type.
|
||||
pub trait IoBufAligned: IoBuf {}
|
||||
|
||||
impl IoBufAlignedMut for IoBufferMut {}
|
||||
|
||||
impl IoBufAligned for IoBuffer {}
|
||||
|
||||
impl IoBufAlignedMut for PageWriteGuardBuf {}
|
||||
|
||||
@@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut};
|
||||
use std::ops::{Deref, Range};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
|
||||
use super::write::CheapCloneForRead;
|
||||
|
||||
/// The true owned equivalent for Rust [`slice`]. Use this for the write path.
|
||||
///
|
||||
/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`,
|
||||
@@ -43,6 +45,18 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> CheapCloneForRead for FullSlice<B>
|
||||
where
|
||||
B: IoBuf + CheapCloneForRead,
|
||||
{
|
||||
fn cheap_clone(&self) -> Self {
|
||||
let bounds = self.slice.bounds();
|
||||
let clone = self.slice.get_ref().cheap_clone();
|
||||
let slice = clone.slice(bounds);
|
||||
Self { slice }
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait IoBufExt {
|
||||
/// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`.
|
||||
fn slice_len(self) -> FullSlice<Self>
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter},
|
||||
};
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
|
||||
pub struct Writer<W> {
|
||||
dst: W,
|
||||
bytes_amount: u64,
|
||||
}
|
||||
|
||||
impl<W> Writer<W> {
|
||||
pub fn new(dst: W) -> Self {
|
||||
Self {
|
||||
dst,
|
||||
bytes_amount: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn bytes_written(&self) -> u64 {
|
||||
self.bytes_amount
|
||||
}
|
||||
|
||||
pub fn as_inner(&self) -> &W {
|
||||
&self.dst
|
||||
}
|
||||
|
||||
/// Returns the wrapped `VirtualFile` object as well as the number
|
||||
/// of bytes that were written to it through this object.
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub fn into_inner(self) -> (u64, W) {
|
||||
(self.bytes_amount, self.dst)
|
||||
}
|
||||
}
|
||||
|
||||
impl<W> OwnedAsyncWriter for Writer<W>
|
||||
where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
#[inline(always)]
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
|
||||
self.bytes_amount += u64::try_from(nwritten).unwrap();
|
||||
Ok((nwritten, buf))
|
||||
}
|
||||
}
|
||||
@@ -1,55 +1,88 @@
|
||||
use bytes::BytesMut;
|
||||
mod flush;
|
||||
use std::sync::Arc;
|
||||
|
||||
use flush::FlushHandle;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
virtual_file::{IoBuffer, IoBufferMut},
|
||||
};
|
||||
|
||||
use super::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use super::{
|
||||
io_buf_aligned::IoBufAligned,
|
||||
io_buf_ext::{FullSlice, IoBufExt},
|
||||
};
|
||||
|
||||
pub(crate) use flush::FlushControl;
|
||||
|
||||
pub(crate) trait CheapCloneForRead {
|
||||
/// Returns a cheap clone of the buffer.
|
||||
fn cheap_clone(&self) -> Self;
|
||||
}
|
||||
|
||||
impl CheapCloneForRead for IoBuffer {
|
||||
fn cheap_clone(&self) -> Self {
|
||||
// Cheap clone over an `Arc`.
|
||||
self.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait for doing owned-buffer write IO.
|
||||
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
|
||||
/// The owned buffers need to be aligned due to Direct IO requirements.
|
||||
pub trait OwnedAsyncWriter {
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
fn write_all_at<Buf: IoBufAligned + Send>(
|
||||
&self,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)>;
|
||||
) -> impl std::future::Future<Output = std::io::Result<FullSlice<Buf>>> + Send;
|
||||
}
|
||||
|
||||
/// A wrapper around an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
|
||||
/// small writes into larger writes of size [`Buffer::cap`].
|
||||
///
|
||||
/// # Passthrough Of Large Writes
|
||||
///
|
||||
/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
|
||||
/// cause the internal buffer to be flushed prematurely so that the large
|
||||
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
|
||||
///
|
||||
/// This pass-through is generally beneficial for throughput, but if
|
||||
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
|
||||
/// unlimited large writes may cause latency or fairness issues.
|
||||
///
|
||||
/// In such cases, a different implementation that always buffers in memory
|
||||
/// may be preferable.
|
||||
pub struct BufferedWriter<B, W> {
|
||||
writer: W,
|
||||
// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput,
|
||||
// since we would avoid copying majority of the data into the internal buffer.
|
||||
pub struct BufferedWriter<B: Buffer, W> {
|
||||
writer: Arc<W>,
|
||||
/// invariant: always remains Some(buf) except
|
||||
/// - while IO is ongoing => goes back to Some() once the IO completed successfully
|
||||
/// - after an IO error => stays `None` forever
|
||||
///
|
||||
/// In these exceptional cases, it's `None`.
|
||||
buf: Option<B>,
|
||||
mutable: Option<B>,
|
||||
/// A handle to the background flush task for writing data to disk.
|
||||
flush_handle: FlushHandle<B::IoBuf, W>,
|
||||
/// The number of bytes submitted to the background task.
|
||||
bytes_submitted: u64,
|
||||
}
|
||||
|
||||
impl<B, Buf, W> BufferedWriter<B, W>
|
||||
where
|
||||
B: Buffer<IoBuf = Buf> + Send,
|
||||
Buf: IoBuf + Send,
|
||||
W: OwnedAsyncWriter,
|
||||
B: Buffer<IoBuf = Buf> + Send + 'static,
|
||||
Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
|
||||
W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
|
||||
{
|
||||
pub fn new(writer: W, buf: B) -> Self {
|
||||
/// Creates a new buffered writer.
|
||||
///
|
||||
/// The `buf_new` function provides a way to initialize the owned buffers used by this writer.
|
||||
pub fn new(
|
||||
writer: Arc<W>,
|
||||
buf_new: impl Fn() -> B,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> Self {
|
||||
Self {
|
||||
writer,
|
||||
buf: Some(buf),
|
||||
writer: writer.clone(),
|
||||
mutable: Some(buf_new()),
|
||||
flush_handle: FlushHandle::spawn_new(
|
||||
writer,
|
||||
buf_new(),
|
||||
gate_guard,
|
||||
ctx.attached_child(),
|
||||
),
|
||||
bytes_submitted: 0,
|
||||
}
|
||||
}
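
A hedged usage sketch of the new constructor, mirroring the unit test further down; `some_owned_async_writer`, the buffer size, and the surrounding async context are assumptions for illustration only.

    // Sketch: two owned buffers are created up front, one stays mutable in the writer,
    // the other is handed to the background flush task.
    let gate = utils::sync::gate::Gate::default();
    let writer = Arc::new(some_owned_async_writer); // any `OwnedAsyncWriter` impl (hypothetical here)
    let mut buffered = BufferedWriter::new(
        writer,
        || IoBufferMut::with_capacity(64 * 1024),
        gate.enter()?,
        &ctx,
    );
    buffered.write_buffered_borrowed(b"hello", &ctx).await?;
    let (bytes_submitted, writer) = buffered.flush_and_into_inner(&ctx).await?;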
|
||||
|
||||
@@ -57,87 +90,71 @@ where
|
||||
&self.writer
|
||||
}
|
||||
|
||||
/// Returns the number of bytes submitted to the background flush task.
|
||||
pub fn bytes_submitted(&self) -> u64 {
|
||||
self.bytes_submitted
|
||||
}
|
||||
|
||||
/// Panics if used after any of the write paths returned an error
|
||||
pub fn inspect_buffer(&self) -> &B {
|
||||
self.buf()
|
||||
pub fn inspect_mutable(&self) -> &B {
|
||||
self.mutable()
|
||||
}
|
||||
|
||||
/// Gets a reference to the maybe flushed read-only buffer.
|
||||
/// Returns `None` if the writer has not submitted any flush request.
|
||||
pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice<Buf>> {
|
||||
self.flush_handle.maybe_flushed.as_ref()
|
||||
}
|
||||
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
|
||||
pub async fn flush_and_into_inner(
|
||||
mut self,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(u64, Arc<W>)> {
|
||||
self.flush(ctx).await?;
|
||||
|
||||
let Self { buf, writer } = self;
|
||||
let Self {
|
||||
mutable: buf,
|
||||
writer,
|
||||
mut flush_handle,
|
||||
bytes_submitted: bytes_amount,
|
||||
} = self;
|
||||
flush_handle.shutdown().await?;
|
||||
assert!(buf.is_some());
|
||||
Ok(writer)
|
||||
Ok((bytes_amount, writer))
|
||||
}
|
||||
|
||||
/// Gets a reference to the mutable in-memory buffer.
|
||||
#[inline(always)]
|
||||
fn buf(&self) -> &B {
|
||||
self.buf
|
||||
fn mutable(&self) -> &B {
|
||||
self.mutable
|
||||
.as_ref()
|
||||
.expect("must not use after we returned an error")
|
||||
}
|
||||
|
||||
/// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub async fn write_buffered<S: IoBuf + Send>(
|
||||
pub async fn write_buffered_borrowed(
|
||||
&mut self,
|
||||
chunk: FullSlice<S>,
|
||||
chunk: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<S>)> {
|
||||
let chunk = chunk.into_raw_slice();
|
||||
|
||||
let chunk_len = chunk.len();
|
||||
// avoid memcpy for the middle of the chunk
|
||||
if chunk.len() >= self.buf().cap() {
|
||||
self.flush(ctx).await?;
|
||||
// do a big write, bypassing `buf`
|
||||
assert_eq!(
|
||||
self.buf
|
||||
.as_ref()
|
||||
.expect("must not use after an error")
|
||||
.pending(),
|
||||
0
|
||||
);
|
||||
let (nwritten, chunk) = self
|
||||
.writer
|
||||
.write_all(FullSlice::must_new(chunk), ctx)
|
||||
.await?;
|
||||
assert_eq!(nwritten, chunk_len);
|
||||
return Ok((nwritten, chunk));
|
||||
) -> std::io::Result<usize> {
|
||||
let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?;
|
||||
if let Some(control) = control {
|
||||
control.release().await;
|
||||
}
|
||||
// in-memory copy the < BUFFER_SIZED tail of the chunk
|
||||
assert!(chunk.len() < self.buf().cap());
|
||||
let mut slice = &chunk[..];
|
||||
while !slice.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
let need = buf.cap() - buf.pending();
|
||||
let have = slice.len();
|
||||
let n = std::cmp::min(need, have);
|
||||
buf.extend_from_slice(&slice[..n]);
|
||||
slice = &slice[n..];
|
||||
if buf.pending() >= buf.cap() {
|
||||
assert_eq!(buf.pending(), buf.cap());
|
||||
self.flush(ctx).await?;
|
||||
}
|
||||
}
|
||||
assert!(slice.is_empty(), "by now we should have drained the chunk");
|
||||
Ok((chunk_len, FullSlice::must_new(chunk)))
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
|
||||
///
|
||||
/// It is less performant because we always have to copy the borrowed data into the internal buffer
|
||||
/// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
|
||||
/// for large writes.
|
||||
pub async fn write_buffered_borrowed(
|
||||
/// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior.
|
||||
pub(crate) async fn write_buffered_borrowed_controlled(
|
||||
&mut self,
|
||||
mut chunk: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<usize> {
|
||||
) -> std::io::Result<(usize, Option<FlushControl>)> {
|
||||
let chunk_len = chunk.len();
|
||||
let mut control: Option<FlushControl> = None;
|
||||
while !chunk.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
let buf = self.mutable.as_mut().expect("must not use after an error");
|
||||
let need = buf.cap() - buf.pending();
|
||||
let have = chunk.len();
|
||||
let n = std::cmp::min(need, have);
|
||||
@@ -145,26 +162,27 @@ where
|
||||
chunk = &chunk[n..];
|
||||
if buf.pending() >= buf.cap() {
|
||||
assert_eq!(buf.pending(), buf.cap());
|
||||
self.flush(ctx).await?;
|
||||
if let Some(control) = control.take() {
|
||||
control.release().await;
|
||||
}
|
||||
control = self.flush(ctx).await?;
|
||||
}
|
||||
}
|
||||
Ok(chunk_len)
|
||||
Ok((chunk_len, control))
|
||||
}
|
||||
|
||||
async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
|
||||
let buf = self.buf.take().expect("must not use after an error");
|
||||
#[must_use = "caller must explicitly check the flush control"]
|
||||
async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result<Option<FlushControl>> {
|
||||
let buf = self.mutable.take().expect("must not use after an error");
|
||||
let buf_len = buf.pending();
|
||||
if buf_len == 0 {
|
||||
self.buf = Some(buf);
|
||||
return Ok(());
|
||||
self.mutable = Some(buf);
|
||||
return Ok(None);
|
||||
}
|
||||
let slice = buf.flush();
|
||||
let (nwritten, slice) = self.writer.write_all(slice, ctx).await?;
|
||||
assert_eq!(nwritten, buf_len);
|
||||
self.buf = Some(Buffer::reuse_after_flush(
|
||||
slice.into_raw_slice().into_inner(),
|
||||
));
|
||||
Ok(())
|
||||
let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?;
|
||||
self.bytes_submitted += u64::try_from(buf_len).unwrap();
|
||||
self.mutable = Some(recycled);
|
||||
Ok(Some(flush_control))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,64 +210,77 @@ pub trait Buffer {
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
|
||||
}
|
||||
|
||||
impl Buffer for BytesMut {
|
||||
type IoBuf = BytesMut;
|
||||
impl Buffer for IoBufferMut {
|
||||
type IoBuf = IoBuffer;
|
||||
|
||||
#[inline(always)]
|
||||
fn cap(&self) -> usize {
|
||||
self.capacity()
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, other: &[u8]) {
|
||||
BytesMut::extend_from_slice(self, other)
|
||||
if self.len() + other.len() > self.cap() {
|
||||
panic!("Buffer capacity exceeded");
|
||||
}
|
||||
|
||||
IoBufferMut::extend_from_slice(self, other);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn pending(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn flush(self) -> FullSlice<BytesMut> {
|
||||
self.slice_len()
|
||||
fn flush(self) -> FullSlice<Self::IoBuf> {
|
||||
self.freeze().slice_len()
|
||||
}
|
||||
|
||||
fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
|
||||
iobuf.clear();
|
||||
iobuf
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Vec<u8> {
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: FullSlice<Buf>,
|
||||
_: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
self.extend_from_slice(&buf[..]);
|
||||
Ok((buf.len(), buf))
|
||||
/// Caller should make sure that `iobuf` only has one strong reference before invoking this method.
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
|
||||
let mut recycled = iobuf
|
||||
.into_mut()
|
||||
.expect("buffer should only have one strong reference");
|
||||
recycled.clear();
|
||||
recycled
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::BytesMut;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use super::*;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Debug)]
|
||||
struct RecorderWriter {
|
||||
writes: Vec<Vec<u8>>,
|
||||
/// record bytes and write offsets.
|
||||
writes: Mutex<Vec<(Vec<u8>, u64)>>,
|
||||
}
|
||||
|
||||
impl RecorderWriter {
|
||||
/// Gets recorded bytes and write offsets.
|
||||
fn get_writes(&self) -> Vec<Vec<u8>> {
|
||||
self.writes
|
||||
.lock()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.map(|(buf, _)| buf.clone())
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for RecorderWriter {
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
async fn write_all_at<Buf: IoBufAligned + Send>(
|
||||
&self,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
_: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
self.writes.push(Vec::from(&buf[..]));
|
||||
Ok((buf.len(), buf))
|
||||
) -> std::io::Result<FullSlice<Buf>> {
|
||||
self.writes
|
||||
.lock()
|
||||
.unwrap()
|
||||
.push((Vec::from(&buf[..]), offset));
|
||||
Ok(buf)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,71 +288,21 @@ mod tests {
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
|
||||
}
|
||||
|
||||
macro_rules! write {
|
||||
($writer:ident, $data:literal) => {{
|
||||
$writer
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx())
|
||||
.await?;
|
||||
}};
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_buffered_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"b");
|
||||
write!(writer, b"c");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
write!(writer, b"abc");
|
||||
write!(writer, b"de");
|
||||
write!(writer, b"");
|
||||
write!(writer, b"fghijk");
|
||||
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"bc");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
|
||||
async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> {
|
||||
let ctx = test_ctx();
|
||||
let ctx = &ctx;
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
let recorder = Arc::new(RecorderWriter::default());
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let mut writer = BufferedWriter::<_, RecorderWriter>::new(
|
||||
recorder,
|
||||
|| IoBufferMut::with_capacity(2),
|
||||
gate.enter()?,
|
||||
ctx,
|
||||
);
|
||||
|
||||
writer.write_buffered_borrowed(b"abc", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"d", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"e", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"fg", ctx).await?;
|
||||
@@ -329,9 +310,9 @@ mod tests {
|
||||
writer.write_buffered_borrowed(b"j", ctx).await?;
|
||||
writer.write_buffered_borrowed(b"klmno", ctx).await?;
|
||||
|
||||
let recorder = writer.flush_and_into_inner(ctx).await?;
|
||||
let (_, recorder) = writer.flush_and_into_inner(ctx).await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
recorder.get_writes(),
|
||||
{
|
||||
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
|
||||
expect
|
||||
|
||||
314 pageserver/src/virtual_file/owned_buffers_io/write/flush.rs Normal file
@@ -0,0 +1,314 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::sync::duplex;
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice},
|
||||
};
|
||||
|
||||
use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter};
|
||||
|
||||
/// A handle to the flush task.
|
||||
pub struct FlushHandle<Buf, W> {
|
||||
inner: Option<FlushHandleInner<Buf, W>>,
|
||||
/// Immutable buffer for serving tail reads.
|
||||
/// `None` if no flush request has been submitted.
|
||||
pub(super) maybe_flushed: Option<FullSlice<Buf>>,
|
||||
}
|
||||
|
||||
pub struct FlushHandleInner<Buf, W> {
|
||||
/// A bi-directional channel that sends (buffer, offset) for writes,
|
||||
/// and receives the recycled buffer.
|
||||
channel: duplex::mpsc::Duplex<FlushRequest<Buf>, FullSlice<Buf>>,
|
||||
/// Join handle for the background flush task.
|
||||
join_handle: tokio::task::JoinHandle<std::io::Result<Arc<W>>>,
|
||||
}
|
||||
|
||||
struct FlushRequest<Buf> {
|
||||
slice: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
#[cfg(test)]
|
||||
ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>,
|
||||
#[cfg(test)]
|
||||
done_flush_tx: tokio::sync::oneshot::Sender<()>,
|
||||
}
|
||||
|
||||
/// Constructs a request and a control object for a new flush operation.
|
||||
#[cfg(not(test))]
|
||||
fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
|
||||
let request = FlushRequest { slice, offset };
|
||||
let control = FlushControl::untracked();
|
||||
|
||||
(request, control)
|
||||
}
|
||||
|
||||
/// Constructs a request and a control object for a new flush operation.
|
||||
#[cfg(test)]
|
||||
fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
|
||||
let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel();
|
||||
let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel();
|
||||
let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx);
|
||||
|
||||
let request = FlushRequest {
|
||||
slice,
|
||||
offset,
|
||||
ready_to_flush_rx,
|
||||
done_flush_tx,
|
||||
};
|
||||
(request, control)
|
||||
}
|
||||
|
||||
/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior.
|
||||
#[cfg(test)]
|
||||
pub(crate) struct FlushControl {
|
||||
not_started: FlushNotStarted,
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(crate) struct FlushControl;
|
||||
|
||||
impl FlushControl {
|
||||
#[cfg(test)]
|
||||
fn not_started(
|
||||
ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
|
||||
done_flush_rx: tokio::sync::oneshot::Receiver<()>,
|
||||
) -> Self {
|
||||
FlushControl {
|
||||
not_started: FlushNotStarted {
|
||||
ready_to_flush_tx,
|
||||
done_flush_rx,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
fn untracked() -> Self {
|
||||
FlushControl
|
||||
}
|
||||
|
||||
/// In tests, turn flush control into a not started state.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn into_not_started(self) -> FlushNotStarted {
|
||||
self.not_started
|
||||
}
|
||||
|
||||
/// Release control to the submitted buffer.
|
||||
///
|
||||
/// In a `cfg(test)` environment, the buffer is guaranteed to be flushed to disk after [`FlushControl::release`] finishes execution.
|
||||
pub async fn release(self) {
|
||||
#[cfg(test)]
|
||||
{
|
||||
self.not_started
|
||||
.ready_to_flush()
|
||||
.wait_until_flush_is_done()
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
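
A hedged sketch of how a unit test might use the control handle returned by the write path; `writer` is assumed to be a `BufferedWriter` from the previous file and `ctx` a test request context.

    // Sketch (test-only flow): hold the flush back, then let it proceed and wait for it.
    let (_len, control) = writer
        .write_buffered_borrowed_controlled(b"enough bytes to fill the buffer", ctx)
        .await?;
    if let Some(control) = control {
        let not_started = control.into_not_started(); // cfg(test) API
        not_started
            .ready_to_flush()            // signal the background task to start writing
            .wait_until_flush_is_done()  // block until the write hit disk
            .await;
    }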
|
||||
|
||||
impl<Buf, W> FlushHandle<Buf, W>
|
||||
where
|
||||
Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
|
||||
W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
|
||||
{
|
||||
/// Spawns a new background flush task and obtains a handle.
|
||||
///
|
||||
/// Note: using a background task means we do not need to explicitly maintain a queue of buffers.
|
||||
pub fn spawn_new<B>(
|
||||
file: Arc<W>,
|
||||
buf: B,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: RequestContext,
|
||||
) -> Self
|
||||
where
|
||||
B: Buffer<IoBuf = Buf> + Send + 'static,
|
||||
{
|
||||
// It is fine to buffer up to only 1 message: we only have 1 message in flight at a time.
|
||||
let (front, back) = duplex::mpsc::channel(1);
|
||||
|
||||
let join_handle = tokio::spawn(async move {
|
||||
FlushBackgroundTask::new(back, file, gate_guard, ctx)
|
||||
.run(buf.flush())
|
||||
.await
|
||||
});
|
||||
|
||||
FlushHandle {
|
||||
inner: Some(FlushHandleInner {
|
||||
channel: front,
|
||||
join_handle,
|
||||
}),
|
||||
maybe_flushed: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Submits a buffer to be flushed in the background task.
|
||||
/// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged.
|
||||
/// The buffer is also saved in `Self::maybe_flushed` so that the not-yet-durable tail
|
||||
/// can continue to be served from memory while the flush is in progress.
|
||||
pub async fn flush<B>(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)>
|
||||
where
|
||||
B: Buffer<IoBuf = Buf> + Send + 'static,
|
||||
{
|
||||
let slice = buf.flush();
|
||||
|
||||
// Saves a buffer for read while flushing. This also removes reference to the old buffer.
|
||||
self.maybe_flushed = Some(slice.cheap_clone());
|
||||
|
||||
let (request, flush_control) = new_flush_op(slice, offset);
|
||||
|
||||
// Submits the buffer to the background task.
|
||||
let submit = self.inner_mut().channel.send(request).await;
|
||||
if submit.is_err() {
|
||||
return self.handle_error().await;
|
||||
}
|
||||
|
||||
// Wait for an available buffer from the background flush task.
|
||||
// This is the BACKPRESSURE mechanism: if the flush task can't keep up,
|
||||
// then the write path will eventually wait for it here.
|
||||
let Some(recycled) = self.inner_mut().channel.recv().await else {
|
||||
return self.handle_error().await;
|
||||
};
|
||||
|
||||
// The only other place that could hold a reference to the recycled buffer
|
||||
// is in `Self::maybe_flushed`, but we have already replace it with the new buffer.
|
||||
let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner());
|
||||
Ok((recycled, flush_control))
|
||||
}
|
||||
|
||||
async fn handle_error<T>(&mut self) -> std::io::Result<T> {
|
||||
Err(self
|
||||
.shutdown()
|
||||
.await
|
||||
.expect_err("flush task only disconnects duplex if it exits with an error"))
|
||||
}
|
||||
|
||||
/// Cleans up the channel, join the flush task.
|
||||
pub async fn shutdown(&mut self) -> std::io::Result<Arc<W>> {
|
||||
let handle = self
|
||||
.inner
|
||||
.take()
|
||||
.expect("must not use after we returned an error");
|
||||
drop(handle.channel.tx);
|
||||
handle.join_handle.await.unwrap()
|
||||
}
|
||||
|
||||
/// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`.
|
||||
/// This only happens if the handle is used after an error.
|
||||
fn inner_mut(&mut self) -> &mut FlushHandleInner<Buf, W> {
|
||||
self.inner
|
||||
.as_mut()
|
||||
.expect("must not use after we returned an error")
|
||||
}
|
||||
}
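
// The BACKPRESSURE comment in `flush` above is the heart of this design: with a
// bounded channel (capacity 1) in each direction, the write path can run ahead
// of the flusher by at most one buffer before it must wait for a recycled one.
// A minimal, self-contained sketch of that round trip, using plain
// `tokio::sync::mpsc` channels in place of the internal duplex type; the names
// here are hypothetical, not the pageserver's API:
async fn backpressure_sketch() {
    // Requests flow handle -> task; recycled buffers flow task -> handle.
    let (req_tx, mut req_rx) = tokio::sync::mpsc::channel::<(Vec<u8>, u64)>(1);
    let (buf_tx, mut buf_rx) = tokio::sync::mpsc::channel::<Vec<u8>>(1);

    let flusher = tokio::spawn(async move {
        while let Some((mut buf, offset)) = req_rx.recv().await {
            // Stand-in for `write_all_at`: pretend we wrote `buf` at `offset`.
            let _ = offset;
            buf.clear();
            // Hand the now-empty buffer back for reuse; stop if the handle is gone.
            if buf_tx.send(buf).await.is_err() {
                break;
            }
        }
    });

    let mut buf = Vec::with_capacity(8192);
    for (i, chunk) in [b"foo".as_slice(), b"bar".as_slice()].into_iter().enumerate() {
        buf.extend_from_slice(chunk);
        req_tx.send((buf, i as u64 * 8192)).await.unwrap();
        // Backpressure: we cannot fill the next buffer until the flusher
        // recycles one back to us.
        buf = buf_rx.recv().await.unwrap();
    }
    assert!(buf.is_empty()); // the flusher cleared it before recycling

    drop(req_tx);
    flusher.await.unwrap();
}
// e.g. drive `backpressure_sketch()` from a `#[tokio::test]` or `#[tokio::main]` entry point.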

/// A background task for flushing data to disk.
pub struct FlushBackgroundTask<Buf, W> {
    /// A bi-directional channel that receives (buffer, offset) for writes,
    /// and sends back recycled buffers.
    channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
    /// A writer for persisting data to disk.
    writer: Arc<W>,
    ctx: RequestContext,
    /// Prevent timeline from shutting down until the flush background task finishes flushing all remaining buffers to disk.
    _gate_guard: utils::sync::gate::GateGuard,
}

impl<Buf, W> FlushBackgroundTask<Buf, W>
where
    Buf: IoBufAligned + Send + Sync,
    W: OwnedAsyncWriter + Sync + 'static,
{
    /// Creates a new background flush task.
    fn new(
        channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
        file: Arc<W>,
        gate_guard: utils::sync::gate::GateGuard,
        ctx: RequestContext,
    ) -> Self {
        FlushBackgroundTask {
            channel,
            writer: file,
            _gate_guard: gate_guard,
            ctx,
        }
    }

    /// Runs the background flush task.
    /// The passed-in slice is immediately sent back to the flush handle through the duplex channel.
    async fn run(mut self, slice: FullSlice<Buf>) -> std::io::Result<Arc<W>> {
        // Sends the extra buffer back to the handle.
        self.channel.send(slice).await.map_err(|_| {
            std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early")
        })?;

        // Exit condition: channel is closed and there is no remaining buffer to be flushed.
        while let Some(request) = self.channel.recv().await {
            #[cfg(test)]
            {
                // In tests, wait for control to signal that we are ready to flush.
                if request.ready_to_flush_rx.await.is_err() {
                    tracing::debug!("control dropped");
                }
            }

            // Write slice to disk at `offset`.
            let slice = self
                .writer
                .write_all_at(request.slice, request.offset, &self.ctx)
                .await?;

            #[cfg(test)]
            {
                // In tests, tell control we are done flushing the buffer.
                if request.done_flush_tx.send(()).is_err() {
                    tracing::debug!("control dropped");
                }
            }

            // Sends the buffer back to the handle for reuse. The handle is in charge of clearing the buffer.
            if self.channel.send(slice).await.is_err() {
                // Although the channel is closed, we still need to finish flushing the remaining buffers.
                continue;
            }
        }

        Ok(self.writer)
    }
}
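
// Shutdown of this task is just "close the request side and join": once the
// handle drops its sender, the `while let Some(..)` loop above drains whatever
// is still queued and then falls through to return the writer, handing file
// ownership back to the caller. A small, self-contained model of that
// drain-then-return behaviour (hypothetical names, plain `tokio::sync::mpsc`
// instead of the internal duplex):
async fn drain_on_close_sketch() {
    let (tx, mut rx) = tokio::sync::mpsc::channel::<Vec<u8>>(1);

    let task = tokio::spawn(async move {
        // Stand-in for the writer whose ownership is returned on exit.
        let mut writer: Vec<u8> = Vec::new();
        // Keeps receiving until the channel is closed *and* empty, so every
        // submitted buffer is flushed even after the sender is gone.
        while let Some(buf) = rx.recv().await {
            writer.extend_from_slice(&buf);
        }
        writer
    });

    tx.send(b"first".to_vec()).await.unwrap();
    tx.send(b"second".to_vec()).await.unwrap();
    drop(tx); // "shutdown": no more requests will be submitted

    // Joining returns the writer with all buffers applied.
    let writer = task.await.unwrap();
    assert_eq!(writer, b"firstsecond".to_vec());
}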

#[cfg(test)]
pub(crate) struct FlushNotStarted {
    ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
}

#[cfg(test)]
pub(crate) struct FlushInProgress {
    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
}

#[cfg(test)]
pub(crate) struct FlushDone;

#[cfg(test)]
impl FlushNotStarted {
    /// Signals the background task that the buffer is ready to be flushed to disk.
    pub fn ready_to_flush(self) -> FlushInProgress {
        self.ready_to_flush_tx
            .send(())
            .map(|_| FlushInProgress {
                done_flush_rx: self.done_flush_rx,
            })
            .unwrap()
    }
}

#[cfg(test)]
impl FlushInProgress {
    /// Waits until the background flush is done.
    pub async fn wait_until_flush_is_done(self) -> FlushDone {
        self.done_flush_rx.await.unwrap();
        FlushDone
    }
}
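
// The three `#[cfg(test)]` structs above form a small typestate machine: each
// state owns exactly the channel ends it still needs, so calling things out of
// order ("wait before signalling ready", "wait twice") simply does not compile.
// A minimal, self-contained sketch of the same idea with hypothetical names:
struct NotStarted {
    ready_tx: tokio::sync::oneshot::Sender<()>,
    done_rx: tokio::sync::oneshot::Receiver<()>,
}

struct InProgress {
    done_rx: tokio::sync::oneshot::Receiver<()>,
}

struct Done;

impl NotStarted {
    fn ready_to_flush(self) -> InProgress {
        // Signal the flusher side; if it already went away we still advance.
        let _ = self.ready_tx.send(());
        InProgress {
            done_rx: self.done_rx,
        }
    }
}

impl InProgress {
    async fn wait_until_done(self) -> Done {
        // Resolves once the flusher reports completion (or has been dropped).
        let _ = self.done_rx.await;
        Done
    }
}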

@@ -582,18 +582,21 @@ impl WalIngest
            forknum: FSM_FORKNUM,
        };

        // Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
        // and instead of digging in the FSM bitmap format we just clear the whole page.
        let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
        let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
        if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
            // Tail of last remaining FSM page has to be zeroed.
            // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
        if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
            && self
                .shard
                .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
        {
            modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
            fsm_physical_page_no += 1;
        }
        // TODO: re-examine the None case here wrt. sharding; should we error?
        // Truncate this shard's view of the FSM relation size, if it even has one.
        let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
        if nblocks > fsm_physical_page_no {
            // check if something to do: FSM is larger than truncate position
            self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
                .await?;
        }
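
// The `is_key_local` gate above is the general rule for sharded WAL ingest:
// every page maps to exactly one shard, a shard only writes pages it owns
// (here: the zeroed FSM tail page), and each shard truncates only its own
// view of the relation size. As a rough, hypothetical model of block-striped
// ownership (the pageserver's real key-to-shard mapping is more involved):
fn owns_block(shard_number: u32, shard_count: u32, stripe_size: u32, blkno: u32) -> bool {
    if shard_count <= 1 {
        return true; // unsharded tenants own every page
    }
    (blkno / stripe_size) % shard_count == shard_number
}
// e.g. with shard_count = 4 and stripe_size = 32768, one shard owns blocks
// 0..32768, the next owns 32768..65536, and so on, wrapping around.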

@@ -617,7 +620,7 @@ impl WalIngest
        // tail bits in the last remaining map page, representing truncated heap
        // blocks, need to be cleared. This is not only tidy, but also necessary
        // because we don't get a chance to clear the bits if the heap is extended
        // again.
        // again. Only do this on the shard that owns the page.
        if (trunc_byte != 0 || trunc_offs != 0)
            && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
        {
@@ -631,10 +634,9 @@ impl WalIngest
            )?;
            vm_page_no += 1;
        }
        // TODO: re-examine the None case here wrt. sharding; should we error?
        // Truncate this shard's view of the VM relation size, if it even has one.
        let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
        if nblocks > vm_page_no {
            // check if something to do: VM is larger than truncate position
            self.put_rel_truncation(modification, rel, vm_page_no, ctx)
                .await?;
        }

@@ -1392,6 +1394,10 @@ impl WalIngest
        img: Bytes,
        ctx: &RequestContext,
    ) -> Result<()> {
        if !self.shard.is_shard_zero() {
            return Ok(());
        }

        self.handle_slru_extend(modification, kind, segno, blknum, ctx)
            .await?;
        modification.put_slru_page_image(kind, segno, blknum, img)?;

@@ -610,6 +610,9 @@ prefetch_read(PrefetchRequest *slot)
{
    NeonResponse *response;
    MemoryContext old;
    BufferTag   buftag;
    shardno_t   shard_no;
    uint64      my_ring_index;

    Assert(slot->status == PRFS_REQUESTED);
    Assert(slot->response == NULL);
@@ -623,11 +626,29 @@ prefetch_read(PrefetchRequest *slot)
                       slot->status, slot->response,
                       (long)slot->my_ring_index, (long)MyPState->ring_receive);

    /*
     * Copy the request info so that if an error happens and the prefetch
     * queue is flushed during the receive call, we can print the original
     * values in the error message.
     */
    buftag = slot->buftag;
    shard_no = slot->shard_no;
    my_ring_index = slot->my_ring_index;

    old = MemoryContextSwitchTo(MyPState->errctx);
    response = (NeonResponse *) page_server->receive(slot->shard_no);
    response = (NeonResponse *) page_server->receive(shard_no);
    MemoryContextSwitchTo(old);
    if (response)
    {
        /* The slot should still be valid */
        if (slot->status != PRFS_REQUESTED ||
            slot->response != NULL ||
            slot->my_ring_index != MyPState->ring_receive)
            neon_shard_log(shard_no, ERROR,
                           "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
                           slot->status, slot->response,
                           (long) slot->my_ring_index, (long) MyPState->ring_receive);

        /* update prefetch state */
        MyPState->n_responses_buffered += 1;
        MyPState->n_requests_inflight -= 1;
@@ -642,11 +663,15 @@ prefetch_read(PrefetchRequest *slot)
    }
    else
    {
        neon_shard_log(slot->shard_no, LOG,
        /*
         * Note: The slot might no longer be valid, if the connection was lost
         * and the prefetch queue was flushed during the receive call.
         */
        neon_shard_log(shard_no, LOG,
                       "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
                       (long)slot->my_ring_index,
                       RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
                       slot->buftag.forkNum, slot->buftag.blockNum);
                       (long) my_ring_index,
                       RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
                       buftag.forkNum, buftag.blockNum);
        return false;
    }
}