mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-29 19:10:38 +00:00
Compare commits
20 Commits
release-73
...
problame/l
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5c7cb815c3 | ||
|
|
dd722fdaf6 | ||
|
|
0f6d3429cd | ||
|
|
2052b8b98d | ||
|
|
e3481bfcae | ||
|
|
0dd2e0d744 | ||
|
|
abef267bab | ||
|
|
89838e46bf | ||
|
|
47c4c33e0e | ||
|
|
1ce78b39fe | ||
|
|
43c47e684f | ||
|
|
c89defed85 | ||
|
|
6e51750e05 | ||
|
|
e59422eb77 | ||
|
|
a67b822c24 | ||
|
|
99cc5323b6 | ||
|
|
9efaeb871a | ||
|
|
5af0f228ed | ||
|
|
fe9417c98c | ||
|
|
b89bd691f6 |
4
.github/actionlint.yml
vendored
4
.github/actionlint.yml
vendored
@@ -21,7 +21,3 @@ config-variables:
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
- DEV_AWS_OIDC_ROLE_ARN
|
||||
- BENCHMARK_INGEST_TARGET_PROJECTID
|
||||
- PGREGRESS_PG16_PROJECT_ID
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- SLACK_ON_CALL_QA_STAGING_STREAM
|
||||
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
|
||||
|
||||
11
.github/actions/download/action.yml
vendored
11
.github/actions/download/action.yml
vendored
@@ -15,21 +15,10 @@ inputs:
|
||||
prefix:
|
||||
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
required: false
|
||||
aws_oicd_role_arn:
|
||||
description: "the OIDC role arn for aws auth"
|
||||
required: false
|
||||
default: ""
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ inputs.aws_oicd_role_arn }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Download artifact
|
||||
id: download-artifact
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
@@ -62,7 +62,6 @@ runs:
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
- name: Download Neon binaries for the previous release
|
||||
if: inputs.build_type != 'remote'
|
||||
@@ -71,7 +70,6 @@ runs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon-previous
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
- name: Download compatibility snapshot
|
||||
if: inputs.build_type != 'remote'
|
||||
@@ -83,7 +81,6 @@ runs:
|
||||
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
||||
# shouldn't fail the whole job. Only relevant test should fail.
|
||||
skip-if-does-not-exist: true
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
- name: Checkout
|
||||
if: inputs.needs_postgres_source == 'true'
|
||||
@@ -221,7 +218,6 @@ runs:
|
||||
# The lack of compatibility snapshot shouldn't fail the job
|
||||
# (for example if we didn't run the test for non build-and-test workflow)
|
||||
skip-if-does-not-exist: true
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
- name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
|
||||
if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
|
||||
@@ -236,4 +232,3 @@ runs:
|
||||
with:
|
||||
report-dir: /tmp/test_output/allure/results
|
||||
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
@@ -14,11 +14,9 @@ runs:
|
||||
name: coverage-data-artifact
|
||||
path: /tmp/coverage
|
||||
skip-if-does-not-exist: true # skip if there's no previous coverage to download
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
- name: Upload coverage data
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: coverage-data-artifact
|
||||
path: /tmp/coverage
|
||||
aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }}
|
||||
|
||||
11
.github/actions/upload/action.yml
vendored
11
.github/actions/upload/action.yml
vendored
@@ -14,10 +14,6 @@ inputs:
|
||||
prefix:
|
||||
description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
required: false
|
||||
aws_oicd_role_arn:
|
||||
description: "the OIDC role arn for aws auth"
|
||||
required: false
|
||||
default: ""
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
@@ -57,13 +53,6 @@ runs:
|
||||
|
||||
echo 'SKIPPED=false' >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ inputs.aws_oicd_role_arn }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Upload artifact
|
||||
if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
@@ -70,7 +70,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
|
||||
- name: Create benchmark_restore_status table if it does not exist
|
||||
|
||||
20
.github/workflows/_build-and-test-locally.yml
vendored
20
.github/workflows/_build-and-test-locally.yml
vendored
@@ -31,13 +31,12 @@ defaults:
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
jobs:
|
||||
build-neon:
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
contents: read
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
@@ -206,13 +205,6 @@ jobs:
|
||||
done
|
||||
fi
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
@@ -264,7 +256,6 @@ jobs:
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
|
||||
path: /tmp/neon
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
@@ -274,10 +265,6 @@ jobs:
|
||||
regress-tests:
|
||||
# Don't run regression tests on debug arm64 builds
|
||||
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
contents: read
|
||||
statuses: write
|
||||
needs: [ build-neon ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
@@ -296,7 +283,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Pytest regression tests
|
||||
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
|
||||
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
with:
|
||||
@@ -308,7 +295,6 @@ jobs:
|
||||
real_s3_region: eu-central-1
|
||||
rerun_failed: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
|
||||
6
.github/workflows/benchmarking.yml
vendored
6
.github/workflows/benchmarking.yml
vendored
@@ -105,7 +105,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
@@ -205,7 +204,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Run Logical Replication benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -407,7 +405,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
@@ -711,7 +708,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
@@ -822,7 +818,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
@@ -931,7 +926,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
|
||||
199
.github/workflows/build_and_test.yml
vendored
199
.github/workflows/build_and_test.yml
vendored
@@ -21,6 +21,8 @@ concurrency:
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
|
||||
@@ -253,15 +255,15 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
|
||||
# Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled.
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
|
||||
# run without LFC on v17 release only
|
||||
test-cfg: |
|
||||
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v15", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v16", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "with-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "without-lfc"}]'
|
||||
|| '[{"pg_version":"v17", "lfc_state": "without-lfc" }]' }}
|
||||
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v15", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v16", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "without-lfc"},
|
||||
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|
||||
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
|
||||
secrets: inherit
|
||||
|
||||
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
|
||||
@@ -358,11 +360,6 @@ jobs:
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
|
||||
@@ -383,7 +380,6 @@ jobs:
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
@@ -415,10 +411,6 @@ jobs:
|
||||
coverage-report:
|
||||
if: ${{ !startsWith(github.ref_name, 'release') }}
|
||||
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
@@ -445,14 +437,12 @@ jobs:
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Get coverage artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: coverage-data-artifact
|
||||
path: /tmp/coverage
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Merge coverage data
|
||||
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
|
||||
@@ -583,10 +573,6 @@ jobs:
|
||||
neon-image:
|
||||
needs: [ neon-image-arch, tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
@@ -601,15 +587,11 @@ jobs:
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
|
||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Login to Amazon Dev ECR
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch image to ECR
|
||||
run: |
|
||||
@@ -618,10 +600,6 @@ jobs:
|
||||
|
||||
compute-node-image-arch:
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: read
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -662,15 +640,11 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Login to Amazon Dev ECR
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
@@ -743,10 +717,6 @@ jobs:
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: read
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@@ -791,15 +761,11 @@ jobs:
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Login to Amazon Dev ECR
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
|
||||
run: |
|
||||
@@ -829,7 +795,7 @@ jobs:
|
||||
- pg: v17
|
||||
debian: bookworm
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.37.1
|
||||
VM_BUILDER_VERSION: v0.35.0
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -924,9 +890,7 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: read
|
||||
id-token: write # for `aws-actions/configure-aws-credentials`
|
||||
|
||||
env:
|
||||
VERSIONS: v14 v15 v16 v17
|
||||
@@ -937,15 +901,12 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
- name: Login to dev ECR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Login to Amazon Dev ECR
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Copy vm-compute-node images to ECR
|
||||
run: |
|
||||
@@ -1099,79 +1060,12 @@ jobs:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
|
||||
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
|
||||
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Create git tag and GitHub release
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
retries: 5
|
||||
script: |
|
||||
const tag = "${{ needs.tag.outputs.build-tag }}";
|
||||
|
||||
try {
|
||||
const existingRef = await github.rest.git.getRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: `tags/${tag}`,
|
||||
});
|
||||
|
||||
if (existingRef.data.object.sha !== context.sha) {
|
||||
throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
|
||||
}
|
||||
|
||||
console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
|
||||
} catch (error) {
|
||||
if (error.status !== 404) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
console.log(`Tag ${tag} does not exist. Creating it...`);
|
||||
await github.rest.git.createRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: `refs/tags/${tag}`,
|
||||
sha: context.sha,
|
||||
});
|
||||
console.log(`Tag ${tag} created successfully.`);
|
||||
}
|
||||
|
||||
// TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
|
||||
if (context.ref !== 'refs/heads/release') {
|
||||
console.log(`GitHub release skipped for ${context.ref}.`);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const existingRelease = await github.rest.repos.getReleaseByTag({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
tag: tag,
|
||||
});
|
||||
|
||||
console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
|
||||
} catch (error) {
|
||||
if (error.status !== 404) {
|
||||
throw error;
|
||||
}
|
||||
|
||||
console.log(`Release for tag ${tag} does not exist. Creating it...`);
|
||||
await github.rest.repos.createRelease({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
tag_name: tag,
|
||||
generate_release_notes: true,
|
||||
});
|
||||
console.log(`Release for tag ${tag} created successfully.`);
|
||||
}
|
||||
|
||||
- name: Trigger deploy workflow
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
@@ -1221,13 +1115,38 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create git tag
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
await github.rest.git.createRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
|
||||
sha: context.sha,
|
||||
})
|
||||
|
||||
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
|
||||
- name: Create GitHub release
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
await github.rest.repos.createRelease({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
tag_name: "${{ needs.tag.outputs.build-tag }}",
|
||||
generate_release_notes: true,
|
||||
})
|
||||
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
needs: [ deploy ]
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: read
|
||||
# `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
|
||||
if: github.ref_name == 'release' && !failure() && !cancelled()
|
||||
|
||||
|
||||
42
.github/workflows/cloud-regress.yml
vendored
42
.github/workflows/cloud-regress.yml
vendored
@@ -19,19 +19,15 @@ concurrency:
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
|
||||
jobs:
|
||||
regress:
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg-version: [16, 17]
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
runs-on: us-east-2
|
||||
container:
|
||||
@@ -44,11 +40,9 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Patch the test
|
||||
env:
|
||||
PG_VERSION: ${{matrix.pg-version}}
|
||||
run: |
|
||||
cd "vendor/postgres-v${PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"
|
||||
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
|
||||
|
||||
- name: Generate a random password
|
||||
id: pwgen
|
||||
@@ -61,9 +55,8 @@ jobs:
|
||||
- name: Change tests according to the generated password
|
||||
env:
|
||||
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
|
||||
PG_VERSION: ${{matrix.pg-version}}
|
||||
run: |
|
||||
cd vendor/postgres-v"${PG_VERSION}"/src/test/regress
|
||||
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
|
||||
for fname in sql/*.sql expected/*.out; do
|
||||
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
|
||||
done
|
||||
@@ -79,44 +72,27 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create a new branch
|
||||
id: create-branch
|
||||
uses: ./.github/actions/neon-branch-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
|
||||
|
||||
- name: Run the regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: cloud_regress
|
||||
pg_version: ${{matrix.pg-version}}
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
extra_params: -m remote_cluster
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}
|
||||
|
||||
- name: Delete branch
|
||||
uses: ./.github/actions/neon-branch-delete
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
|
||||
branch_id: ${{steps.create-branch.outputs.branch_id}}
|
||||
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
|
||||
channel-id: "C033QLM5P7D" # on-call-staging-stream
|
||||
slack-message: |
|
||||
Periodic pg_regress on staging: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
|
||||
1
.github/workflows/ingest_benchmark.yml
vendored
1
.github/workflows/ingest_benchmark.yml
vendored
@@ -64,7 +64,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
if: ${{ matrix.target_project == 'new_empty_project' }}
|
||||
|
||||
13
.github/workflows/neon_extra_builds.yml
vendored
13
.github/workflows/neon_extra_builds.yml
vendored
@@ -143,10 +143,6 @@ jobs:
|
||||
|
||||
gather-rust-build-stats:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write
|
||||
contents: write
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
@@ -181,18 +177,13 @@ jobs:
|
||||
- name: Produce the build stats
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Upload the build stats
|
||||
id: upload-stats
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
run: |
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html
|
||||
aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/"
|
||||
|
||||
25
.github/workflows/periodic_pagebench.yml
vendored
25
.github/workflows/periodic_pagebench.yml
vendored
@@ -21,9 +21,6 @@ defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: false
|
||||
@@ -41,6 +38,8 @@ jobs:
|
||||
env:
|
||||
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
|
||||
RUN_ID: ${{ github.run_id }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
|
||||
AWS_DEFAULT_REGION : "eu-central-1"
|
||||
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
|
||||
steps:
|
||||
@@ -51,13 +50,6 @@ jobs:
|
||||
- name: Show my own (github runner) external IP address - usefull for IP allowlisting
|
||||
run: curl https://ifconfig.me
|
||||
|
||||
- name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Start EC2 instance and wait for the instance to boot up
|
||||
run: |
|
||||
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
|
||||
@@ -132,10 +124,11 @@ jobs:
|
||||
cat "test_log_${GITHUB_RUN_ID}"
|
||||
|
||||
- name: Create Allure report
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
@@ -155,14 +148,6 @@ jobs:
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d ''
|
||||
|
||||
- name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Stop EC2 instance and wait for the instance to be stopped
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
|
||||
10
.github/workflows/pg-clients.yml
vendored
10
.github/workflows/pg-clients.yml
vendored
@@ -25,13 +25,11 @@ defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
permissions:
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
statuses: write # require for posting a status update
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: neon-captest-new
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
AWS_DEFAULT_REGION: eu-central-1
|
||||
|
||||
jobs:
|
||||
@@ -96,7 +94,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
@@ -129,7 +126,6 @@ jobs:
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
@@ -163,7 +159,6 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
@@ -196,7 +191,6 @@ jobs:
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
|
||||
14
.github/workflows/pin-build-tools-image.yml
vendored
14
.github/workflows/pin-build-tools-image.yml
vendored
@@ -67,7 +67,7 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
id-token: write # for `azure/login` and aws auth
|
||||
id-token: write # for `azure/login`
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
@@ -75,15 +75,11 @@ jobs:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 3600
|
||||
|
||||
- name: Login to Amazon Dev ECR
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Azure login
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
|
||||
1
.github/workflows/pre-merge-checks.yml
vendored
1
.github/workflows/pre-merge-checks.yml
vendored
@@ -63,7 +63,6 @@ jobs:
|
||||
if: always()
|
||||
permissions:
|
||||
statuses: write # for `github.repos.createCommitStatus(...)`
|
||||
contents: write
|
||||
needs:
|
||||
- get-changed-files
|
||||
- check-codestyle-python
|
||||
|
||||
4
.github/workflows/release.yml
vendored
4
.github/workflows/release.yml
vendored
@@ -3,7 +3,7 @@ name: Create Release Branch
|
||||
on:
|
||||
schedule:
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * FRI' # Storage release
|
||||
- cron: '0 6 * * MON' # Storage release
|
||||
- cron: '0 6 * * THU' # Proxy release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
@@ -29,7 +29,7 @@ defaults:
|
||||
|
||||
jobs:
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}
|
||||
if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
515
Cargo.lock
generated
515
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
10
Cargo.toml
10
Cargo.toml
@@ -51,6 +51,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
|
||||
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -212,12 +216,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
|
||||
## Azure SDK crates
|
||||
azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
|
||||
azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
|
||||
@@ -19,10 +19,3 @@ max_prepared_statements=0
|
||||
admin_users=postgres
|
||||
unix_socket_dir=/tmp/
|
||||
unix_socket_mode=0777
|
||||
|
||||
;; Disable connection logging. It produces a lot of logs that no one looks at,
|
||||
;; and we can get similar log entries from the proxy too. We had incidents in
|
||||
;; the past where the logging significantly stressed the log device or pgbouncer
|
||||
;; itself.
|
||||
log_connections=0
|
||||
log_disconnections=0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -246,48 +246,47 @@ fn try_spec_from_cli(
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
// First, try to get cluster spec from the cli argument
|
||||
if let Some(spec_json) = spec_json {
|
||||
info!("got spec from cli argument {}", spec_json);
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_str(spec_json)?),
|
||||
live_config_allowed: false,
|
||||
});
|
||||
}
|
||||
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(spec_path) = spec_path {
|
||||
let file = File::open(Path::new(spec_path))?;
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_reader(file)?),
|
||||
live_config_allowed: true,
|
||||
});
|
||||
}
|
||||
|
||||
let Some(compute_id) = compute_id else {
|
||||
panic!(
|
||||
"compute spec should be provided by one of the following ways: \
|
||||
--spec OR --spec-path OR --control-plane-uri and --compute-id"
|
||||
);
|
||||
};
|
||||
let Some(control_plane_uri) = control_plane_uri else {
|
||||
panic!("must specify both --control-plane-uri and --compute-id or none");
|
||||
};
|
||||
|
||||
match get_spec_from_control_plane(control_plane_uri, compute_id) {
|
||||
Ok(spec) => Ok(CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed: true,
|
||||
}),
|
||||
Err(e) => {
|
||||
error!(
|
||||
"cannot get response from control plane: {}\n\
|
||||
neither spec nor confirmation that compute is in the Empty state was received",
|
||||
e
|
||||
);
|
||||
Err(e)
|
||||
let spec;
|
||||
let mut live_config_allowed = false;
|
||||
match spec_json {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => {
|
||||
info!("got spec from cli argument {}", json);
|
||||
spec = Some(serde_json::from_str(json)?);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
spec = Some(serde_json::from_reader(file)?);
|
||||
live_config_allowed = true;
|
||||
} else if let Some(id) = compute_id {
|
||||
if let Some(cp_base) = control_plane_uri {
|
||||
live_config_allowed = true;
|
||||
spec = match get_spec_from_control_plane(cp_base, id) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
error!("cannot get response from control plane: {}", e);
|
||||
panic!("neither spec nor confirmation that compute is in the Empty state was received");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
panic!("must specify both --control-plane-uri and --compute-id or none");
|
||||
}
|
||||
} else {
|
||||
panic!(
|
||||
"compute spec should be provided by one of the following ways: \
|
||||
--spec OR --spec-path OR --control-plane-uri and --compute-id"
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
})
|
||||
}
|
||||
|
||||
struct CliSpecParams {
|
||||
|
||||
@@ -537,14 +537,12 @@ components:
|
||||
properties:
|
||||
extname:
|
||||
type: string
|
||||
version:
|
||||
type: string
|
||||
versions:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
n_databases:
|
||||
type: integer
|
||||
owned_by_superuser:
|
||||
type: integer
|
||||
|
||||
SetRoleGrantsRequest:
|
||||
type: object
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use metrics::proto::MetricFamily;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use anyhow::Result;
|
||||
use postgres::{Client, NoTls};
|
||||
@@ -37,77 +38,61 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
/// Connect to every database (see list_dbs above) and get the list of installed extensions.
|
||||
///
|
||||
/// Same extension can be installed in multiple databases with different versions,
|
||||
/// so we report a separate metric (number of databases where it is installed)
|
||||
/// for each extension version.
|
||||
/// we only keep the highest and lowest version across all databases.
|
||||
pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
|
||||
conf.application_name("compute_ctl:get_installed_extensions");
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
let databases: Vec<String> = list_dbs(&mut client)?;
|
||||
|
||||
let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
|
||||
let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
|
||||
for db in databases.iter() {
|
||||
conf.dbname(db);
|
||||
let mut db_client = conf.connect(NoTls)?;
|
||||
let extensions: Vec<(String, String, i32)> = db_client
|
||||
let extensions: Vec<(String, String)> = db_client
|
||||
.query(
|
||||
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
|
||||
"SELECT extname, extversion FROM pg_catalog.pg_extension;",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| {
|
||||
(
|
||||
row.get("extname"),
|
||||
row.get("extversion"),
|
||||
row.get("extowner"),
|
||||
)
|
||||
})
|
||||
.map(|row| (row.get("extname"), row.get("extversion")))
|
||||
.collect();
|
||||
|
||||
for (extname, v, extowner) in extensions.iter() {
|
||||
for (extname, v) in extensions.iter() {
|
||||
let version = v.to_string();
|
||||
|
||||
// check if the extension is owned by superuser
|
||||
// 10 is the oid of superuser
|
||||
let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };
|
||||
// increment the number of databases where the version of extension is installed
|
||||
INSTALLED_EXTENSIONS
|
||||
.with_label_values(&[extname, &version])
|
||||
.inc();
|
||||
|
||||
extensions_map
|
||||
.entry((
|
||||
extname.to_string(),
|
||||
version.clone(),
|
||||
owned_by_superuser.to_string(),
|
||||
))
|
||||
.entry(extname.to_string())
|
||||
.and_modify(|e| {
|
||||
e.versions.insert(version.clone());
|
||||
// count the number of databases where the extension is installed
|
||||
e.n_databases += 1;
|
||||
})
|
||||
.or_insert(InstalledExtension {
|
||||
extname: extname.to_string(),
|
||||
version: version.clone(),
|
||||
versions: HashSet::from([version.clone()]),
|
||||
n_databases: 1,
|
||||
owned_by_superuser: owned_by_superuser.to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
for (key, ext) in extensions_map.iter() {
|
||||
let (extname, version, owned_by_superuser) = key;
|
||||
let n_databases = ext.n_databases as u64;
|
||||
|
||||
INSTALLED_EXTENSIONS
|
||||
.with_label_values(&[extname, version, owned_by_superuser])
|
||||
.set(n_databases);
|
||||
}
|
||||
|
||||
Ok(InstalledExtensions {
|
||||
let res = InstalledExtensions {
|
||||
extensions: extensions_map.into_values().collect(),
|
||||
})
|
||||
};
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"compute_installed_extensions",
|
||||
"Number of databases where the version of extension is installed",
|
||||
&["extension_name", "version", "owned_by_superuser"]
|
||||
&["extension_name", "version"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
@@ -274,7 +274,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
"AWS_PROFILE",
|
||||
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
|
||||
"HOME",
|
||||
|
||||
@@ -810,7 +810,7 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(120))
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.unwrap();
|
||||
let response = client
|
||||
|
||||
@@ -435,7 +435,7 @@ impl PageServerNode {
|
||||
) -> anyhow::Result<()> {
|
||||
let config = Self::parse_config(settings)?;
|
||||
self.http_client
|
||||
.set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
|
||||
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -9,8 +9,8 @@ use pageserver_api::{
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
|
||||
TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -116,19 +116,9 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
},
|
||||
/// Set the pageserver tenant configuration of a tenant: this is the configuration structure
|
||||
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
||||
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
||||
/// Any previous tenant configs are overwritten.
|
||||
SetTenantConfig {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
config: String,
|
||||
},
|
||||
/// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
|
||||
/// provided JSON are unset from the tenant config and all fields with non-null values are set.
|
||||
/// Unspecified fields are not changed.
|
||||
PatchTenantConfig {
|
||||
TenantConfig {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
@@ -559,21 +549,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::SetTenantConfig { tenant_id, config } => {
|
||||
Command::TenantConfig { tenant_id, config } => {
|
||||
let tenant_conf = serde_json::from_str(&config)?;
|
||||
|
||||
vps_client
|
||||
.set_tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: tenant_conf,
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::PatchTenantConfig { tenant_id, config } => {
|
||||
let tenant_conf = serde_json::from_str(&config)?;
|
||||
|
||||
vps_client
|
||||
.patch_tenant_config(&TenantConfigPatchRequest {
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: tenant_conf,
|
||||
})
|
||||
@@ -756,7 +736,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
threshold,
|
||||
} => {
|
||||
vps_client
|
||||
.set_tenant_config(&TenantConfigRequest {
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: TenantConfig {
|
||||
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
|
||||
|
||||
@@ -42,7 +42,6 @@ allow = [
|
||||
"MPL-2.0",
|
||||
"OpenSSL",
|
||||
"Unicode-DFS-2016",
|
||||
"Unicode-3.0",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
exceptions = [
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Display;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
@@ -162,9 +163,8 @@ pub enum ControlPlaneComputeStatus {
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct InstalledExtension {
|
||||
pub extname: String,
|
||||
pub version: String,
|
||||
pub versions: HashSet<String>,
|
||||
pub n_databases: u32, // Number of databases using this extension
|
||||
pub owned_by_superuser: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
|
||||
@@ -75,7 +75,7 @@ pub struct TenantPolicyRequest {
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct AvailabilityZone(pub String);
|
||||
|
||||
impl Display for AvailabilityZone {
|
||||
@@ -245,17 +245,6 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Scheduling policy enables us to selectively disable some automatic actions that the
|
||||
/// controller performs on a tenant shard. This is only set to a non-default value by
|
||||
/// human intervention, and it is reset to the default value (Active) when the tenant's
|
||||
/// placement policy is modified away from Attached.
|
||||
///
|
||||
/// The typical use of a non-Active scheduling policy is one of:
|
||||
/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
|
||||
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
|
||||
///
|
||||
/// If you're not sure which policy to use to pin a shard to its current location, you probably
|
||||
/// want Pause.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum ShardSchedulingPolicy {
|
||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||
|
||||
@@ -24,7 +24,7 @@ pub struct Key {
|
||||
|
||||
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
|
||||
/// a struct of fields.
|
||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
|
||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
pub struct CompactKey(i128);
|
||||
|
||||
/// The storage key size.
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::{
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::serde_as;
|
||||
use utils::{
|
||||
completion,
|
||||
@@ -325,115 +325,6 @@ impl Default for ShardParameters {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Eq, PartialEq)]
|
||||
pub enum FieldPatch<T> {
|
||||
Upsert(T),
|
||||
Remove,
|
||||
#[default]
|
||||
Noop,
|
||||
}
|
||||
|
||||
impl<T> FieldPatch<T> {
|
||||
fn is_noop(&self) -> bool {
|
||||
matches!(self, FieldPatch::Noop)
|
||||
}
|
||||
|
||||
pub fn apply(self, target: &mut Option<T>) {
|
||||
match self {
|
||||
Self::Upsert(v) => *target = Some(v),
|
||||
Self::Remove => *target = None,
|
||||
Self::Noop => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
|
||||
match self {
|
||||
Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
|
||||
Self::Remove => Ok(FieldPatch::<U>::Remove),
|
||||
Self::Noop => Ok(FieldPatch::<U>::Noop),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
Option::deserialize(deserializer).map(|opt| match opt {
|
||||
None => FieldPatch::Remove,
|
||||
Some(val) => FieldPatch::Upsert(val),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Serialize> Serialize for FieldPatch<T> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
match self {
|
||||
FieldPatch::Upsert(val) => serializer.serialize_some(val),
|
||||
FieldPatch::Remove => serializer.serialize_none(),
|
||||
FieldPatch::Noop => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
#[serde(default)]
|
||||
pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub checkpoint_distance: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub checkpoint_timeout: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_target_size: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_threshold: FieldPatch<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_horizon: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_creation_threshold: FieldPatch<usize>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub pitr_interval: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub walreceiver_connect_timeout: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub lagging_wal_timeout: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub eviction_policy: FieldPatch<EvictionPolicy>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub min_resident_size_override: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub heatmap_period: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub lazy_slru_download: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub image_layer_creation_check_threshold: FieldPatch<u8>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub lsn_lease_length: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub lsn_lease_length_for_ts: FieldPatch<String>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub timeline_offloading: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
/// simpler types.
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
@@ -465,107 +356,6 @@ pub struct TenantConfig {
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
}
|
||||
|
||||
impl TenantConfig {
|
||||
pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
|
||||
let Self {
|
||||
mut checkpoint_distance,
|
||||
mut checkpoint_timeout,
|
||||
mut compaction_target_size,
|
||||
mut compaction_period,
|
||||
mut compaction_threshold,
|
||||
mut compaction_algorithm,
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
mut pitr_interval,
|
||||
mut walreceiver_connect_timeout,
|
||||
mut lagging_wal_timeout,
|
||||
mut max_lsn_wal_lag,
|
||||
mut eviction_policy,
|
||||
mut min_resident_size_override,
|
||||
mut evictions_low_residence_duration_metric_threshold,
|
||||
mut heatmap_period,
|
||||
mut lazy_slru_download,
|
||||
mut timeline_get_throttle,
|
||||
mut image_layer_creation_check_threshold,
|
||||
mut lsn_lease_length,
|
||||
mut lsn_lease_length_for_ts,
|
||||
mut timeline_offloading,
|
||||
mut wal_receiver_protocol_override,
|
||||
} = self;
|
||||
|
||||
patch.checkpoint_distance.apply(&mut checkpoint_distance);
|
||||
patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
|
||||
patch
|
||||
.compaction_target_size
|
||||
.apply(&mut compaction_target_size);
|
||||
patch.compaction_period.apply(&mut compaction_period);
|
||||
patch.compaction_threshold.apply(&mut compaction_threshold);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch.gc_horizon.apply(&mut gc_horizon);
|
||||
patch.gc_period.apply(&mut gc_period);
|
||||
patch
|
||||
.image_creation_threshold
|
||||
.apply(&mut image_creation_threshold);
|
||||
patch.pitr_interval.apply(&mut pitr_interval);
|
||||
patch
|
||||
.walreceiver_connect_timeout
|
||||
.apply(&mut walreceiver_connect_timeout);
|
||||
patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
|
||||
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
|
||||
patch.eviction_policy.apply(&mut eviction_policy);
|
||||
patch
|
||||
.min_resident_size_override
|
||||
.apply(&mut min_resident_size_override);
|
||||
patch
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.apply(&mut evictions_low_residence_duration_metric_threshold);
|
||||
patch.heatmap_period.apply(&mut heatmap_period);
|
||||
patch.lazy_slru_download.apply(&mut lazy_slru_download);
|
||||
patch
|
||||
.timeline_get_throttle
|
||||
.apply(&mut timeline_get_throttle);
|
||||
patch
|
||||
.image_layer_creation_check_threshold
|
||||
.apply(&mut image_layer_creation_check_threshold);
|
||||
patch.lsn_lease_length.apply(&mut lsn_lease_length);
|
||||
patch
|
||||
.lsn_lease_length_for_ts
|
||||
.apply(&mut lsn_lease_length_for_ts);
|
||||
patch.timeline_offloading.apply(&mut timeline_offloading);
|
||||
patch
|
||||
.wal_receiver_protocol_override
|
||||
.apply(&mut wal_receiver_protocol_override);
|
||||
|
||||
Self {
|
||||
checkpoint_distance,
|
||||
checkpoint_timeout,
|
||||
compaction_target_size,
|
||||
compaction_period,
|
||||
compaction_threshold,
|
||||
compaction_algorithm,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
pitr_interval,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
eviction_policy,
|
||||
min_resident_size_override,
|
||||
evictions_low_residence_duration_metric_threshold,
|
||||
heatmap_period,
|
||||
lazy_slru_download,
|
||||
timeline_get_throttle,
|
||||
image_layer_creation_check_threshold,
|
||||
lsn_lease_length,
|
||||
lsn_lease_length_for_ts,
|
||||
timeline_offloading,
|
||||
wal_receiver_protocol_override,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage.
|
||||
///
|
||||
/// It can be switched through `switch_aux_file_policy` tenant config.
|
||||
@@ -896,14 +686,6 @@ impl TenantConfigRequest {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantConfigPatchRequest {
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
@@ -1917,45 +1699,4 @@ mod tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tenant_config_patch_request_serde() {
|
||||
let patch_request = TenantConfigPatchRequest {
|
||||
tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
|
||||
config: TenantConfigPatch {
|
||||
checkpoint_distance: FieldPatch::Upsert(42),
|
||||
gc_horizon: FieldPatch::Remove,
|
||||
compaction_threshold: FieldPatch::Noop,
|
||||
..TenantConfigPatch::default()
|
||||
},
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&patch_request).unwrap();
|
||||
|
||||
let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
|
||||
assert_eq!(json, expected);
|
||||
|
||||
let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(decoded.tenant_id, patch_request.tenant_id);
|
||||
assert_eq!(decoded.config, patch_request.config);
|
||||
|
||||
// Now apply the patch to a config to demonstrate semantics
|
||||
|
||||
let base = TenantConfig {
|
||||
checkpoint_distance: Some(28),
|
||||
gc_horizon: Some(100),
|
||||
compaction_target_size: Some(1024),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let expected = TenantConfig {
|
||||
checkpoint_distance: Some(42),
|
||||
gc_horizon: None,
|
||||
..base.clone()
|
||||
};
|
||||
|
||||
let patched = base.apply_patch(decoded.config);
|
||||
|
||||
assert_eq!(patched, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,14 +8,15 @@ use std::io;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, RetryOptions};
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::CopyStatus;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
@@ -75,9 +76,8 @@ impl AzureBlobStorage {
|
||||
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
|
||||
StorageCredentials::access_key(account.clone(), access_key)
|
||||
} else {
|
||||
let token_credential = azure_identity::create_default_credential()
|
||||
.context("trying to obtain Azure default credentials")?;
|
||||
StorageCredentials::token_credential(token_credential)
|
||||
let token_credential = DefaultAzureCredential::default();
|
||||
StorageCredentials::token_credential(Arc::new(token_credential))
|
||||
};
|
||||
|
||||
// we have an outer retry
|
||||
@@ -624,10 +624,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
res
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
super::MAX_KEYS_PER_DELETE_AZURE
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -70,14 +70,7 @@ pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
/// As defined in S3 docs
|
||||
///
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html>
|
||||
pub const MAX_KEYS_PER_DELETE_S3: usize = 1000;
|
||||
|
||||
/// As defined in Azure docs
|
||||
///
|
||||
/// <https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch>
|
||||
pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256;
|
||||
pub const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
@@ -347,14 +340,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Returns the maximum number of keys that a call to [`Self::delete_objects`] can delete without chunking
|
||||
///
|
||||
/// The value returned is only an optimization hint, One can pass larger number of objects to
|
||||
/// `delete_objects` as well.
|
||||
///
|
||||
/// The value is guaranteed to be >= 1.
|
||||
fn max_keys_per_delete(&self) -> usize;
|
||||
|
||||
/// Deletes all objects matching the given prefix.
|
||||
///
|
||||
/// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
|
||||
@@ -548,16 +533,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
/// [`RemoteStorage::max_keys_per_delete`]
|
||||
pub fn max_keys_per_delete(&self) -> usize {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.max_keys_per_delete(),
|
||||
Self::AwsS3(s) => s.max_keys_per_delete(),
|
||||
Self::AzureBlob(s) => s.max_keys_per_delete(),
|
||||
Self::Unreliable(s) => s.max_keys_per_delete(),
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::delete_prefix`]
|
||||
pub async fn delete_prefix(
|
||||
&self,
|
||||
|
||||
@@ -573,10 +573,6 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
super::MAX_KEYS_PER_DELETE_S3
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -48,7 +48,7 @@ use crate::{
|
||||
metrics::{start_counting_cancelled_wait, start_measuring_requests},
|
||||
support::PermitCarrying,
|
||||
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
|
||||
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3,
|
||||
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
@@ -355,7 +355,7 @@ impl S3Bucket {
|
||||
let kind = RequestKind::Delete;
|
||||
let mut cancel = std::pin::pin!(cancel.cancelled());
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let req = self
|
||||
@@ -832,10 +832,6 @@ impl RemoteStorage for S3Bucket {
|
||||
self.delete_oids(&permit, &delete_objects, cancel).await
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
MAX_KEYS_PER_DELETE_S3
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths, cancel).await
|
||||
|
||||
@@ -203,10 +203,6 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
self.inner.max_keys_per_delete()
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -94,8 +94,6 @@ pub mod toml_edit_ext;
|
||||
|
||||
pub mod circuit_breaker;
|
||||
|
||||
pub mod try_rcu;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
|
||||
|
||||
pub trait ArcSwapExt<T> {
|
||||
/// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error.
|
||||
fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
|
||||
where
|
||||
F: FnMut(&T) -> Result<R, E>,
|
||||
R: Into<T>;
|
||||
}
|
||||
|
||||
impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
|
||||
where
|
||||
T: arc_swap::RefCnt,
|
||||
S: arc_swap::strategy::CaS<T>,
|
||||
{
|
||||
fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
|
||||
where
|
||||
F: FnMut(&T) -> Result<R, E>,
|
||||
R: Into<T>,
|
||||
{
|
||||
fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
|
||||
where
|
||||
A: arc_swap::AsRaw<Base>,
|
||||
B: arc_swap::AsRaw<Base>,
|
||||
{
|
||||
let a = a.as_raw();
|
||||
let b = b.as_raw();
|
||||
std::ptr::eq(a, b)
|
||||
}
|
||||
|
||||
let mut cur = self.load();
|
||||
loop {
|
||||
let new = f(&cur)?.into();
|
||||
let prev = self.compare_and_swap(&*cur, new);
|
||||
let swapped = ptr_eq(&*cur, &*prev);
|
||||
if swapped {
|
||||
return Ok(arc_swap::Guard::into_inner(prev));
|
||||
} else {
|
||||
cur = prev;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use arc_swap::ArcSwap;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[test]
|
||||
fn test_try_rcu_success() {
|
||||
let swap = ArcSwap::from(Arc::new(42));
|
||||
|
||||
let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
|
||||
|
||||
assert!(result.is_ok());
|
||||
assert_eq!(**swap.load(), 43);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_try_rcu_error() {
|
||||
let swap = ArcSwap::from(Arc::new(42));
|
||||
|
||||
let result = swap.try_rcu(|value| -> Result<i32, _> {
|
||||
if **value == 42 {
|
||||
Err("err")
|
||||
} else {
|
||||
Ok(**value + 1)
|
||||
}
|
||||
});
|
||||
|
||||
assert!(result.is_err());
|
||||
assert_eq!(result.unwrap_err(), "err");
|
||||
assert_eq!(**swap.load(), 42);
|
||||
}
|
||||
}
|
||||
@@ -37,7 +37,7 @@ message ValueMeta {
|
||||
}
|
||||
|
||||
message CompactKey {
|
||||
uint64 high = 1;
|
||||
uint64 low = 2;
|
||||
int64 high = 1;
|
||||
int64 low = 2;
|
||||
}
|
||||
|
||||
|
||||
@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
|
||||
impl From<CompactKey> for proto::CompactKey {
|
||||
fn from(value: CompactKey) -> Self {
|
||||
proto::CompactKey {
|
||||
high: (value.raw() >> 64) as u64,
|
||||
low: value.raw() as u64,
|
||||
high: (value.raw() >> 64) as i64,
|
||||
low: value.raw() as i64,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -354,64 +354,3 @@ impl From<proto::CompactKey> for CompactKey {
|
||||
(((value.high as i128) << 64) | (value.low as i128)).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compact_key_with_large_relnode() {
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
let inputs = vec![
|
||||
Key {
|
||||
field1: 0,
|
||||
field2: 0x100,
|
||||
field3: 0x200,
|
||||
field4: 0,
|
||||
field5: 0x10,
|
||||
field6: 0x5,
|
||||
},
|
||||
Key {
|
||||
field1: 0,
|
||||
field2: 0x100,
|
||||
field3: 0x200,
|
||||
field4: 0x007FFFFF,
|
||||
field5: 0x10,
|
||||
field6: 0x5,
|
||||
},
|
||||
Key {
|
||||
field1: 0,
|
||||
field2: 0x100,
|
||||
field3: 0x200,
|
||||
field4: 0x00800000,
|
||||
field5: 0x10,
|
||||
field6: 0x5,
|
||||
},
|
||||
Key {
|
||||
field1: 0,
|
||||
field2: 0x100,
|
||||
field3: 0x200,
|
||||
field4: 0x00800001,
|
||||
field5: 0x10,
|
||||
field6: 0x5,
|
||||
},
|
||||
Key {
|
||||
field1: 0,
|
||||
field2: 0xFFFFFFFF,
|
||||
field3: 0xFFFFFFFF,
|
||||
field4: 0xFFFFFFFF,
|
||||
field5: 0x0,
|
||||
field6: 0x0,
|
||||
},
|
||||
];
|
||||
|
||||
for input in inputs {
|
||||
assert!(input.is_valid_key_on_write_path());
|
||||
let compact = input.to_compact();
|
||||
let proto: proto::CompactKey = compact.into();
|
||||
let from_proto: CompactKey = proto.into();
|
||||
|
||||
assert_eq!(
|
||||
compact, from_proto,
|
||||
"Round trip failed for key with relnode={:#x}",
|
||||
input.field4
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
|
||||
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
|
||||
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
println!("cargo:rustc-link-lib=static=walproposer");
|
||||
println!("cargo:rustc-link-lib=static=pgport");
|
||||
println!("cargo:rustc-link-lib=static=pgcommon");
|
||||
println!("cargo:rustc-link-lib=static=walproposer");
|
||||
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
||||
|
||||
// Rebuild crate when libwalproposer.a changes
|
||||
|
||||
@@ -270,18 +270,12 @@ impl Client {
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PATCH, &uri, req).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
|
||||
@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
println!("operating on timeline {}", timeline);
|
||||
|
||||
mgmt_api_client
|
||||
.set_tenant_config(&TenantConfigRequest {
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
@@ -130,8 +131,7 @@ impl Deleter {
|
||||
}
|
||||
|
||||
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
|
||||
let max_keys_per_delete = self.remote_storage.max_keys_per_delete();
|
||||
self.accumulator.reserve(max_keys_per_delete);
|
||||
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
|
||||
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -156,14 +156,14 @@ impl Deleter {
|
||||
|
||||
match msg {
|
||||
DeleterMessage::Delete(mut list) => {
|
||||
while !list.is_empty() || self.accumulator.len() == max_keys_per_delete {
|
||||
if self.accumulator.len() == max_keys_per_delete {
|
||||
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
self.flush().await?;
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
assert_eq!(self.accumulator.len(), 0);
|
||||
}
|
||||
|
||||
let available_slots = max_keys_per_delete - self.accumulator.len();
|
||||
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
|
||||
let take_count = std::cmp::min(available_slots, list.len());
|
||||
for path in list.drain(list.len() - take_count..) {
|
||||
self.accumulator.push(path);
|
||||
|
||||
@@ -767,27 +767,7 @@ paths:
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
description: |
|
||||
Update tenant's config by setting it to the provided value
|
||||
|
||||
Invalid fields in the tenant config will cause the request to be rejected with status 400.
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigRequest"
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
patch:
|
||||
description: |
|
||||
Update tenant's config additively by patching the updated fields provided.
|
||||
Null values unset the field and non-null values upsert it.
|
||||
Update tenant's config.
|
||||
|
||||
Invalid fields in the tenant config will cause the request to be rejected with status 400.
|
||||
requestBody:
|
||||
|
||||
@@ -28,7 +28,6 @@ use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::LsnLeaseRequest;
|
||||
use pageserver_api::models::OffloadedTimelineInfo;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantConfigPatchRequest;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigRequest;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
@@ -1696,47 +1695,7 @@ async fn update_tenant_config_handler(
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
|
||||
let _ = tenant
|
||||
.update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
|
||||
.expect("Closure returns Ok()");
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn patch_tenant_config_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantConfigPatchRequest = json_request(&mut request).await?;
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let updated = tenant
|
||||
.update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
updated,
|
||||
tenant.get_generation(),
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -2081,20 +2040,13 @@ async fn timeline_compact_handler(
|
||||
.as_ref()
|
||||
.map(|r| r.sub_compaction)
|
||||
.unwrap_or(false);
|
||||
let sub_compaction_max_job_size_mb = compact_request
|
||||
.as_ref()
|
||||
.and_then(|r| r.sub_compaction_max_job_size_mb);
|
||||
|
||||
let options = CompactOptions {
|
||||
compact_key_range: compact_request
|
||||
compact_range: compact_request
|
||||
.as_ref()
|
||||
.and_then(|r| r.compact_key_range.clone()),
|
||||
compact_lsn_range: compact_request
|
||||
.as_ref()
|
||||
.and_then(|r| r.compact_lsn_range.clone()),
|
||||
.and_then(|r| r.compact_range.clone()),
|
||||
compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
|
||||
flags,
|
||||
sub_compaction,
|
||||
sub_compaction_max_job_size_mb,
|
||||
};
|
||||
|
||||
let scheduled = compact_request
|
||||
@@ -2109,7 +2061,7 @@ async fn timeline_compact_handler(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
|
||||
let rx = tenant.schedule_compaction(timeline_id, options).await;
|
||||
if wait_until_scheduled_compaction_done {
|
||||
// It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
|
||||
rx.await.ok();
|
||||
@@ -3336,9 +3288,6 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
|
||||
api_handler(r, tenant_size_handler)
|
||||
})
|
||||
.patch("/v1/tenant/config", |r| {
|
||||
api_handler(r, patch_tenant_config_handler)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
api_handler(r, update_tenant_config_handler)
|
||||
})
|
||||
|
||||
@@ -16,6 +16,7 @@ use postgres_backend::{is_expected_io_error, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use strum::{EnumCount, VariantNames};
|
||||
use strum_macros::{IntoStaticStr, VariantNames};
|
||||
use tracing::warn;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) for operations in the critical
|
||||
@@ -1224,74 +1225,30 @@ pub(crate) mod virtual_file_io_engine {
|
||||
|
||||
pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
|
||||
pub(crate) struct SmgrOpTimerInner {
|
||||
global_execution_latency_histo: Histogram,
|
||||
per_timeline_execution_latency_histo: Option<Histogram>,
|
||||
global_latency_histo: Histogram,
|
||||
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
// Optional because not all op types are tracked per-timeline
|
||||
per_timeline_latency_histo: Option<Histogram>,
|
||||
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
|
||||
timings: SmgrOpTimerState,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum SmgrOpTimerState {
|
||||
Received {
|
||||
received_at: Instant,
|
||||
},
|
||||
ThrottleDoneExecutionStarting {
|
||||
received_at: Instant,
|
||||
throttle_started_at: Instant,
|
||||
started_execution_at: Instant,
|
||||
},
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrOpFlushInProgress {
|
||||
flush_started_at: Instant,
|
||||
global_micros: IntCounter,
|
||||
per_timeline_micros: IntCounter,
|
||||
start: Instant,
|
||||
throttled: Duration,
|
||||
op: SmgrQueryType,
|
||||
}
|
||||
|
||||
impl SmgrOpTimer {
|
||||
pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
|
||||
pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
|
||||
let Some(throttle) = throttle else {
|
||||
return;
|
||||
};
|
||||
let inner = self.0.as_mut().expect("other public methods consume self");
|
||||
match (&mut inner.timings, throttle) {
|
||||
(SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
|
||||
ThrottleResult::NotThrottled { start } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *received_at,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *start,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { start, end } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *start,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *end,
|
||||
};
|
||||
}
|
||||
},
|
||||
(x, _) => panic!("called in unexpected state: {x:?}"),
|
||||
}
|
||||
inner.throttled += *throttle;
|
||||
}
|
||||
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> Instant {
|
||||
let (flush_start, inner) = self
|
||||
.smgr_op_end()
|
||||
.expect("this method consume self, and the only other caller is drop handler");
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
SmgrOpFlushInProgress {
|
||||
flush_started_at: flush_start,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
flush_start
|
||||
}
|
||||
|
||||
/// Returns `None`` if this method has already been called, `Some` otherwise.
|
||||
@@ -1299,42 +1256,32 @@ impl SmgrOpTimer {
|
||||
let inner = self.0.take()?;
|
||||
|
||||
let now = Instant::now();
|
||||
let elapsed = now - inner.start;
|
||||
|
||||
let batch;
|
||||
let execution;
|
||||
let throttle;
|
||||
match inner.timings {
|
||||
SmgrOpTimerState::Received { received_at } => {
|
||||
batch = (now - received_at).as_secs_f64();
|
||||
// TODO: use label for dropped requests.
|
||||
// This is quite rare in practice, only during tenant/pageservers shutdown.
|
||||
throttle = Duration::ZERO;
|
||||
execution = Duration::ZERO.as_secs_f64();
|
||||
let elapsed = match elapsed.checked_sub(inner.throttled) {
|
||||
Some(elapsed) => elapsed,
|
||||
None => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
|
||||
RateLimit::new(Duration::from_secs(10))
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[inner.op];
|
||||
rate_limit.call(|| {
|
||||
warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
|
||||
});
|
||||
elapsed // un-throttled time, more info than just saturating to 0
|
||||
}
|
||||
SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at,
|
||||
throttle_started_at,
|
||||
started_execution_at,
|
||||
} => {
|
||||
batch = (throttle_started_at - received_at).as_secs_f64();
|
||||
throttle = started_execution_at - throttle_started_at;
|
||||
execution = (now - started_execution_at).as_secs_f64();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// update time spent in batching
|
||||
inner.global_batch_wait_time.observe(batch);
|
||||
inner.per_timeline_batch_wait_time.observe(batch);
|
||||
let elapsed = elapsed.as_secs_f64();
|
||||
|
||||
// time spent in throttle metric is updated by throttle impl
|
||||
let _ = throttle;
|
||||
|
||||
// update metrics for execution latency
|
||||
inner.global_execution_latency_histo.observe(execution);
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution);
|
||||
inner.global_latency_histo.observe(elapsed);
|
||||
if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
|
||||
per_timeline_getpage_histo.observe(elapsed);
|
||||
}
|
||||
|
||||
Some((now, inner))
|
||||
@@ -1347,42 +1294,6 @@ impl Drop for SmgrOpTimer {
|
||||
}
|
||||
}
|
||||
|
||||
impl SmgrOpFlushInProgress {
|
||||
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let now = Instant::now();
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let elapsed = now - self.flush_started_at;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.per_timeline_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.flush_started_at = now;
|
||||
},
|
||||
|mut observe| {
|
||||
observe();
|
||||
},
|
||||
);
|
||||
|
||||
loop {
|
||||
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
|
||||
Ok(v) => return v,
|
||||
Err(_timeout) => {
|
||||
(*observe_guard)();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
@@ -1412,8 +1323,47 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_batch_size: Histogram,
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
}
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) async fn record_flush_in_progress<Fut, O>(
|
||||
&self,
|
||||
start_at: Instant,
|
||||
mut fut: Fut,
|
||||
) -> O
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut base = start_at;
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let now = Instant::now();
|
||||
let elapsed = now - base;
|
||||
self.global_flush_in_progress_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.per_timeline_flush_in_progress_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
base = now;
|
||||
},
|
||||
|mut observe| {
|
||||
observe();
|
||||
},
|
||||
);
|
||||
|
||||
loop {
|
||||
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
|
||||
Ok(v) => return v,
|
||||
Err(_timeout) => {
|
||||
(*observe_guard)();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -1436,15 +1386,12 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Alias so all histograms recording per-timeline smgr timings use the same buckets.
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
|
||||
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
|
||||
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
|
||||
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
|
||||
SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1502,7 +1449,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
|
||||
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds_global",
|
||||
"Like pageserver_smgr_query_seconds, but aggregated to instance level.",
|
||||
"Time spent on smgr query handling, aggregated by query type.",
|
||||
&["smgr_query_type"],
|
||||
SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
|
||||
)
|
||||
@@ -1599,25 +1546,6 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_page_service_pagestream_batch_wait_time_seconds",
|
||||
"Time a request spent waiting in its batch until the batch moved to throttle&execution.",
|
||||
&["tenant_id", "shard_id", "timeline_id"],
|
||||
SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_page_service_pagestream_batch_wait_time_seconds_global",
|
||||
"Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
|
||||
SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
@@ -1658,11 +1586,6 @@ impl SmgrQueryTimePerTimeline {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
|
||||
let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let global_flush_in_progress_micros =
|
||||
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
|
||||
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
|
||||
@@ -1678,11 +1601,9 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_batch_size,
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
global_batch_wait_time,
|
||||
per_timeline_batch_wait_time,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
|
||||
self.global_started[op as usize].inc();
|
||||
|
||||
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
|
||||
@@ -1693,15 +1614,11 @@ impl SmgrQueryTimePerTimeline {
|
||||
};
|
||||
|
||||
SmgrOpTimer(Some(SmgrOpTimerInner {
|
||||
global_execution_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_execution_latency_histo: per_timeline_latency_histo,
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
|
||||
per_timeline_flush_in_progress_micros: self
|
||||
.per_timeline_flush_in_progress_micros
|
||||
.clone(),
|
||||
global_batch_wait_time: self.global_batch_wait_time.clone(),
|
||||
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
|
||||
global_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_latency_histo,
|
||||
start: started_at,
|
||||
op,
|
||||
throttled: Duration::ZERO,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -2955,11 +2872,6 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2990,7 +2902,6 @@ use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::throttle::ThrottleResult;
|
||||
use crate::tenant::Timeline;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
@@ -3845,7 +3756,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
&CIRCUIT_BREAKERS_BROKEN,
|
||||
&CIRCUIT_BREAKERS_UNBROKEN,
|
||||
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
@@ -3893,7 +3803,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
||||
&PAGE_SERVICE_BATCH_SIZE_GLOBAL,
|
||||
&PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|h| {
|
||||
|
||||
@@ -575,10 +575,7 @@ enum BatchedFeMessage {
|
||||
}
|
||||
|
||||
impl BatchedFeMessage {
|
||||
async fn throttle_and_record_start_processing(
|
||||
&mut self,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError> {
|
||||
async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
|
||||
let (shard, tokens, timers) = match self {
|
||||
BatchedFeMessage::Exists { shard, timer, .. }
|
||||
| BatchedFeMessage::Nblocks { shard, timer, .. }
|
||||
@@ -606,7 +603,7 @@ impl BatchedFeMessage {
|
||||
}
|
||||
};
|
||||
for timer in timers {
|
||||
timer.observe_throttle_done_execution_starting(&throttled);
|
||||
timer.deduct_throttle(&throttled);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -916,9 +913,10 @@ impl PageServerHandler {
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
// invoke handler function
|
||||
let (handler_results, span): (
|
||||
let (handler_results, span, shard): (
|
||||
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
|
||||
_,
|
||||
_,
|
||||
) = match batch {
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
@@ -934,6 +932,7 @@ impl PageServerHandler {
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
Some(shard),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::Nblocks {
|
||||
@@ -950,6 +949,7 @@ impl PageServerHandler {
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
Some(shard),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetPage {
|
||||
@@ -976,6 +976,7 @@ impl PageServerHandler {
|
||||
res
|
||||
},
|
||||
span,
|
||||
Some(shard),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::DbSize {
|
||||
@@ -992,6 +993,7 @@ impl PageServerHandler {
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
Some(shard),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
@@ -1008,12 +1010,13 @@ impl PageServerHandler {
|
||||
.await
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
Some(shard),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::RespondError { span, error } => {
|
||||
// We've already decided to respond with an error, so we don't need to
|
||||
// call the handler.
|
||||
(vec![Err(error)], span)
|
||||
(vec![Err(error)], span, None)
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1074,17 +1077,21 @@ impl PageServerHandler {
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer =
|
||||
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
|
||||
let start_flushing_at = match timer {
|
||||
Some(timer) => timer.observe_smgr_op_completion_and_start_flushing(),
|
||||
None => Instant::now(),
|
||||
};
|
||||
|
||||
// what we want to do
|
||||
let flush_fut = pgb_writer.flush();
|
||||
// metric for how long flushing takes
|
||||
let flush_fut = match flushing_timer {
|
||||
Some(flushing_timer) => {
|
||||
futures::future::Either::Left(flushing_timer.measure(flush_fut))
|
||||
}
|
||||
None => futures::future::Either::Right(flush_fut),
|
||||
let flush_fut = if let Some(handle) = &shard {
|
||||
futures::future::Either::Left(
|
||||
handle
|
||||
.query_metrics
|
||||
.record_flush_in_progress(start_flushing_at, flush_fut),
|
||||
)
|
||||
} else {
|
||||
futures::future::Either::Right(flush_fut)
|
||||
};
|
||||
// do it while respecting cancellation
|
||||
let _: () = async move {
|
||||
@@ -1233,7 +1240,7 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
|
||||
if let Err(cancelled) = msg.throttle(&self.cancel).await {
|
||||
break cancelled;
|
||||
}
|
||||
|
||||
@@ -1400,9 +1407,7 @@ impl PageServerHandler {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
batch
|
||||
.throttle_and_record_start_processing(&self.cancel)
|
||||
.await?;
|
||||
batch.throttle(&self.cancel).await?;
|
||||
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -44,7 +44,6 @@ use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::GcCompactJob;
|
||||
use timeline::compaction::ScheduledCompactionTask;
|
||||
use timeline::import_pgdata;
|
||||
use timeline::offload::offload_timeline;
|
||||
@@ -69,7 +68,6 @@ use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::timeout::TimeoutCancellableError;
|
||||
use utils::try_rcu::ArcSwapExt;
|
||||
use utils::zstd::create_zst_tarball;
|
||||
use utils::zstd::extract_zst_tarball;
|
||||
|
||||
@@ -3018,15 +3016,8 @@ impl Tenant {
|
||||
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
|
||||
} else if next_scheduled_compaction_task.options.sub_compaction {
|
||||
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs: Vec<GcCompactJob> = timeline
|
||||
.gc_compaction_split_jobs(
|
||||
GcCompactJob::from_compact_options(
|
||||
next_scheduled_compaction_task.options.clone(),
|
||||
),
|
||||
next_scheduled_compaction_task
|
||||
.options
|
||||
.sub_compaction_max_job_size_mb,
|
||||
)
|
||||
let jobs = timeline
|
||||
.gc_compaction_split_jobs(next_scheduled_compaction_task.options)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
if jobs.is_empty() {
|
||||
@@ -3037,37 +3028,14 @@ impl Tenant {
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
|
||||
// until we do further refactors to allow directly call `compact_with_gc`.
|
||||
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
if job.dry_run {
|
||||
flags |= CompactFlags::DryRun;
|
||||
}
|
||||
let options = CompactOptions {
|
||||
flags,
|
||||
sub_compaction: false,
|
||||
compact_key_range: Some(job.compact_key_range.into()),
|
||||
compact_lsn_range: Some(job.compact_lsn_range.into()),
|
||||
sub_compaction_max_job_size_mb: None,
|
||||
};
|
||||
tline_pending_tasks.push_back(if idx == jobs_len - 1 {
|
||||
ScheduledCompactionTask {
|
||||
options,
|
||||
// The last job in the queue sends the signal and releases the gc guard
|
||||
result_tx: next_scheduled_compaction_task
|
||||
.result_tx
|
||||
.take(),
|
||||
gc_block: next_scheduled_compaction_task
|
||||
.gc_block
|
||||
.take(),
|
||||
}
|
||||
} else {
|
||||
ScheduledCompactionTask {
|
||||
options,
|
||||
result_tx: None,
|
||||
gc_block: None,
|
||||
}
|
||||
tline_pending_tasks.push_back(ScheduledCompactionTask {
|
||||
options: job,
|
||||
result_tx: if idx == jobs_len - 1 {
|
||||
// The last compaction job sends the completion signal
|
||||
next_scheduled_compaction_task.result_tx.take()
|
||||
} else {
|
||||
None
|
||||
},
|
||||
});
|
||||
}
|
||||
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
|
||||
@@ -3127,22 +3095,15 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
options: CompactOptions,
|
||||
) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
|
||||
let gc_guard = match self.gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(e) => {
|
||||
bail!("cannot run gc-compaction because gc is blocked: {}", e);
|
||||
}
|
||||
};
|
||||
) -> tokio::sync::oneshot::Receiver<()> {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let tline_pending_tasks = guard.entry(timeline_id).or_default();
|
||||
tline_pending_tasks.push_back(ScheduledCompactionTask {
|
||||
options,
|
||||
result_tx: Some(tx),
|
||||
gc_block: Some(gc_guard),
|
||||
});
|
||||
Ok(rx)
|
||||
rx
|
||||
}
|
||||
|
||||
// Call through to all timelines to freeze ephemeral layers if needed. Usually
|
||||
@@ -3944,28 +3905,25 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_tenant_config<F: Fn(TenantConfOpt) -> anyhow::Result<TenantConfOpt>>(
|
||||
&self,
|
||||
update: F,
|
||||
) -> anyhow::Result<TenantConfOpt> {
|
||||
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
// Use read-copy-update in order to avoid overwriting the location config
|
||||
// state if this races with [`Tenant::set_new_location_config`]. Note that
|
||||
// this race is not possible if both request types come from the storage
|
||||
// controller (as they should!) because an exclusive op lock is required
|
||||
// on the storage controller side.
|
||||
|
||||
self.tenant_conf
|
||||
.try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
|
||||
Ok(Arc::new(AttachedTenantConf {
|
||||
tenant_conf: update(attached_conf.tenant_conf.clone())?,
|
||||
location: attached_conf.location,
|
||||
lsn_lease_deadline: attached_conf.lsn_lease_deadline,
|
||||
}))
|
||||
})?;
|
||||
self.tenant_conf.rcu(|inner| {
|
||||
Arc::new(AttachedTenantConf {
|
||||
tenant_conf: new_tenant_conf.clone(),
|
||||
location: inner.location,
|
||||
// Attached location is not changed, no need to update lsn lease deadline.
|
||||
lsn_lease_deadline: inner.lsn_lease_deadline,
|
||||
})
|
||||
});
|
||||
|
||||
let updated = self.tenant_conf.load();
|
||||
let updated = self.tenant_conf.load().clone();
|
||||
|
||||
self.tenant_conf_updated(&updated.tenant_conf);
|
||||
self.tenant_conf_updated(&new_tenant_conf);
|
||||
// Don't hold self.timelines.lock() during the notifies.
|
||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||
// mutexes in struct Timeline in the future.
|
||||
@@ -3973,8 +3931,6 @@ impl Tenant {
|
||||
for timeline in timelines {
|
||||
timeline.tenant_conf_updated(&updated);
|
||||
}
|
||||
|
||||
Ok(updated.tenant_conf.clone())
|
||||
}
|
||||
|
||||
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
|
||||
@@ -4534,12 +4490,7 @@ impl Tenant {
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
|
||||
let original_cutoffs = target.cutoffs.clone();
|
||||
// GC cutoffs should never go back
|
||||
target.cutoffs = GcCutoffs {
|
||||
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
|
||||
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
|
||||
}
|
||||
target.cutoffs = cutoffs.clone();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5764,8 +5715,6 @@ mod tests {
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::CompactLsnRange;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::GcInfo;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
@@ -8201,12 +8150,6 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x30);
|
||||
@@ -8309,12 +8252,6 @@ mod tests {
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x40))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
@@ -8695,12 +8632,6 @@ mod tests {
|
||||
.await?
|
||||
};
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -8782,12 +8713,6 @@ mod tests {
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x40))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
@@ -9235,12 +9160,6 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9357,6 +9276,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: dryrun_flags,
|
||||
compact_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -9382,12 +9302,6 @@ mod tests {
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x38))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x38);
|
||||
@@ -9483,12 +9397,6 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9605,6 +9513,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: dryrun_flags,
|
||||
compact_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -9634,8 +9543,6 @@ mod tests {
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
use timeline::CompactLsnRange;
|
||||
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -9734,12 +9641,6 @@ mod tests {
|
||||
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
|
||||
|
||||
{
|
||||
parent_tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x10))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = parent_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9754,12 +9655,6 @@ mod tests {
|
||||
}
|
||||
|
||||
{
|
||||
branch_tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x50))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = branch_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9828,22 +9723,6 @@ mod tests {
|
||||
|
||||
verify_result().await;
|
||||
|
||||
// Piggyback a compaction with above_lsn. Ensure it works correctly when the specified LSN intersects with the layer files.
|
||||
// Now we already have a single large delta layer, so the compaction min_layer_lsn should be the same as ancestor LSN (0x18).
|
||||
branch_tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -10105,12 +9984,6 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -10132,7 +10005,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_key_range: Some((get_key(0)..get_key(2)).into()),
|
||||
compact_range: Some((get_key(0)..get_key(2)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -10179,7 +10052,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_key_range: Some((get_key(2)..get_key(4)).into()),
|
||||
compact_range: Some((get_key(2)..get_key(4)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -10231,7 +10104,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_key_range: Some((get_key(4)..get_key(9)).into()),
|
||||
compact_range: Some((get_key(4)..get_key(9)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -10282,7 +10155,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_key_range: Some((get_key(9)..get_key(10)).into()),
|
||||
compact_range: Some((get_key(9)..get_key(10)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -10338,7 +10211,7 @@ mod tests {
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_key_range: Some((get_key(0)..get_key(10)).into()),
|
||||
compact_range: Some((get_key(0)..get_key(10)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
@@ -10367,6 +10240,7 @@ mod tests {
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -10419,602 +10293,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_above_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_above_lsn").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
)];
|
||||
let delta4 = vec![(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
)];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x38),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
// delta1/2/4 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![
|
||||
(Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
|
||||
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_20 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_10 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_20[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_10[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// The original image layer, not compacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// Delta layer below the specified above_lsn not compacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x28),
|
||||
is_delta: true,
|
||||
},
|
||||
// Delta layer compacted above the LSN
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(10),
|
||||
lsn_range: Lsn(0x28)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// compact again
|
||||
tline
|
||||
.compact_with_gc(&cancel, CompactOptions::default(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// The compacted image layer (full key range)
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// All other data in the delta layer
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_rectangle() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_rectangle").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
)];
|
||||
let delta4 = vec![(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
)];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x38),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
// delta1/2/4 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![
|
||||
(Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
|
||||
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_20 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_10 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_20[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_10[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_key_range: Some((get_key(0)..get_key(2)).into()),
|
||||
compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// The original image layer, not compacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// According the selection logic, we select all layers with start key <= 0x28, so we would merge the layer 0x20-0x28 and
|
||||
// the layer 0x28-0x30 into one.
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x30),
|
||||
is_delta: true,
|
||||
},
|
||||
// Above the upper bound and untouched
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
// This layer is untouched
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_key_range: Some((get_key(3)..get_key(8)).into()),
|
||||
compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// The original image layer, not compacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// Not in the compaction key range, uncompacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x30),
|
||||
is_delta: true,
|
||||
},
|
||||
// Not in the compaction key range, uncompacted but need rewrite because the delta layer overlaps with the range
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
// Note that when we specify the LSN upper bound to be 0x40, the compaction algorithm will not try to cut the layer
|
||||
// horizontally in half. Instead, it will include all LSNs that overlap with 0x40. So the real max_lsn of the compaction
|
||||
// becomes 0x50.
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// compact again
|
||||
tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_key_range: Some((get_key(0)..get_key(5)).into()),
|
||||
compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()),
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// The original image layer, not compacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// The range gets compacted
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
// Not touched during this iteration of compaction
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// final full compaction
|
||||
tline
|
||||
.compact_with_gc(&cancel, CompactOptions::default(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// The compacted image layer (full key range)
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// All other data in the delta layer
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(1)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -427,129 +427,6 @@ impl TenantConfOpt {
|
||||
.or(global_conf.wal_receiver_protocol_override),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> {
|
||||
let Self {
|
||||
mut checkpoint_distance,
|
||||
mut checkpoint_timeout,
|
||||
mut compaction_target_size,
|
||||
mut compaction_period,
|
||||
mut compaction_threshold,
|
||||
mut compaction_algorithm,
|
||||
mut gc_horizon,
|
||||
mut gc_period,
|
||||
mut image_creation_threshold,
|
||||
mut pitr_interval,
|
||||
mut walreceiver_connect_timeout,
|
||||
mut lagging_wal_timeout,
|
||||
mut max_lsn_wal_lag,
|
||||
mut eviction_policy,
|
||||
mut min_resident_size_override,
|
||||
mut evictions_low_residence_duration_metric_threshold,
|
||||
mut heatmap_period,
|
||||
mut lazy_slru_download,
|
||||
mut timeline_get_throttle,
|
||||
mut image_layer_creation_check_threshold,
|
||||
mut lsn_lease_length,
|
||||
mut lsn_lease_length_for_ts,
|
||||
mut timeline_offloading,
|
||||
mut wal_receiver_protocol_override,
|
||||
} = self;
|
||||
|
||||
patch.checkpoint_distance.apply(&mut checkpoint_distance);
|
||||
patch
|
||||
.checkpoint_timeout
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut checkpoint_timeout);
|
||||
patch
|
||||
.compaction_target_size
|
||||
.apply(&mut compaction_target_size);
|
||||
patch
|
||||
.compaction_period
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut compaction_period);
|
||||
patch.compaction_threshold.apply(&mut compaction_threshold);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch.gc_horizon.apply(&mut gc_horizon);
|
||||
patch
|
||||
.gc_period
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut gc_period);
|
||||
patch
|
||||
.image_creation_threshold
|
||||
.apply(&mut image_creation_threshold);
|
||||
patch
|
||||
.pitr_interval
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut pitr_interval);
|
||||
patch
|
||||
.walreceiver_connect_timeout
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut walreceiver_connect_timeout);
|
||||
patch
|
||||
.lagging_wal_timeout
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut lagging_wal_timeout);
|
||||
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
|
||||
patch.eviction_policy.apply(&mut eviction_policy);
|
||||
patch
|
||||
.min_resident_size_override
|
||||
.apply(&mut min_resident_size_override);
|
||||
patch
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut evictions_low_residence_duration_metric_threshold);
|
||||
patch
|
||||
.heatmap_period
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut heatmap_period);
|
||||
patch.lazy_slru_download.apply(&mut lazy_slru_download);
|
||||
patch
|
||||
.timeline_get_throttle
|
||||
.apply(&mut timeline_get_throttle);
|
||||
patch
|
||||
.image_layer_creation_check_threshold
|
||||
.apply(&mut image_layer_creation_check_threshold);
|
||||
patch
|
||||
.lsn_lease_length
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut lsn_lease_length);
|
||||
patch
|
||||
.lsn_lease_length_for_ts
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut lsn_lease_length_for_ts);
|
||||
patch.timeline_offloading.apply(&mut timeline_offloading);
|
||||
patch
|
||||
.wal_receiver_protocol_override
|
||||
.apply(&mut wal_receiver_protocol_override);
|
||||
|
||||
Ok(Self {
|
||||
checkpoint_distance,
|
||||
checkpoint_timeout,
|
||||
compaction_target_size,
|
||||
compaction_period,
|
||||
compaction_threshold,
|
||||
compaction_algorithm,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
image_creation_threshold,
|
||||
pitr_interval,
|
||||
walreceiver_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
eviction_policy,
|
||||
min_resident_size_override,
|
||||
evictions_low_residence_duration_metric_threshold,
|
||||
heatmap_period,
|
||||
lazy_slru_download,
|
||||
timeline_get_throttle,
|
||||
image_layer_creation_check_threshold,
|
||||
lsn_lease_length,
|
||||
lsn_lease_length_for_ts,
|
||||
timeline_offloading,
|
||||
wal_receiver_protocol_override,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
|
||||
/// Do not add any more features taking and forbidding taking this lock. It should be
|
||||
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
|
||||
/// synchronizes with gc attempts by locking and unlocking this mutex.
|
||||
blocking: Arc<tokio::sync::Mutex<()>>,
|
||||
blocking: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl GcBlock {
|
||||
@@ -30,7 +30,7 @@ impl GcBlock {
|
||||
/// it's ending, or if not currently possible, a value describing the reasons why not.
|
||||
///
|
||||
/// Cancellation safe.
|
||||
pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
|
||||
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
|
||||
let reasons = {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
@@ -44,7 +44,7 @@ impl GcBlock {
|
||||
Err(reasons)
|
||||
} else {
|
||||
Ok(Guard {
|
||||
_inner: self.blocking.clone().lock_owned().await,
|
||||
_inner: self.blocking.lock().await,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -170,8 +170,8 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Guard {
|
||||
_inner: tokio::sync::OwnedMutexGuard<()>,
|
||||
pub(super) struct Guard<'a> {
|
||||
_inner: tokio::sync::MutexGuard<'a, ()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -58,11 +58,6 @@ pub struct Stats {
|
||||
pub sum_throttled_usecs: u64,
|
||||
}
|
||||
|
||||
pub enum ThrottleResult {
|
||||
NotThrottled { start: Instant },
|
||||
Throttled { start: Instant, end: Instant },
|
||||
}
|
||||
|
||||
impl<M> Throttle<M>
|
||||
where
|
||||
M: Metric,
|
||||
@@ -127,15 +122,15 @@ where
|
||||
self.inner.load().rate_limiter.steady_rps()
|
||||
}
|
||||
|
||||
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
|
||||
pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
if !inner.enabled {
|
||||
return ThrottleResult::NotThrottled { start };
|
||||
return None;
|
||||
}
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
self.metric.accounting_start();
|
||||
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
|
||||
let did_throttle = inner.rate_limiter.acquire(key_count).await;
|
||||
@@ -150,9 +145,9 @@ where
|
||||
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
|
||||
let observation = Observation { wait_time };
|
||||
self.metric.observe_throttling(&observation);
|
||||
ThrottleResult::Throttled { start, end: now }
|
||||
Some(wait_time)
|
||||
} else {
|
||||
ThrottleResult::NotThrottled { start }
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -780,90 +780,46 @@ pub(crate) enum CompactFlags {
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub(crate) struct CompactRequest {
|
||||
pub compact_key_range: Option<CompactKeyRange>,
|
||||
pub compact_lsn_range: Option<CompactLsnRange>,
|
||||
pub compact_range: Option<CompactRange>,
|
||||
pub compact_below_lsn: Option<Lsn>,
|
||||
/// Whether the compaction job should be scheduled.
|
||||
#[serde(default)]
|
||||
pub scheduled: bool,
|
||||
/// Whether the compaction job should be split across key ranges.
|
||||
#[serde(default)]
|
||||
pub sub_compaction: bool,
|
||||
/// Max job size for each subcompaction job.
|
||||
pub sub_compaction_max_job_size_mb: Option<u64>,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub(crate) struct CompactLsnRange {
|
||||
pub start: Lsn,
|
||||
pub end: Lsn,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub(crate) struct CompactKeyRange {
|
||||
pub(crate) struct CompactRange {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub start: Key,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub end: Key,
|
||||
}
|
||||
|
||||
impl From<Range<Lsn>> for CompactLsnRange {
|
||||
fn from(range: Range<Lsn>) -> Self {
|
||||
Self {
|
||||
start: range.start,
|
||||
end: range.end,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Range<Key>> for CompactKeyRange {
|
||||
impl From<Range<Key>> for CompactRange {
|
||||
fn from(range: Range<Key>) -> Self {
|
||||
Self {
|
||||
CompactRange {
|
||||
start: range.start,
|
||||
end: range.end,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CompactLsnRange> for Range<Lsn> {
|
||||
fn from(range: CompactLsnRange) -> Self {
|
||||
range.start..range.end
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CompactKeyRange> for Range<Key> {
|
||||
fn from(range: CompactKeyRange) -> Self {
|
||||
range.start..range.end
|
||||
}
|
||||
}
|
||||
|
||||
impl CompactLsnRange {
|
||||
#[cfg(test)]
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn above(lsn: Lsn) -> Self {
|
||||
Self {
|
||||
start: lsn,
|
||||
end: Lsn::MAX,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(crate) struct CompactOptions {
|
||||
pub flags: EnumSet<CompactFlags>,
|
||||
/// If set, the compaction will only compact the key range specified by this option.
|
||||
/// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
|
||||
pub compact_key_range: Option<CompactKeyRange>,
|
||||
/// If set, the compaction will only compact the LSN within this value.
|
||||
/// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`].
|
||||
pub compact_lsn_range: Option<CompactLsnRange>,
|
||||
/// This option is only used by GC compaction.
|
||||
pub compact_range: Option<CompactRange>,
|
||||
/// If set, the compaction will only compact the LSN below this value.
|
||||
/// This option is only used by GC compaction.
|
||||
pub compact_below_lsn: Option<Lsn>,
|
||||
/// Enable sub-compaction (split compaction job across key ranges).
|
||||
/// This option is only used by GC compaction.
|
||||
pub sub_compaction: bool,
|
||||
/// Set job size for the GC compaction.
|
||||
/// This option is only used by GC compaction.
|
||||
pub sub_compaction_max_job_size_mb: Option<u64>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Timeline {
|
||||
@@ -1685,10 +1641,9 @@ impl Timeline {
|
||||
cancel,
|
||||
CompactOptions {
|
||||
flags,
|
||||
compact_key_range: None,
|
||||
compact_lsn_range: None,
|
||||
compact_range: None,
|
||||
compact_below_lsn: None,
|
||||
sub_compaction: false,
|
||||
sub_compaction_max_job_size_mb: None,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
|
||||
@@ -10,8 +10,8 @@ use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
RecordedDuration, Timeline,
|
||||
CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
|
||||
ImageLayerCreationMode, RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
@@ -41,7 +41,7 @@ use crate::tenant::storage_layer::{
|
||||
use crate::tenant::timeline::ImageLayerCreationOutcome;
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use pageserver_api::config::tenant_conf_defaults::{
|
||||
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
|
||||
@@ -63,68 +63,21 @@ use super::CompactionError;
|
||||
const COMPACTION_DELTA_THRESHOLD: usize = 5;
|
||||
|
||||
/// A scheduled compaction task.
|
||||
pub(crate) struct ScheduledCompactionTask {
|
||||
/// It's unfortunate that we need to store a compact options struct here because the only outer
|
||||
/// API we can call here is `compact_with_options` which does a few setup calls before starting the
|
||||
/// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
|
||||
pub struct ScheduledCompactionTask {
|
||||
pub options: CompactOptions,
|
||||
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
|
||||
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
/// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
|
||||
pub gc_block: Option<gc_block::Guard>,
|
||||
}
|
||||
|
||||
/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
|
||||
/// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets
|
||||
/// called.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct GcCompactJob {
|
||||
pub dry_run: bool,
|
||||
/// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range
|
||||
/// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary.
|
||||
pub compact_key_range: Range<Key>,
|
||||
/// The LSN range to be compacted. The compaction algorithm will use this range to determine the layers to be
|
||||
/// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range
|
||||
/// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`].
|
||||
/// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here.
|
||||
pub compact_lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl GcCompactJob {
|
||||
pub fn from_compact_options(options: CompactOptions) -> Self {
|
||||
GcCompactJob {
|
||||
dry_run: options.flags.contains(CompactFlags::DryRun),
|
||||
compact_key_range: options
|
||||
.compact_key_range
|
||||
.map(|x| x.into())
|
||||
.unwrap_or(Key::MIN..Key::MAX),
|
||||
compact_lsn_range: options
|
||||
.compact_lsn_range
|
||||
.map(|x| x.into())
|
||||
.unwrap_or(Lsn::INVALID..Lsn::MAX),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called
|
||||
/// and contains the exact layers we want to compact.
|
||||
pub struct GcCompactionJobDescription {
|
||||
/// All layers to read in the compaction job
|
||||
selected_layers: Vec<Layer>,
|
||||
/// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to
|
||||
/// keep all deltas <= this LSN or generate an image == this LSN.
|
||||
/// GC cutoff of the job
|
||||
gc_cutoff: Lsn,
|
||||
/// LSNs to retain for the job. Read path will use this LSN so we need to keep deltas <= this LSN or
|
||||
/// generate an image == this LSN.
|
||||
/// LSNs to retain for the job
|
||||
retain_lsns_below_horizon: Vec<Lsn>,
|
||||
/// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data
|
||||
/// \>= this LSN will be kept and will not be rewritten.
|
||||
/// Maximum layer LSN processed in this compaction
|
||||
max_layer_lsn: Lsn,
|
||||
/// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive.
|
||||
/// All access below (strict lower than `<`) this LSN will be routed through the normal read path instead of
|
||||
/// k-merge within gc-compaction.
|
||||
min_layer_lsn: Lsn,
|
||||
/// Only compact layers overlapping with this range.
|
||||
/// Only compact layers overlapping with this range
|
||||
compaction_key_range: Range<Key>,
|
||||
/// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
|
||||
/// This field is here solely for debugging. The field will not be read once the compaction
|
||||
@@ -343,7 +296,7 @@ impl Timeline {
|
||||
)));
|
||||
}
|
||||
|
||||
if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() {
|
||||
if options.compact_range.is_some() {
|
||||
// maybe useful in the future? could implement this at some point
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"compaction range is not supported for legacy compaction for now"
|
||||
@@ -1798,26 +1751,26 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
|
||||
/// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
|
||||
/// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts
|
||||
/// like a full compaction of the specified keyspace.
|
||||
/// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
|
||||
/// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
|
||||
/// ad-hoc information about gc compaction itself.
|
||||
pub(crate) async fn gc_compaction_split_jobs(
|
||||
self: &Arc<Self>,
|
||||
job: GcCompactJob,
|
||||
sub_compaction_max_job_size_mb: Option<u64>,
|
||||
) -> anyhow::Result<Vec<GcCompactJob>> {
|
||||
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
|
||||
job.compact_lsn_range.end
|
||||
options: CompactOptions,
|
||||
) -> anyhow::Result<Vec<CompactOptions>> {
|
||||
if !options.sub_compaction {
|
||||
return Ok(vec![options]);
|
||||
}
|
||||
let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
|
||||
start: Key::MIN,
|
||||
end: Key::MAX,
|
||||
});
|
||||
let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
|
||||
compact_below_lsn
|
||||
} else {
|
||||
*self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.select_min() // use the real gc cutoff
|
||||
};
|
||||
|
||||
// Split compaction job to about 4GB each
|
||||
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024;
|
||||
let sub_compaction_max_job_size_mb =
|
||||
sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);
|
||||
|
||||
let mut compact_jobs = Vec::new();
|
||||
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
|
||||
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
|
||||
@@ -1853,8 +1806,8 @@ impl Timeline {
|
||||
let Some((start, end)) = truncate_to(
|
||||
&range.start,
|
||||
&range.end,
|
||||
&job.compact_key_range.start,
|
||||
&job.compact_key_range.end,
|
||||
&compact_range.start,
|
||||
&compact_range.end,
|
||||
) else {
|
||||
continue;
|
||||
};
|
||||
@@ -1864,6 +1817,8 @@ impl Timeline {
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let mut current_start = None;
|
||||
// Split compaction job to about 2GB each
|
||||
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
|
||||
let ranges_num = split_key_ranges.len();
|
||||
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
|
||||
if current_start.is_none() {
|
||||
@@ -1876,7 +1831,8 @@ impl Timeline {
|
||||
}
|
||||
let res = layer_map.range_search(start..end, compact_below_lsn);
|
||||
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
|
||||
if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 {
|
||||
if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
|
||||
let mut compact_options = options.clone();
|
||||
// Try to extend the compaction range so that we include at least one full layer file.
|
||||
let extended_end = res
|
||||
.found
|
||||
@@ -1894,11 +1850,10 @@ impl Timeline {
|
||||
"splitting compaction job: {}..{}, estimated_size={}",
|
||||
start, end, total_size
|
||||
);
|
||||
compact_jobs.push(GcCompactJob {
|
||||
dry_run: job.dry_run,
|
||||
compact_key_range: start..end,
|
||||
compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
|
||||
});
|
||||
compact_options.compact_range = Some(CompactRange { start, end });
|
||||
compact_options.compact_below_lsn = Some(compact_below_lsn);
|
||||
compact_options.sub_compaction = false;
|
||||
compact_jobs.push(compact_options);
|
||||
current_start = Some(end);
|
||||
}
|
||||
}
|
||||
@@ -1920,7 +1875,7 @@ impl Timeline {
|
||||
/// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
|
||||
/// part of the range.
|
||||
///
|
||||
/// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersect with
|
||||
/// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
|
||||
/// the LSN. Otherwise, it will use the gc cutoff by default.
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
@@ -1928,13 +1883,9 @@ impl Timeline {
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let sub_compaction = options.sub_compaction;
|
||||
let job = GcCompactJob::from_compact_options(options.clone());
|
||||
if sub_compaction {
|
||||
if options.sub_compaction {
|
||||
info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs = self
|
||||
.gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb)
|
||||
.await?;
|
||||
let jobs = self.gc_compaction_split_jobs(options).await?;
|
||||
let jobs_len = jobs.len();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
info!(
|
||||
@@ -1949,15 +1900,19 @@ impl Timeline {
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
self.compact_with_gc_inner(cancel, job, ctx).await
|
||||
self.compact_with_gc_inner(cancel, options, ctx).await
|
||||
}
|
||||
|
||||
async fn compact_with_gc_inner(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
job: GcCompactJob,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(
|
||||
!options.sub_compaction,
|
||||
"sub-compaction should be handled by the outer function"
|
||||
);
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
@@ -1977,11 +1932,19 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let dry_run = job.dry_run;
|
||||
let compact_key_range = job.compact_key_range;
|
||||
let compact_lsn_range = job.compact_lsn_range;
|
||||
let flags = options.flags;
|
||||
let compaction_key_range = options
|
||||
.compact_range
|
||||
.map(|range| range.start..range.end)
|
||||
.unwrap_or_else(|| Key::MIN..Key::MAX);
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end);
|
||||
let dry_run = flags.contains(CompactFlags::DryRun);
|
||||
|
||||
if compaction_key_range == (Key::MIN..Key::MAX) {
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
|
||||
} else {
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
}
|
||||
|
||||
scopeguard::defer! {
|
||||
info!("done enhanced gc bottom-most compaction");
|
||||
@@ -1999,21 +1962,13 @@ impl Timeline {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
let gc_cutoff = {
|
||||
// Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
|
||||
// Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
|
||||
// cleaning everything that theoritically it could. In the future, it should use `self.gc_info`
|
||||
// to get the truth data.
|
||||
let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
|
||||
let real_gc_cutoff = gc_info.cutoffs.select_min();
|
||||
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
|
||||
// each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use
|
||||
// each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
|
||||
// the real cutoff.
|
||||
let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX {
|
||||
real_gc_cutoff
|
||||
} else {
|
||||
compact_lsn_range.end
|
||||
};
|
||||
let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
|
||||
if gc_cutoff > real_gc_cutoff {
|
||||
warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
|
||||
warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
|
||||
gc_cutoff = real_gc_cutoff;
|
||||
}
|
||||
gc_cutoff
|
||||
@@ -2030,7 +1985,7 @@ impl Timeline {
|
||||
}
|
||||
let mut selected_layers: Vec<Layer> = Vec::new();
|
||||
drop(gc_info);
|
||||
// Firstly, pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
|
||||
// Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
|
||||
let Some(max_layer_lsn) = layers
|
||||
.iter_historic_layers()
|
||||
.filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
|
||||
@@ -2040,45 +1995,27 @@ impl Timeline {
|
||||
info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
|
||||
return Ok(());
|
||||
};
|
||||
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
|
||||
// the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
|
||||
// it is a branch.
|
||||
let Some(min_layer_lsn) = layers
|
||||
.iter_historic_layers()
|
||||
.filter(|desc| {
|
||||
if compact_lsn_range.start == Lsn::INVALID {
|
||||
true // select all layers below if start == Lsn(0)
|
||||
} else {
|
||||
desc.get_lsn_range().end > compact_lsn_range.start // strictly larger than compact_above_lsn
|
||||
}
|
||||
})
|
||||
.map(|desc| desc.get_lsn_range().start)
|
||||
.min()
|
||||
else {
|
||||
info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end);
|
||||
return Ok(());
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
let mut rewrite_layers = Vec::new();
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().end <= max_layer_lsn
|
||||
&& desc.get_lsn_range().start >= min_layer_lsn
|
||||
&& overlaps_with(&desc.get_key_range(), &compact_key_range)
|
||||
&& overlaps_with(&desc.get_key_range(), &compaction_key_range)
|
||||
{
|
||||
// If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
|
||||
// even if it might contain extra keys
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
// If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
|
||||
// to overlap image layers)
|
||||
if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range())
|
||||
if desc.is_delta()
|
||||
&& !fully_contains(&compaction_key_range, &desc.get_key_range())
|
||||
{
|
||||
rewrite_layers.push(desc);
|
||||
}
|
||||
}
|
||||
}
|
||||
if selected_layers.is_empty() {
|
||||
info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end);
|
||||
info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
|
||||
return Ok(());
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
@@ -2086,20 +2023,13 @@ impl Timeline {
|
||||
selected_layers,
|
||||
gc_cutoff,
|
||||
retain_lsns_below_horizon,
|
||||
min_layer_lsn,
|
||||
max_layer_lsn,
|
||||
compaction_key_range: compact_key_range,
|
||||
compaction_key_range,
|
||||
rewrite_layers,
|
||||
}
|
||||
};
|
||||
let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID {
|
||||
// If we only compact above some LSN, we should get the history from the current branch below the specified LSN.
|
||||
// We use job_desc.min_layer_lsn as if it's the lowest branch point.
|
||||
(true, job_desc.min_layer_lsn)
|
||||
} else if self.ancestor_timeline.is_some() {
|
||||
// In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the
|
||||
// LSN ranges all the way to the ancestor timeline.
|
||||
(true, self.ancestor_lsn)
|
||||
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
|
||||
Lsn(self.ancestor_lsn.0 + 1)
|
||||
} else {
|
||||
let res = job_desc
|
||||
.retain_lsns_below_horizon
|
||||
@@ -2117,19 +2047,17 @@ impl Timeline {
|
||||
.unwrap_or(job_desc.gc_cutoff)
|
||||
);
|
||||
}
|
||||
(false, res)
|
||||
res
|
||||
};
|
||||
info!(
|
||||
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
|
||||
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
|
||||
job_desc.selected_layers.len(),
|
||||
job_desc.rewrite_layers.len(),
|
||||
job_desc.max_layer_lsn,
|
||||
job_desc.min_layer_lsn,
|
||||
job_desc.gc_cutoff,
|
||||
lowest_retain_lsn,
|
||||
job_desc.compaction_key_range.start,
|
||||
job_desc.compaction_key_range.end,
|
||||
has_data_below,
|
||||
job_desc.compaction_key_range.end
|
||||
);
|
||||
|
||||
for layer in &job_desc.selected_layers {
|
||||
@@ -2173,22 +2101,10 @@ impl Timeline {
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
let mut total_downloaded_size = 0;
|
||||
let mut total_layer_size = 0;
|
||||
for layer in &job_desc.selected_layers {
|
||||
if layer.needs_download().await?.is_some() {
|
||||
total_downloaded_size += layer.layer_desc().file_size;
|
||||
}
|
||||
total_layer_size += layer.layer_desc().file_size;
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
}
|
||||
info!(
|
||||
"finish downloading layers, downloaded={}, total={}, ratio={:.2}",
|
||||
total_downloaded_size,
|
||||
total_layer_size,
|
||||
total_downloaded_size as f64 / total_layer_size as f64
|
||||
);
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
@@ -2211,7 +2127,7 @@ impl Timeline {
|
||||
|
||||
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
|
||||
// when some condition meet.
|
||||
let mut image_layer_writer = if !has_data_below {
|
||||
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
|
||||
Some(
|
||||
SplitImageLayerWriter::new(
|
||||
self.conf,
|
||||
@@ -2244,11 +2160,7 @@ impl Timeline {
|
||||
}
|
||||
let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
|
||||
|
||||
/// When compacting not at a bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`).
|
||||
/// The two cases are compaction in ancestor branches and when `compact_lsn_range.start` is set.
|
||||
/// In those cases, we need to pull up data from below the LSN range we're compaction.
|
||||
///
|
||||
/// This function unifies the cases so that later code doesn't have to think about it.
|
||||
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
|
||||
///
|
||||
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
|
||||
/// is needed for reconstruction. This should be fixed in the future.
|
||||
@@ -2256,19 +2168,17 @@ impl Timeline {
|
||||
/// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
|
||||
/// images.
|
||||
async fn get_ancestor_image(
|
||||
this_tline: &Arc<Timeline>,
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
ctx: &RequestContext,
|
||||
has_data_below: bool,
|
||||
history_lsn_point: Lsn,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
|
||||
if !has_data_below {
|
||||
if tline.ancestor_timeline.is_none() {
|
||||
return Ok(None);
|
||||
};
|
||||
// This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
|
||||
// as much existing code as possible.
|
||||
let img = this_tline.get(key, history_lsn_point, ctx).await?;
|
||||
Ok(Some((key, history_lsn_point, img)))
|
||||
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
|
||||
Ok(Some((key, tline.ancestor_lsn, img)))
|
||||
}
|
||||
|
||||
// Actually, we can decide not to write to the image layer at all at this point because
|
||||
@@ -2352,8 +2262,7 @@ impl Timeline {
|
||||
job_desc.gc_cutoff,
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
|
||||
.await?,
|
||||
get_ancestor_image(self, *last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
retention
|
||||
@@ -2382,7 +2291,7 @@ impl Timeline {
|
||||
job_desc.gc_cutoff,
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
|
||||
get_ancestor_image(self, last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
retention
|
||||
|
||||
@@ -877,24 +877,22 @@ impl WalIngest {
|
||||
// will block waiting for the last valid LSN to advance up to
|
||||
// it. So we use the previous record's LSN in the get calls
|
||||
// instead.
|
||||
if modification.tline.get_shard_identity().is_shard_zero() {
|
||||
for segno in modification
|
||||
.tline
|
||||
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
|
||||
.await?
|
||||
{
|
||||
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
for segno in modification
|
||||
.tline
|
||||
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
|
||||
.await?
|
||||
{
|
||||
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
|
||||
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
|
||||
});
|
||||
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
|
||||
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
|
||||
});
|
||||
|
||||
if may_delete {
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::Clog, segno, ctx)
|
||||
.await?;
|
||||
trace!("Drop CLOG segment {:>04X}", segno);
|
||||
}
|
||||
if may_delete {
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::Clog, segno, ctx)
|
||||
.await?;
|
||||
trace!("Drop CLOG segment {:>04X}", segno);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1049,18 +1047,16 @@ impl WalIngest {
|
||||
|
||||
// Delete all the segments except the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
if modification.tline.get_shard_identity().is_shard_zero() {
|
||||
while segment != endsegment {
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
|
||||
.await?;
|
||||
while segment != endsegment {
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
|
||||
.await?;
|
||||
|
||||
/* move to next segment, handling wraparound correctly */
|
||||
if segment == maxsegment {
|
||||
segment = 0;
|
||||
} else {
|
||||
segment += 1;
|
||||
}
|
||||
/* move to next segment, handling wraparound correctly */
|
||||
if segment == maxsegment {
|
||||
segment = 0;
|
||||
} else {
|
||||
segment += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
#include "portability/instr_time.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/ipc.h"
|
||||
@@ -119,11 +118,6 @@ typedef struct
|
||||
*/
|
||||
PSConnectionState state;
|
||||
PGconn *conn;
|
||||
|
||||
/* request / response counters for debugging */
|
||||
uint64 nrequests_sent;
|
||||
uint64 nresponses_received;
|
||||
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
@@ -634,8 +628,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
}
|
||||
|
||||
shard->state = PS_Connected;
|
||||
shard->nrequests_sent = 0;
|
||||
shard->nresponses_received = 0;
|
||||
}
|
||||
/* FALLTHROUGH */
|
||||
case PS_Connected:
|
||||
@@ -664,27 +656,6 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
|
||||
int ret;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
instr_time now,
|
||||
start_ts,
|
||||
since_start,
|
||||
last_log_ts,
|
||||
since_last_log;
|
||||
bool logged = false;
|
||||
|
||||
/*
|
||||
* As a debugging aid, if we don't get a response for a long time, print a
|
||||
* log message.
|
||||
*
|
||||
* 10 s is a very generous threshold, normally we expect a response in a
|
||||
* few milliseconds. We have metrics to track latencies in normal ranges,
|
||||
* but in the cases that take exceptionally long, it's useful to log the
|
||||
* exact timestamps.
|
||||
*/
|
||||
#define LOG_INTERVAL_US UINT64CONST(10 * 1000000)
|
||||
|
||||
INSTR_TIME_SET_CURRENT(now);
|
||||
start_ts = last_log_ts = now;
|
||||
INSTR_TIME_SET_ZERO(since_last_log);
|
||||
|
||||
retry:
|
||||
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
|
||||
@@ -692,12 +663,9 @@ retry:
|
||||
if (ret == 0)
|
||||
{
|
||||
WaitEvent event;
|
||||
long timeout;
|
||||
|
||||
timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
|
||||
WAIT_EVENT_NEON_PS_READ);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
@@ -716,40 +684,9 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Print a message to the log if a long time has passed with no
|
||||
* response.
|
||||
*/
|
||||
INSTR_TIME_SET_CURRENT(now);
|
||||
since_last_log = now;
|
||||
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
|
||||
if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
|
||||
{
|
||||
since_start = now;
|
||||
INSTR_TIME_SUBTRACT(since_start, start_ts);
|
||||
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
|
||||
INSTR_TIME_GET_DOUBLE(since_start),
|
||||
shard->nrequests_sent, shard->nresponses_received);
|
||||
last_log_ts = now;
|
||||
logged = true;
|
||||
}
|
||||
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we logged earlier that the response is taking a long time, log
|
||||
* another message when the response is finally received.
|
||||
*/
|
||||
if (logged)
|
||||
{
|
||||
INSTR_TIME_SET_CURRENT(now);
|
||||
since_start = now;
|
||||
INSTR_TIME_SUBTRACT(since_start, start_ts);
|
||||
neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
|
||||
INSTR_TIME_GET_DOUBLE(since_start));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -849,7 +786,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
|
||||
* point, but on the grand scheme of things it's only a small issue.
|
||||
*/
|
||||
shard->nrequests_sent++;
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
@@ -942,7 +878,6 @@ pageserver_receive(shardno_t shard_no)
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
|
||||
shard->nresponses_received++;
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -423,11 +423,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
* ensuring we have received all but the last n requests (n = newsize).
|
||||
*/
|
||||
if (MyPState->n_requests_inflight > newsize)
|
||||
{
|
||||
Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
|
||||
prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
|
||||
Assert(MyPState->n_requests_inflight <= newsize);
|
||||
}
|
||||
prefetch_wait_for(MyPState->ring_unused - newsize);
|
||||
|
||||
/* construct the new PrefetchState, and copy over the memory contexts */
|
||||
newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
|
||||
@@ -442,6 +438,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
newPState->ring_last = newsize;
|
||||
newPState->ring_unused = newsize;
|
||||
newPState->ring_receive = newsize;
|
||||
newPState->ring_flush = newsize;
|
||||
newPState->max_shard_no = MyPState->max_shard_no;
|
||||
memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
|
||||
|
||||
@@ -492,7 +489,6 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
}
|
||||
newPState->n_unused -= 1;
|
||||
}
|
||||
newPState->ring_flush = newPState->ring_receive;
|
||||
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
@@ -502,7 +498,6 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
|
||||
{
|
||||
PrefetchRequest *slot = GetPrfSlot(end);
|
||||
Assert(slot->status != PRFS_REQUESTED);
|
||||
if (slot->status == PRFS_RECEIVED)
|
||||
{
|
||||
pfree(slot->response);
|
||||
|
||||
@@ -74,6 +74,10 @@ impl std::fmt::Display for Backend<'_, ()> {
|
||||
.debug_tuple("ControlPlane::ProxyV1")
|
||||
.field(&endpoint.url())
|
||||
.finish(),
|
||||
ControlPlaneClient::Neon(endpoint) => fmt
|
||||
.debug_tuple("ControlPlane::Neon")
|
||||
.field(&endpoint.url())
|
||||
.finish(),
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
ControlPlaneClient::PostgresMock(endpoint) => fmt
|
||||
.debug_tuple("ControlPlane::PostgresMock")
|
||||
|
||||
@@ -43,6 +43,9 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
enum AuthBackendType {
|
||||
#[value(name("console"), alias("cplane"))]
|
||||
ControlPlane,
|
||||
|
||||
#[value(name("cplane-v1"), alias("control-plane"))]
|
||||
ControlPlaneV1,
|
||||
|
||||
@@ -485,7 +488,40 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
|
||||
if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
|
||||
if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api {
|
||||
match (redis_notifications_client, regional_redis_client.clone()) {
|
||||
(None, None) => {}
|
||||
(client1, client2) => {
|
||||
let cache = api.caches.project_info.clone();
|
||||
if let Some(client) = client1 {
|
||||
maintenance_tasks.spawn(notifications::task_main(
|
||||
client,
|
||||
cache.clone(),
|
||||
cancel_map.clone(),
|
||||
args.region.clone(),
|
||||
));
|
||||
}
|
||||
if let Some(client) = client2 {
|
||||
maintenance_tasks.spawn(notifications::task_main(
|
||||
client,
|
||||
cache.clone(),
|
||||
cancel_map.clone(),
|
||||
args.region.clone(),
|
||||
));
|
||||
}
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
}
|
||||
if let Some(regional_redis_client) = regional_redis_client {
|
||||
let cache = api.caches.endpoints_cache.clone();
|
||||
let con = regional_redis_client;
|
||||
let span = tracing::info_span!("endpoints_cache");
|
||||
maintenance_tasks.spawn(
|
||||
async move { cache.do_read(con, cancellation_token.clone()).await }
|
||||
.instrument(span),
|
||||
);
|
||||
}
|
||||
} else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
|
||||
match (redis_notifications_client, regional_redis_client.clone()) {
|
||||
(None, None) => {}
|
||||
(client1, client2) => {
|
||||
@@ -721,6 +757,65 @@ fn build_auth_backend(
|
||||
Ok(Either::Left(config))
|
||||
}
|
||||
|
||||
AuthBackendType::ControlPlane => {
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
let project_info_cache_config: ProjectInfoCacheOptions =
|
||||
args.project_info_cache.parse()?;
|
||||
let endpoint_cache_config: config::EndpointCacheConfig =
|
||||
args.endpoint_cache_config.parse()?;
|
||||
|
||||
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
|
||||
info!(
|
||||
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
|
||||
);
|
||||
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
|
||||
let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
|
||||
wake_compute_cache_config,
|
||||
project_info_cache_config,
|
||||
endpoint_cache_config,
|
||||
)));
|
||||
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
limiter,
|
||||
epoch,
|
||||
timeout,
|
||||
} = args.wake_compute_lock.parse()?;
|
||||
info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
|
||||
let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
|
||||
"wake_compute_lock",
|
||||
limiter,
|
||||
shards,
|
||||
timeout,
|
||||
epoch,
|
||||
&Metrics::get().wake_compute_lock,
|
||||
)?));
|
||||
tokio::spawn(locks.garbage_collect_worker());
|
||||
|
||||
let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
|
||||
|
||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||
|
||||
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
|
||||
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
|
||||
let wake_compute_endpoint_rate_limiter =
|
||||
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
|
||||
|
||||
let api = control_plane::client::neon::NeonControlPlaneClient::new(
|
||||
endpoint,
|
||||
args.control_plane_token.clone(),
|
||||
caches,
|
||||
locks,
|
||||
wake_compute_endpoint_rate_limiter,
|
||||
);
|
||||
let api = control_plane::client::ControlPlaneClient::Neon(api);
|
||||
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
|
||||
|
||||
let config = Box::leak(Box::new(auth_backend));
|
||||
|
||||
Ok(Either::Left(config))
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
AuthBackendType::Postgres => {
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
|
||||
@@ -115,8 +115,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
|
||||
};
|
||||
if !self.limiter.lock().unwrap().check(subnet_key, 1) {
|
||||
// log only the subnet part of the IP address to know which subnet is rate limited
|
||||
tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
|
||||
tracing::debug!("Rate limit exceeded. Skipping cancellation message");
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.cancellation_requests_total
|
||||
|
||||
@@ -163,36 +163,32 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
|
||||
|
||||
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
|
||||
.await??
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.instrument(cancel_span)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
pub mod cplane_proxy_v1;
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
pub mod mock;
|
||||
pub mod neon;
|
||||
|
||||
use std::hash::Hash;
|
||||
use std::sync::Arc;
|
||||
@@ -27,8 +28,10 @@ use crate::types::EndpointId;
|
||||
#[non_exhaustive]
|
||||
#[derive(Clone)]
|
||||
pub enum ControlPlaneClient {
|
||||
/// Proxy V1 control plane API
|
||||
/// New Proxy V1 control plane API
|
||||
ProxyV1(cplane_proxy_v1::NeonControlPlaneClient),
|
||||
/// Current Management API (V2).
|
||||
Neon(neon::NeonControlPlaneClient),
|
||||
/// Local mock control plane.
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
PostgresMock(mock::MockControlPlane),
|
||||
@@ -46,6 +49,7 @@ impl ControlPlaneApi for ControlPlaneClient {
|
||||
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
|
||||
match self {
|
||||
Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
|
||||
Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
|
||||
#[cfg(test)]
|
||||
@@ -62,6 +66,7 @@ impl ControlPlaneApi for ControlPlaneClient {
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
|
||||
match self {
|
||||
Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
#[cfg(test)]
|
||||
@@ -76,6 +81,7 @@ impl ControlPlaneApi for ControlPlaneClient {
|
||||
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
|
||||
match self {
|
||||
Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
|
||||
Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
|
||||
#[cfg(test)]
|
||||
@@ -90,6 +96,7 @@ impl ControlPlaneApi for ControlPlaneClient {
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
|
||||
match self {
|
||||
Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await,
|
||||
Self::Neon(api) => api.wake_compute(ctx, user_info).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
|
||||
#[cfg(test)]
|
||||
|
||||
511
proxy/src/control_plane/client/neon.rs
Normal file
511
proxy/src/control_plane/client/neon.rs
Normal file
@@ -0,0 +1,511 @@
|
||||
//! Stale console backend, remove after migrating to Proxy V1 API (#15245).
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use ::http::header::AUTHORIZATION;
|
||||
use ::http::HeaderName;
|
||||
use futures::TryFutureExt;
|
||||
use postgres_client::config::SslMode;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
|
||||
use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
|
||||
use crate::auth::backend::jwt::AuthRule;
|
||||
use crate::auth::backend::ComputeUserInfo;
|
||||
use crate::cache::Cached;
|
||||
use crate::context::RequestContext;
|
||||
use crate::control_plane::caches::ApiCaches;
|
||||
use crate::control_plane::errors::{
|
||||
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
|
||||
};
|
||||
use crate::control_plane::locks::ApiLocks;
|
||||
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
|
||||
use crate::control_plane::{
|
||||
AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
|
||||
};
|
||||
use crate::metrics::{CacheOutcome, Metrics};
|
||||
use crate::rate_limiter::WakeComputeRateLimiter;
|
||||
use crate::types::{EndpointCacheKey, EndpointId};
|
||||
use crate::{compute, http, scram};
|
||||
|
||||
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct NeonControlPlaneClient {
|
||||
endpoint: http::Endpoint,
|
||||
pub caches: &'static ApiCaches,
|
||||
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
// put in a shared ref so we don't copy secrets all over in memory
|
||||
jwt: Arc<str>,
|
||||
}
|
||||
|
||||
impl NeonControlPlaneClient {
|
||||
/// Construct an API object containing the auth parameters.
|
||||
pub fn new(
|
||||
endpoint: http::Endpoint,
|
||||
jwt: Arc<str>,
|
||||
caches: &'static ApiCaches,
|
||||
locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
) -> Self {
|
||||
Self {
|
||||
endpoint,
|
||||
caches,
|
||||
locks,
|
||||
wake_compute_endpoint_rate_limiter,
|
||||
jwt,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn url(&self) -> &str {
|
||||
self.endpoint.url().as_str()
|
||||
}
|
||||
|
||||
async fn do_get_auth_info(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
if !self
|
||||
.caches
|
||||
.endpoints_cache
|
||||
.is_valid(ctx, &user_info.endpoint.normalize())
|
||||
{
|
||||
// TODO: refactor this because it's weird
|
||||
// this is a failure to authenticate but we return Ok.
|
||||
info!("endpoint is not valid, skipping the request");
|
||||
return Ok(AuthInfo::default());
|
||||
}
|
||||
let request_id = ctx.session_id().to_string();
|
||||
let application_name = ctx.console_application_name();
|
||||
async {
|
||||
let request = self
|
||||
.endpoint
|
||||
.get_path("proxy_get_role_secret")
|
||||
.header(X_REQUEST_ID, &request_id)
|
||||
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.query(&[
|
||||
("application_name", application_name.as_str()),
|
||||
("project", user_info.endpoint.as_str()),
|
||||
("role", user_info.user.as_str()),
|
||||
])
|
||||
.build()?;
|
||||
|
||||
debug!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
// TODO(anna): retry
|
||||
Err(e) => {
|
||||
return if e.get_reason().is_not_found() {
|
||||
// TODO: refactor this because it's weird
|
||||
// this is a failure to authenticate but we return Ok.
|
||||
Ok(AuthInfo::default())
|
||||
} else {
|
||||
Err(e.into())
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
let secret = if body.role_secret.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let secret = scram::ServerSecret::parse(&body.role_secret)
|
||||
.map(AuthSecret::Scram)
|
||||
.ok_or(GetAuthInfoError::BadSecret)?;
|
||||
Some(secret)
|
||||
};
|
||||
let allowed_ips = body.allowed_ips.unwrap_or_default();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.allowed_ips_number
|
||||
.observe(allowed_ips.len() as f64);
|
||||
Ok(AuthInfo {
|
||||
secret,
|
||||
allowed_ips,
|
||||
project_id: body.project_id,
|
||||
})
|
||||
}
|
||||
.inspect_err(|e| tracing::debug!(error = ?e))
|
||||
.instrument(info_span!("do_get_auth_info"))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_get_endpoint_jwks(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
endpoint: EndpointId,
|
||||
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
|
||||
if !self
|
||||
.caches
|
||||
.endpoints_cache
|
||||
.is_valid(ctx, &endpoint.normalize())
|
||||
{
|
||||
return Err(GetEndpointJwksError::EndpointNotFound);
|
||||
}
|
||||
let request_id = ctx.session_id().to_string();
|
||||
async {
|
||||
let request = self
|
||||
.endpoint
|
||||
.get_with_url(|url| {
|
||||
url.path_segments_mut()
|
||||
.push("endpoints")
|
||||
.push(endpoint.as_str())
|
||||
.push("jwks");
|
||||
})
|
||||
.header(X_REQUEST_ID, &request_id)
|
||||
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.build()
|
||||
.map_err(GetEndpointJwksError::RequestBuild)?;
|
||||
|
||||
debug!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self
|
||||
.endpoint
|
||||
.execute(request)
|
||||
.await
|
||||
.map_err(GetEndpointJwksError::RequestExecute)?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
|
||||
let body = parse_body::<EndpointJwksResponse>(response).await?;
|
||||
|
||||
let rules = body
|
||||
.jwks
|
||||
.into_iter()
|
||||
.map(|jwks| AuthRule {
|
||||
id: jwks.id,
|
||||
jwks_url: jwks.jwks_url,
|
||||
audience: jwks.jwt_audience,
|
||||
role_names: jwks.role_names,
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(rules)
|
||||
}
|
||||
.inspect_err(|e| tracing::debug!(error = ?e))
|
||||
.instrument(info_span!("do_get_endpoint_jwks"))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_wake_compute(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<NodeInfo, WakeComputeError> {
|
||||
let request_id = ctx.session_id().to_string();
|
||||
let application_name = ctx.console_application_name();
|
||||
async {
|
||||
let mut request_builder = self
|
||||
.endpoint
|
||||
.get_path("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.query(&[
|
||||
("application_name", application_name.as_str()),
|
||||
("project", user_info.endpoint.as_str()),
|
||||
]);
|
||||
|
||||
let options = user_info.options.to_deep_object();
|
||||
if !options.is_empty() {
|
||||
request_builder = request_builder.query(&options);
|
||||
}
|
||||
|
||||
let request = request_builder.build()?;
|
||||
|
||||
debug!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = parse_body::<WakeCompute>(response).await?;
|
||||
|
||||
// Unfortunately, ownership won't let us use `Option::ok_or` here.
|
||||
let (host, port) = match parse_host_port(&body.address) {
|
||||
None => return Err(WakeComputeError::BadComputeAddress(body.address)),
|
||||
Some(x) => x,
|
||||
};
|
||||
|
||||
// Don't set anything but host and port! This config will be cached.
|
||||
// We'll set username and such later using the startup message.
|
||||
// TODO: add more type safety (in progress).
|
||||
let mut config = compute::ConnCfg::new(host.to_owned(), port);
|
||||
config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
aux: body.aux,
|
||||
allow_self_signed_compute: false,
|
||||
};
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
.inspect_err(|e| tracing::debug!(error = ?e))
|
||||
.instrument(info_span!("do_wake_compute"))
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
impl super::ControlPlaneApi for NeonControlPlaneClient {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
let normalized_ep = &user_info.endpoint.normalize();
|
||||
let user = &user_info.user;
|
||||
if let Some(role_secret) = self
|
||||
.caches
|
||||
.project_info
|
||||
.get_role_secret(normalized_ep, user)
|
||||
{
|
||||
return Ok(role_secret);
|
||||
}
|
||||
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
||||
if let Some(project_id) = auth_info.project_id {
|
||||
let normalized_ep_int = normalized_ep.into();
|
||||
self.caches.project_info.insert_role_secret(
|
||||
project_id,
|
||||
normalized_ep_int,
|
||||
user.into(),
|
||||
auth_info.secret.clone(),
|
||||
);
|
||||
self.caches.project_info.insert_allowed_ips(
|
||||
project_id,
|
||||
normalized_ep_int,
|
||||
Arc::new(auth_info.allowed_ips),
|
||||
);
|
||||
ctx.set_project_id(project_id);
|
||||
}
|
||||
// When we just got a secret, we don't need to invalidate it.
|
||||
Ok(Cached::new_uncached(auth_info.secret))
|
||||
}
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
let normalized_ep = &user_info.endpoint.normalize();
|
||||
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.allowed_ips_cache_misses
|
||||
.inc(CacheOutcome::Hit);
|
||||
return Ok((allowed_ips, None));
|
||||
}
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.allowed_ips_cache_misses
|
||||
.inc(CacheOutcome::Miss);
|
||||
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
||||
let allowed_ips = Arc::new(auth_info.allowed_ips);
|
||||
let user = &user_info.user;
|
||||
if let Some(project_id) = auth_info.project_id {
|
||||
let normalized_ep_int = normalized_ep.into();
|
||||
self.caches.project_info.insert_role_secret(
|
||||
project_id,
|
||||
normalized_ep_int,
|
||||
user.into(),
|
||||
auth_info.secret.clone(),
|
||||
);
|
||||
self.caches.project_info.insert_allowed_ips(
|
||||
project_id,
|
||||
normalized_ep_int,
|
||||
allowed_ips.clone(),
|
||||
);
|
||||
ctx.set_project_id(project_id);
|
||||
}
|
||||
Ok((
|
||||
Cached::new_uncached(allowed_ips),
|
||||
Some(Cached::new_uncached(auth_info.secret)),
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_endpoint_jwks(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
endpoint: EndpointId,
|
||||
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
|
||||
self.do_get_endpoint_jwks(ctx, endpoint).await
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
let key = user_info.endpoint_cache_key();
|
||||
|
||||
macro_rules! check_cache {
|
||||
() => {
|
||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||
let (cached, info) = cached.take_value();
|
||||
let info = info.map_err(|c| {
|
||||
info!(key = &*key, "found cached wake_compute error");
|
||||
WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
|
||||
})?;
|
||||
|
||||
debug!(key = &*key, "found cached compute node info");
|
||||
ctx.set_project(info.aux.clone());
|
||||
return Ok(cached.map(|()| info));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Every time we do a wakeup http request, the compute node will stay up
|
||||
// for some time (highly depends on the console's scale-to-zero policy);
|
||||
// The connection info remains the same during that period of time,
|
||||
// which means that we might cache it to reduce the load and latency.
|
||||
check_cache!();
|
||||
|
||||
let permit = self.locks.get_permit(&key).await?;
|
||||
|
||||
// after getting back a permit - it's possible the cache was filled
|
||||
// double check
|
||||
if permit.should_check_cache() {
|
||||
// TODO: if there is something in the cache, mark the permit as success.
|
||||
check_cache!();
|
||||
}
|
||||
|
||||
// check rate limit
|
||||
if !self
|
||||
.wake_compute_endpoint_rate_limiter
|
||||
.check(user_info.endpoint.normalize_intern(), 1)
|
||||
{
|
||||
return Err(WakeComputeError::TooManyConnections);
|
||||
}
|
||||
|
||||
let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
|
||||
match node {
|
||||
Ok(node) => {
|
||||
ctx.set_project(node.aux.clone());
|
||||
debug!(key = &*key, "created a cache entry for woken compute node");
|
||||
|
||||
let mut stored_node = node.clone();
|
||||
// store the cached node as 'warm_cached'
|
||||
stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
|
||||
|
||||
let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
|
||||
|
||||
Ok(cached.map(|()| node))
|
||||
}
|
||||
Err(err) => match err {
|
||||
WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
|
||||
let Some(status) = &err.status else {
|
||||
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
|
||||
err,
|
||||
)));
|
||||
};
|
||||
|
||||
let reason = status
|
||||
.details
|
||||
.error_info
|
||||
.map_or(Reason::Unknown, |x| x.reason);
|
||||
|
||||
// if we can retry this error, do not cache it.
|
||||
if reason.can_retry() {
|
||||
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
|
||||
err,
|
||||
)));
|
||||
}
|
||||
|
||||
// at this point, we should only have quota errors.
|
||||
debug!(
|
||||
key = &*key,
|
||||
"created a cache entry for the wake compute error"
|
||||
);
|
||||
|
||||
self.caches.node_info.insert_ttl(
|
||||
key,
|
||||
Err(err.clone()),
|
||||
Duration::from_secs(30),
|
||||
);
|
||||
|
||||
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
|
||||
err,
|
||||
)))
|
||||
}
|
||||
err => return Err(err),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse http response body, taking status code into account.
|
||||
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
response: http::Response,
|
||||
) -> Result<T, ControlPlaneError> {
|
||||
let status = response.status();
|
||||
if status.is_success() {
|
||||
// We shouldn't log raw body because it may contain secrets.
|
||||
info!("request succeeded, processing the body");
|
||||
return Ok(response.json().await?);
|
||||
}
|
||||
let s = response.bytes().await?;
|
||||
// Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
|
||||
info!("response_error plaintext: {:?}", s);
|
||||
|
||||
// Don't throw an error here because it's not as important
|
||||
// as the fact that the request itself has failed.
|
||||
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
|
||||
warn!("failed to parse error body: {e}");
|
||||
ControlPlaneErrorMessage {
|
||||
error: "reason unclear (malformed error message)".into(),
|
||||
http_status_code: status,
|
||||
status: None,
|
||||
}
|
||||
});
|
||||
body.http_status_code = status;
|
||||
|
||||
warn!("console responded with an error ({status}): {body:?}");
|
||||
Err(ControlPlaneError::Message(Box::new(body)))
|
||||
}
|
||||
|
||||
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
|
||||
let (host, port) = input.rsplit_once(':')?;
|
||||
let ipv6_brackets: &[_] = &['[', ']'];
|
||||
Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_host_port_v4() {
|
||||
let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
|
||||
assert_eq!(host, "127.0.0.1");
|
||||
assert_eq!(port, 5432);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_host_port_v6() {
|
||||
let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
|
||||
assert_eq!(host, "2001:db8::1");
|
||||
assert_eq!(port, 5432);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_host_port_url() {
|
||||
let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
|
||||
.expect("failed to parse");
|
||||
assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
|
||||
assert_eq!(port, 5432);
|
||||
}
|
||||
}
|
||||
@@ -221,6 +221,15 @@ pub(crate) struct UserFacingMessage {
|
||||
pub(crate) message: Box<str>,
|
||||
}
|
||||
|
||||
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
|
||||
/// Returned by the `/proxy_get_role_secret` API method.
|
||||
#[derive(Deserialize)]
|
||||
pub(crate) struct GetRoleSecret {
|
||||
pub(crate) role_secret: Box<str>,
|
||||
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
|
||||
pub(crate) project_id: Option<ProjectIdInt>,
|
||||
}
|
||||
|
||||
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
|
||||
/// Returned by the `/get_endpoint_access_control` API method.
|
||||
#[derive(Deserialize)]
|
||||
@@ -231,6 +240,13 @@ pub(crate) struct GetEndpointAccessControl {
|
||||
pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
impl fmt::Debug for GetRoleSecret {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("GetRoleSecret").finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
/// Response which holds compute node's `host:port` pair.
|
||||
/// Returned by the `/proxy_wake_compute` API method.
|
||||
#[derive(Debug, Deserialize)]
|
||||
@@ -461,18 +477,18 @@ mod tests {
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
});
|
||||
serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
|
||||
serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
"allowed_ips": ["8.8.8.8"],
|
||||
});
|
||||
serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
|
||||
serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
"allowed_ips": ["8.8.8.8"],
|
||||
"project_id": "project",
|
||||
});
|
||||
serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
|
||||
serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -272,36 +272,32 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
|
||||
|
||||
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
|
||||
.await??
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.instrument(cancel_span)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
@@ -13,7 +13,6 @@ use crate::cache::project_info::ProjectInfoCache;
|
||||
use crate::cancellation::{CancelMap, CancellationHandler};
|
||||
use crate::intern::{ProjectIdInt, RoleNameInt};
|
||||
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
|
||||
use tracing::Instrument;
|
||||
|
||||
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
|
||||
@@ -144,8 +143,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
let peer_addr = cancel_session
|
||||
.peer_addr
|
||||
.unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED));
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id);
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
|
||||
match self
|
||||
.cancellation_handler
|
||||
@@ -155,7 +152,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
peer_addr,
|
||||
cancel_session.peer_addr.is_some(),
|
||||
)
|
||||
.instrument(cancel_span)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
|
||||
@@ -83,20 +83,14 @@ impl Env {
|
||||
node_id: NodeId,
|
||||
ttid: TenantTimelineId,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let conf = Arc::new(self.make_conf(node_id));
|
||||
let conf = self.make_conf(node_id);
|
||||
let timeline_dir = get_timeline_dir(&conf, &ttid);
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
|
||||
let safekeeper = self.make_safekeeper(node_id, ttid).await?;
|
||||
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
|
||||
|
||||
let timeline = Timeline::new(
|
||||
ttid,
|
||||
&timeline_dir,
|
||||
&remote_path,
|
||||
shared_state,
|
||||
conf.clone(),
|
||||
);
|
||||
let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
|
||||
timeline.bootstrap(
|
||||
&mut timeline.write_shared_state().await,
|
||||
&conf,
|
||||
|
||||
@@ -338,7 +338,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
let conf = Arc::new(SafeKeeperConf {
|
||||
let conf = SafeKeeperConf {
|
||||
workdir,
|
||||
my_id: id,
|
||||
listen_pg_addr: args.listen_pg,
|
||||
@@ -368,7 +368,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
control_file_save_interval: args.control_file_save_interval,
|
||||
partial_backup_concurrency: args.partial_backup_concurrency,
|
||||
eviction_min_resident: args.eviction_min_resident,
|
||||
});
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(
|
||||
@@ -382,7 +382,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
/// complete, e.g. panicked, inner is error produced by task itself.
|
||||
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
|
||||
|
||||
async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// fsync the datadir to make sure we have a consistent state on disk.
|
||||
if !conf.no_sync {
|
||||
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
|
||||
@@ -428,11 +428,9 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
e
|
||||
})?;
|
||||
|
||||
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
|
||||
|
||||
// Register metrics collector for active timelines. It's important to do this
|
||||
// after daemonizing, otherwise process collector will be upset.
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
wal_backup::init_remote_storage(&conf).await;
|
||||
@@ -449,8 +447,9 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
.then(|| Handle::try_current().expect("no runtime in main"));
|
||||
|
||||
// Load all timelines from disk to memory.
|
||||
global_timelines.init().await?;
|
||||
GlobalTimelines::init(conf.clone()).await?;
|
||||
|
||||
let conf_ = conf.clone();
|
||||
// Run everything in current thread rt, if asked.
|
||||
if conf.current_thread_runtime {
|
||||
info!("running in current thread runtime");
|
||||
@@ -460,16 +459,14 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
.spawn(wal_service::task_main(
|
||||
conf.clone(),
|
||||
conf_,
|
||||
pg_listener,
|
||||
Scope::SafekeeperData,
|
||||
global_timelines.clone(),
|
||||
))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_service_handle));
|
||||
|
||||
let global_timelines_ = global_timelines.clone();
|
||||
let timeline_housekeeping_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
@@ -477,45 +474,40 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
|
||||
loop {
|
||||
tokio::time::sleep(TOMBSTONE_TTL).await;
|
||||
global_timelines_.housekeeping(&TOMBSTONE_TTL);
|
||||
GlobalTimelines::housekeeping(&TOMBSTONE_TTL);
|
||||
}
|
||||
})
|
||||
.map(|res| ("Timeline map housekeeping".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(timeline_housekeeping_handle));
|
||||
|
||||
if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
|
||||
let conf_ = conf.clone();
|
||||
let wal_service_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
.spawn(wal_service::task_main(
|
||||
conf.clone(),
|
||||
conf_,
|
||||
pg_listener_tenant_only,
|
||||
Scope::Tenant,
|
||||
global_timelines.clone(),
|
||||
))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service tenant only main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_service_handle));
|
||||
}
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let http_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| HTTP_RUNTIME.handle())
|
||||
.spawn(http::task_main(
|
||||
conf.clone(),
|
||||
http_listener,
|
||||
global_timelines.clone(),
|
||||
))
|
||||
.spawn(http::task_main(conf_, http_listener))
|
||||
.map(|res| ("HTTP service main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(http_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let broker_task_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| BROKER_RUNTIME.handle())
|
||||
.spawn(
|
||||
broker::task_main(conf.clone(), global_timelines.clone())
|
||||
.instrument(info_span!("broker")),
|
||||
)
|
||||
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
|
||||
.map(|res| ("broker main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(broker_task_handle));
|
||||
|
||||
|
||||
@@ -39,17 +39,14 @@ const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||
|
||||
/// Push once in a while data about all active timelines to the broker.
|
||||
async fn push_loop(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
if conf.disable_periodic_broker_push {
|
||||
info!("broker push_loop is disabled, doing nothing...");
|
||||
futures::future::pending::<()>().await; // sleep forever
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let active_timelines_set = global_timelines.get_global_broker_active_set();
|
||||
let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
|
||||
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
@@ -90,13 +87,8 @@ async fn push_loop(
|
||||
|
||||
/// Subscribe and fetch all the interesting data from the broker.
|
||||
#[instrument(name = "broker_pull", skip_all)]
|
||||
async fn pull_loop(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
stats: Arc<BrokerStats>,
|
||||
) -> Result<()> {
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
|
||||
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
|
||||
|
||||
// TODO: subscribe only to local timelines instead of all
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
@@ -121,7 +113,7 @@ async fn pull_loop(
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
|
||||
let ttid = parse_proto_ttid(proto_ttid)?;
|
||||
if let Ok(tli) = global_timelines.get(ttid) {
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
// Note that we also receive *our own* info. That's
|
||||
// important, as it is used as an indication of live
|
||||
// connection to the broker.
|
||||
@@ -143,11 +135,7 @@ async fn pull_loop(
|
||||
|
||||
/// Process incoming discover requests. This is done in a separate task to avoid
|
||||
/// interfering with the normal pull/push loops.
|
||||
async fn discover_loop(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
stats: Arc<BrokerStats>,
|
||||
) -> Result<()> {
|
||||
async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
|
||||
@@ -183,7 +171,7 @@ async fn discover_loop(
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
|
||||
let ttid = parse_proto_ttid(proto_ttid)?;
|
||||
if let Ok(tli) = global_timelines.get(ttid) {
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
// we received a discovery request for a timeline we know about
|
||||
discover_counter.inc();
|
||||
|
||||
@@ -222,10 +210,7 @@ async fn discover_loop(
|
||||
bail!("end of stream");
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
info!("started, broker endpoint {:?}", conf.broker_endpoint);
|
||||
|
||||
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
|
||||
@@ -276,13 +261,13 @@ pub async fn task_main(
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
if push_handle.is_none() {
|
||||
push_handle = Some(tokio::spawn(push_loop(conf.clone(), global_timelines.clone())));
|
||||
push_handle = Some(tokio::spawn(push_loop(conf.clone())));
|
||||
}
|
||||
if pull_handle.is_none() {
|
||||
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), global_timelines.clone(), stats.clone())));
|
||||
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
|
||||
}
|
||||
if discover_handle.is_none() {
|
||||
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), global_timelines.clone(), stats.clone())));
|
||||
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
|
||||
}
|
||||
},
|
||||
_ = &mut stats_task => {}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use std::sync::Arc;
|
||||
use tokio::{
|
||||
fs::OpenOptions,
|
||||
io::{AsyncSeekExt, AsyncWriteExt},
|
||||
@@ -12,7 +14,7 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
use crate::{
|
||||
control_file::FileStorage,
|
||||
state::TimelinePersistentState,
|
||||
timeline::{TimelineError, WalResidentTimeline},
|
||||
timeline::{Timeline, TimelineError, WalResidentTimeline},
|
||||
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
|
||||
wal_backup::copy_s3_segments,
|
||||
wal_storage::{wal_file_paths, WalReader},
|
||||
@@ -23,19 +25,16 @@ use crate::{
|
||||
const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
|
||||
|
||||
pub struct Request {
|
||||
pub source_ttid: TenantTimelineId,
|
||||
pub source: Arc<Timeline>,
|
||||
pub until_lsn: Lsn,
|
||||
pub destination_ttid: TenantTimelineId,
|
||||
}
|
||||
|
||||
pub async fn handle_request(
|
||||
request: Request,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<()> {
|
||||
pub async fn handle_request(request: Request) -> Result<()> {
|
||||
// TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
|
||||
// if LSN will point to the middle of a WAL record, timeline will be in "broken" state
|
||||
|
||||
match global_timelines.get(request.destination_ttid) {
|
||||
match GlobalTimelines::get(request.destination_ttid) {
|
||||
// timeline already exists. would be good to check that this timeline is the copy
|
||||
// of the source timeline, but it isn't obvious how to do that
|
||||
Ok(_) => return Ok(()),
|
||||
@@ -47,10 +46,9 @@ pub async fn handle_request(
|
||||
}
|
||||
}
|
||||
|
||||
let source = global_timelines.get(request.source_ttid)?;
|
||||
let source_tli = source.wal_residence_guard().await?;
|
||||
let source_tli = request.source.wal_residence_guard().await?;
|
||||
|
||||
let conf = &global_timelines.get_global_config();
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
let ttid = request.destination_ttid;
|
||||
|
||||
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
||||
@@ -129,7 +127,7 @@ pub async fn handle_request(
|
||||
|
||||
copy_s3_segments(
|
||||
wal_seg_size,
|
||||
&request.source_ttid,
|
||||
&request.source.ttid,
|
||||
&request.destination_ttid,
|
||||
first_segment,
|
||||
first_ondisk_segment,
|
||||
@@ -160,9 +158,7 @@ pub async fn handle_request(
|
||||
|
||||
// now we have a ready timeline in a temp directory
|
||||
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
|
||||
global_timelines
|
||||
.load_temp_timeline(request.destination_ttid, &tli_dir_path, true)
|
||||
.await?;
|
||||
GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -207,23 +207,23 @@ pub struct FileInfo {
|
||||
}
|
||||
|
||||
/// Build debug dump response, using the provided [`Args`] filters.
|
||||
pub async fn build(args: Args, global_timelines: Arc<GlobalTimelines>) -> Result<Response> {
|
||||
pub async fn build(args: Args) -> Result<Response> {
|
||||
let start_time = Utc::now();
|
||||
let timelines_count = global_timelines.timelines_count();
|
||||
let config = global_timelines.get_global_config();
|
||||
let timelines_count = GlobalTimelines::timelines_count();
|
||||
let config = GlobalTimelines::get_global_config();
|
||||
|
||||
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
|
||||
// If both tenant_id and timeline_id are specified, we can just get the
|
||||
// timeline directly, without taking a snapshot of the whole list.
|
||||
let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap());
|
||||
if let Ok(tli) = global_timelines.get(ttid) {
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
vec![tli]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
// Otherwise, take a snapshot of the whole list.
|
||||
global_timelines.get_all()
|
||||
GlobalTimelines::get_all()
|
||||
};
|
||||
|
||||
let mut timelines = Vec::new();
|
||||
@@ -344,12 +344,12 @@ fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
|
||||
|
||||
/// Converts SafeKeeperConf to Config, filtering out the fields that are not
|
||||
/// supposed to be exposed.
|
||||
fn build_config(config: Arc<SafeKeeperConf>) -> Config {
|
||||
fn build_config(config: SafeKeeperConf) -> Config {
|
||||
Config {
|
||||
id: config.my_id,
|
||||
workdir: config.workdir.clone().into(),
|
||||
listen_pg_addr: config.listen_pg_addr.clone(),
|
||||
listen_http_addr: config.listen_http_addr.clone(),
|
||||
workdir: config.workdir.into(),
|
||||
listen_pg_addr: config.listen_pg_addr,
|
||||
listen_http_addr: config.listen_http_addr,
|
||||
no_sync: config.no_sync,
|
||||
max_offloader_lag_bytes: config.max_offloader_lag_bytes,
|
||||
wal_backup_enabled: config.wal_backup_enabled,
|
||||
|
||||
@@ -33,7 +33,7 @@ use utils::{
|
||||
|
||||
/// Safekeeper handler of postgres commands
|
||||
pub struct SafekeeperPostgresHandler {
|
||||
pub conf: Arc<SafeKeeperConf>,
|
||||
pub conf: SafeKeeperConf,
|
||||
/// assigned application name
|
||||
pub appname: Option<String>,
|
||||
pub tenant_id: Option<TenantId>,
|
||||
@@ -43,7 +43,6 @@ pub struct SafekeeperPostgresHandler {
|
||||
pub protocol: Option<PostgresClientProtocol>,
|
||||
/// Unique connection id is logged in spans for observability.
|
||||
pub conn_id: ConnectionId,
|
||||
pub global_timelines: Arc<GlobalTimelines>,
|
||||
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
|
||||
auth: Option<(Scope, Arc<JwtAuth>)>,
|
||||
claims: Option<Claims>,
|
||||
@@ -315,11 +314,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
pub fn new(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conf: SafeKeeperConf,
|
||||
conn_id: u32,
|
||||
io_metrics: Option<TrafficMetrics>,
|
||||
auth: Option<(Scope, Arc<JwtAuth>)>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Self {
|
||||
SafekeeperPostgresHandler {
|
||||
conf,
|
||||
@@ -333,7 +331,6 @@ impl SafekeeperPostgresHandler {
|
||||
claims: None,
|
||||
auth,
|
||||
io_metrics,
|
||||
global_timelines,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -363,7 +360,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
// Get timeline, handling "not found" error
|
||||
let tli = match self.global_timelines.get(self.ttid) {
|
||||
let tli = match GlobalTimelines::get(self.ttid) {
|
||||
Ok(tli) => Ok(Some(tli)),
|
||||
Err(TimelineError::NotFound(_)) => Ok(None),
|
||||
Err(e) => Err(QueryError::Other(e.into())),
|
||||
@@ -397,10 +394,7 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.get(self.ttid)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
|
||||
let lsn = if self.is_walproposer_recovery() {
|
||||
// walproposer should get all local WAL until flush_lsn
|
||||
|
||||
@@ -3,16 +3,14 @@ pub mod routes;
|
||||
pub use routes::make_router;
|
||||
|
||||
pub use safekeeper_api::models;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
pub async fn task_main(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conf: SafeKeeperConf,
|
||||
http_listener: std::net::TcpListener,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
let router = make_router(conf, global_timelines)
|
||||
let router = make_router(conf)
|
||||
.build()
|
||||
.map_err(|err| anyhow::anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
|
||||
@@ -66,13 +66,6 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
fn get_global_timelines(request: &Request<Body>) -> Arc<GlobalTimelines> {
|
||||
request
|
||||
.data::<Arc<GlobalTimelines>>()
|
||||
.expect("unknown state type")
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
@@ -130,11 +123,9 @@ async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
|
||||
// Using an `InternalServerError` should be fixed when the types support it
|
||||
let delete_info = global_timelines
|
||||
.delete_force_all_for_tenant(&tenant_id, only_local)
|
||||
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(
|
||||
@@ -165,9 +156,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.commit_lsn
|
||||
.segment_lsn(server_info.wal_seg_size as usize)
|
||||
});
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
global_timelines
|
||||
.create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -178,9 +167,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
/// Note: it is possible to do the same with debug_dump.
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let res: Vec<TenantTimelineId> = global_timelines
|
||||
.get_all()
|
||||
let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
|
||||
.iter()
|
||||
.map(|tli| tli.ttid)
|
||||
.collect();
|
||||
@@ -195,8 +182,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let (inmem, state) = tli.get_state().await;
|
||||
let flush_lsn = tli.get_flush_lsn().await;
|
||||
|
||||
@@ -247,11 +233,9 @@ async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<
|
||||
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
|
||||
// error handling here when we're able to.
|
||||
let resp = global_timelines
|
||||
.delete(&ttid, only_local)
|
||||
let resp = GlobalTimelines::delete(&ttid, only_local)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
@@ -263,9 +247,8 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let data: pull_timeline::Request = json_request(&mut request).await?;
|
||||
let conf = get_conf(&request);
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
|
||||
let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone(), global_timelines)
|
||||
let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone())
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
@@ -280,8 +263,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
|
||||
// so create the chan and write to it in another task.
|
||||
@@ -311,19 +293,19 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let request_data: TimelineCopyRequest = json_request(&mut request).await?;
|
||||
let source_ttid = TenantTimelineId::new(
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "source_timeline_id")?,
|
||||
);
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let source = GlobalTimelines::get(ttid)?;
|
||||
|
||||
copy_timeline::handle_request(copy_timeline::Request{
|
||||
source_ttid,
|
||||
source,
|
||||
until_lsn: request_data.until_lsn,
|
||||
destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
|
||||
}, global_timelines)
|
||||
.instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
|
||||
destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
|
||||
})
|
||||
.instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -340,8 +322,7 @@ async fn patch_control_file_handler(
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let patch_request: patch_control_file::Request = json_request(&mut request).await?;
|
||||
let response = patch_control_file::handle_request(tli, patch_request)
|
||||
@@ -360,8 +341,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid)?;
|
||||
let tli = GlobalTimelines::get(ttid)?;
|
||||
tli.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
@@ -379,7 +359,6 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
|
||||
let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
|
||||
|
||||
@@ -392,7 +371,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
)))?,
|
||||
};
|
||||
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = tli
|
||||
.wal_residence_guard()
|
||||
.await
|
||||
@@ -414,8 +393,7 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let response = tli
|
||||
.backup_partial_reset()
|
||||
@@ -437,8 +415,7 @@ async fn timeline_term_bump_handler(
|
||||
|
||||
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let response = tli
|
||||
.term_bump(request_data.term)
|
||||
.await
|
||||
@@ -475,8 +452,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
standby_horizon: sk_info.standby_horizon.0,
|
||||
};
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
tli.record_safekeeper_info(proto_sk_info)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -530,8 +506,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
let dump_term_history = dump_term_history.unwrap_or(true);
|
||||
let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
|
||||
let args = debug_dump::Args {
|
||||
dump_all,
|
||||
dump_control_file,
|
||||
@@ -543,7 +517,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let resp = debug_dump::build(args, global_timelines)
|
||||
let resp = debug_dump::build(args)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -596,10 +570,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
/// Safekeeper http router.
|
||||
pub fn make_router(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
if conf.http_auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
@@ -621,8 +592,7 @@ pub fn make_router(
|
||||
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
|
||||
let auth = conf.http_auth.clone();
|
||||
router
|
||||
.data(conf)
|
||||
.data(global_timelines)
|
||||
.data(Arc::new(conf))
|
||||
.data(auth)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
|
||||
@@ -11,6 +11,7 @@ use postgres_backend::QueryError;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
|
||||
@@ -20,6 +21,7 @@ use crate::safekeeper::{
|
||||
use crate::safekeeper::{Term, TermHistory, TermLsn};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::GlobalTimelines;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_ffi::encode_logical_message;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
@@ -68,7 +70,7 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
info!("JSON_CTRL request: {append_request:?}");
|
||||
|
||||
// need to init safekeeper state before AppendRequest
|
||||
let tli = prepare_safekeeper(spg, append_request.pg_version).await?;
|
||||
let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?;
|
||||
|
||||
// if send_proposer_elected is true, we need to update local history
|
||||
if append_request.send_proposer_elected {
|
||||
@@ -97,22 +99,20 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
/// Prepare safekeeper to process append requests without crashes,
|
||||
/// by sending ProposerGreeting with default server.wal_seg_size.
|
||||
async fn prepare_safekeeper(
|
||||
spg: &SafekeeperPostgresHandler,
|
||||
ttid: TenantTimelineId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<WalResidentTimeline> {
|
||||
let tli = spg
|
||||
.global_timelines
|
||||
.create(
|
||||
spg.ttid,
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
system_id: 0,
|
||||
},
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await?;
|
||||
let tli = GlobalTimelines::create(
|
||||
ttid,
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
system_id: 0,
|
||||
},
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await?;
|
||||
|
||||
tli.wal_residence_guard().await
|
||||
}
|
||||
|
||||
@@ -455,7 +455,6 @@ pub struct FullTimelineInfo {
|
||||
|
||||
/// Collects metrics for all active timelines.
|
||||
pub struct TimelineCollector {
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
descs: Vec<Desc>,
|
||||
commit_lsn: GenericGaugeVec<AtomicU64>,
|
||||
backup_lsn: GenericGaugeVec<AtomicU64>,
|
||||
@@ -479,8 +478,14 @@ pub struct TimelineCollector {
|
||||
active_timelines_count: IntGauge,
|
||||
}
|
||||
|
||||
impl Default for TimelineCollector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl TimelineCollector {
|
||||
pub fn new(global_timelines: Arc<GlobalTimelines>) -> TimelineCollector {
|
||||
pub fn new() -> TimelineCollector {
|
||||
let mut descs = Vec::new();
|
||||
|
||||
let commit_lsn = GenericGaugeVec::new(
|
||||
@@ -671,7 +676,6 @@ impl TimelineCollector {
|
||||
descs.extend(active_timelines_count.desc().into_iter().cloned());
|
||||
|
||||
TimelineCollector {
|
||||
global_timelines,
|
||||
descs,
|
||||
commit_lsn,
|
||||
backup_lsn,
|
||||
@@ -724,18 +728,17 @@ impl Collector for TimelineCollector {
|
||||
self.written_wal_seconds.reset();
|
||||
self.flushed_wal_seconds.reset();
|
||||
|
||||
let timelines_count = self.global_timelines.get_all().len();
|
||||
let timelines_count = GlobalTimelines::get_all().len();
|
||||
let mut active_timelines_count = 0;
|
||||
|
||||
// Prometheus Collector is sync, and data is stored under async lock. To
|
||||
// bridge the gap with a crutch, collect data in spawned thread with
|
||||
// local tokio runtime.
|
||||
let global_timelines = self.global_timelines.clone();
|
||||
let infos = std::thread::spawn(|| {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
rt.block_on(collect_timeline_metrics(global_timelines))
|
||||
rt.block_on(collect_timeline_metrics())
|
||||
})
|
||||
.join()
|
||||
.expect("collect_timeline_metrics thread panicked");
|
||||
@@ -854,9 +857,9 @@ impl Collector for TimelineCollector {
|
||||
}
|
||||
}
|
||||
|
||||
async fn collect_timeline_metrics(global_timelines: Arc<GlobalTimelines>) -> Vec<FullTimelineInfo> {
|
||||
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
|
||||
let mut res = vec![];
|
||||
let active_timelines = global_timelines.get_global_broker_active_set().get_all();
|
||||
let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
|
||||
|
||||
for tli in active_timelines {
|
||||
if let Some(info) = tli.info_for_metrics().await {
|
||||
|
||||
@@ -409,9 +409,8 @@ pub struct DebugDumpResponse {
|
||||
pub async fn handle_request(
|
||||
request: Request,
|
||||
sk_auth_token: Option<SecretString>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<Response> {
|
||||
let existing_tli = global_timelines.get(TenantTimelineId::new(
|
||||
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
|
||||
request.tenant_id,
|
||||
request.timeline_id,
|
||||
));
|
||||
@@ -454,14 +453,13 @@ pub async fn handle_request(
|
||||
assert!(status.tenant_id == request.tenant_id);
|
||||
assert!(status.timeline_id == request.timeline_id);
|
||||
|
||||
pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await
|
||||
pull_timeline(status, safekeeper_host, sk_auth_token).await
|
||||
}
|
||||
|
||||
async fn pull_timeline(
|
||||
status: TimelineStatus,
|
||||
host: String,
|
||||
sk_auth_token: Option<SecretString>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<Response> {
|
||||
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
|
||||
info!(
|
||||
@@ -474,7 +472,7 @@ async fn pull_timeline(
|
||||
status.acceptor_state.epoch
|
||||
);
|
||||
|
||||
let conf = &global_timelines.get_global_config();
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
|
||||
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
||||
|
||||
@@ -533,9 +531,7 @@ async fn pull_timeline(
|
||||
assert!(status.commit_lsn <= status.flush_lsn);
|
||||
|
||||
// Finally, load the timeline.
|
||||
let _tli = global_timelines
|
||||
.load_temp_timeline(ttid, &tli_dir_path, false)
|
||||
.await?;
|
||||
let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?;
|
||||
|
||||
Ok(Response {
|
||||
safekeeper_host: host,
|
||||
|
||||
@@ -267,7 +267,6 @@ impl SafekeeperPostgresHandler {
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
global_timelines: self.global_timelines.clone(),
|
||||
};
|
||||
|
||||
// Read first message and create timeline if needed.
|
||||
@@ -332,7 +331,6 @@ struct NetworkReader<'a, IO> {
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
}
|
||||
|
||||
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
@@ -352,11 +350,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
let tli =
|
||||
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
_ => {
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::WalReaderStreamBuilder;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::wal_storage::WalReader;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{bail, Context as AnyhowContext};
|
||||
use bytes::Bytes;
|
||||
use futures::future::Either;
|
||||
@@ -399,10 +400,7 @@ impl SafekeeperPostgresHandler {
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.get(self.ttid)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
let residence_guard = tli.wal_residence_guard().await?;
|
||||
|
||||
if let Err(end) = self
|
||||
|
||||
@@ -44,8 +44,8 @@ use crate::wal_backup_partial::PartialRemoteSegment;
|
||||
|
||||
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
|
||||
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{debug_dump, timeline_manager, wal_storage};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -467,7 +467,6 @@ pub struct Timeline {
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
manager_ctl: ManagerCtl,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
|
||||
/// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding
|
||||
/// this gate, you must respect [`Timeline::cancel`]
|
||||
@@ -490,7 +489,6 @@ impl Timeline {
|
||||
timeline_dir: &Utf8Path,
|
||||
remote_path: &RemotePath,
|
||||
shared_state: SharedState,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
) -> Arc<Self> {
|
||||
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
|
||||
watch::channel(shared_state.sk.state().commit_lsn);
|
||||
@@ -518,7 +516,6 @@ impl Timeline {
|
||||
gate: Default::default(),
|
||||
cancel: CancellationToken::default(),
|
||||
manager_ctl: ManagerCtl::new(),
|
||||
conf,
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
last_removed_segno: AtomicU64::new(0),
|
||||
@@ -527,14 +524,11 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Load existing timeline from disk.
|
||||
pub fn load_timeline(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
ttid: TenantTimelineId,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
|
||||
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
|
||||
|
||||
let shared_state = SharedState::restore(conf.as_ref(), &ttid)?;
|
||||
let timeline_dir = get_timeline_dir(conf.as_ref(), &ttid);
|
||||
let shared_state = SharedState::restore(conf, &ttid)?;
|
||||
let timeline_dir = get_timeline_dir(conf, &ttid);
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
|
||||
Ok(Timeline::new(
|
||||
@@ -542,7 +536,6 @@ impl Timeline {
|
||||
&timeline_dir,
|
||||
&remote_path,
|
||||
shared_state,
|
||||
conf,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -611,7 +604,8 @@ impl Timeline {
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.close_wal_store();
|
||||
|
||||
if !only_local && self.conf.is_wal_backup_enabled() {
|
||||
let conf = GlobalTimelines::get_global_config();
|
||||
if !only_local && conf.is_wal_backup_enabled() {
|
||||
// Note: we concurrently delete remote storage data from multiple
|
||||
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
|
||||
// do some retries anyway.
|
||||
@@ -957,7 +951,7 @@ impl WalResidentTimeline {
|
||||
|
||||
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
|
||||
let (_, persisted_state) = self.get_state().await;
|
||||
let enable_remote_read = self.conf.is_wal_backup_enabled();
|
||||
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
|
||||
|
||||
WalReader::new(
|
||||
&self.ttid,
|
||||
@@ -1067,6 +1061,7 @@ impl ManagerTimeline {
|
||||
|
||||
/// Try to switch state Offloaded->Present.
|
||||
pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> {
|
||||
let conf = GlobalTimelines::get_global_config();
|
||||
let mut shared = self.write_shared_state().await;
|
||||
|
||||
// trying to restore WAL storage
|
||||
@@ -1074,7 +1069,7 @@ impl ManagerTimeline {
|
||||
&self.ttid,
|
||||
&self.timeline_dir,
|
||||
shared.sk.state(),
|
||||
self.conf.no_sync,
|
||||
conf.no_sync,
|
||||
)?;
|
||||
|
||||
// updating control file
|
||||
@@ -1101,7 +1096,7 @@ impl ManagerTimeline {
|
||||
// now we can switch shared.sk to Present, shouldn't fail
|
||||
let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty);
|
||||
let cfile_state = prev_sk.take_state();
|
||||
shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, self.conf.my_id)?);
|
||||
shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
@@ -41,16 +42,23 @@ struct GlobalTimelinesState {
|
||||
// this map is dropped on restart.
|
||||
tombstones: HashMap<TenantTimelineId, Instant>,
|
||||
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conf: Option<SafeKeeperConf>,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
}
|
||||
|
||||
impl GlobalTimelinesState {
|
||||
/// Get configuration, which must be set once during init.
|
||||
fn get_conf(&self) -> &SafeKeeperConf {
|
||||
self.conf
|
||||
.as_ref()
|
||||
.expect("GlobalTimelinesState conf is not initialized")
|
||||
}
|
||||
|
||||
/// Get dependencies for a timeline constructor.
|
||||
fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
|
||||
fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>, RateLimiter) {
|
||||
(
|
||||
self.conf.clone(),
|
||||
self.get_conf().clone(),
|
||||
self.broker_active_set.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
)
|
||||
@@ -74,39 +82,35 @@ impl GlobalTimelinesState {
|
||||
}
|
||||
}
|
||||
|
||||
/// A struct used to manage access to the global timelines map.
|
||||
pub struct GlobalTimelines {
|
||||
state: Mutex<GlobalTimelinesState>,
|
||||
}
|
||||
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
|
||||
Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
tombstones: HashMap::new(),
|
||||
conf: None,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
})
|
||||
});
|
||||
|
||||
/// A zero-sized struct used to manage access to the global timelines map.
|
||||
pub struct GlobalTimelines;
|
||||
|
||||
impl GlobalTimelines {
|
||||
/// Create a new instance of the global timelines map.
|
||||
pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
|
||||
Self {
|
||||
state: Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
tombstones: HashMap::new(),
|
||||
conf,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
|
||||
pub async fn init(&self) -> Result<()> {
|
||||
pub async fn init(conf: SafeKeeperConf) -> Result<()> {
|
||||
// clippy isn't smart enough to understand that drop(state) releases the
|
||||
// lock, so use explicit block
|
||||
let tenants_dir = {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
state.global_rate_limiter = RateLimiter::new(
|
||||
state.conf.partial_backup_concurrency,
|
||||
conf.partial_backup_concurrency,
|
||||
DEFAULT_EVICTION_CONCURRENCY,
|
||||
);
|
||||
state.conf = Some(conf);
|
||||
|
||||
// Iterate through all directories and load tenants for all directories
|
||||
// named as a valid tenant_id.
|
||||
state.conf.workdir.clone()
|
||||
state.get_conf().workdir.clone()
|
||||
};
|
||||
let mut tenant_count = 0;
|
||||
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
@@ -118,7 +122,7 @@ impl GlobalTimelines {
|
||||
TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
|
||||
{
|
||||
tenant_count += 1;
|
||||
self.load_tenant_timelines(tenant_id).await?;
|
||||
GlobalTimelines::load_tenant_timelines(tenant_id).await?;
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
@@ -131,7 +135,7 @@ impl GlobalTimelines {
|
||||
info!(
|
||||
"found {} tenants directories, successfully loaded {} timelines",
|
||||
tenant_count,
|
||||
self.state.lock().unwrap().timelines.len()
|
||||
TIMELINES_STATE.lock().unwrap().timelines.len()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -139,13 +143,13 @@ impl GlobalTimelines {
|
||||
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
|
||||
/// errors if any.
|
||||
///
|
||||
/// It is async, but self.state lock is sync and there is no important
|
||||
/// It is async, but TIMELINES_STATE lock is sync and there is no important
|
||||
/// reason to make it async (it is always held for a short while), so we
|
||||
/// just lock and unlock it for each timeline -- this function is called
|
||||
/// during init when nothing else is running, so this is fine.
|
||||
async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
|
||||
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
state.get_dependencies()
|
||||
};
|
||||
|
||||
@@ -159,10 +163,10 @@ impl GlobalTimelines {
|
||||
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
|
||||
{
|
||||
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
match Timeline::load_timeline(conf.clone(), ttid) {
|
||||
match Timeline::load_timeline(&conf, ttid) {
|
||||
Ok(tli) => {
|
||||
let mut shared_state = tli.write_shared_state().await;
|
||||
self.state
|
||||
TIMELINES_STATE
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
@@ -196,30 +200,29 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Get the number of timelines in the map.
|
||||
pub fn timelines_count(&self) -> usize {
|
||||
self.state.lock().unwrap().timelines.len()
|
||||
pub fn timelines_count() -> usize {
|
||||
TIMELINES_STATE.lock().unwrap().timelines.len()
|
||||
}
|
||||
|
||||
/// Get the global safekeeper config.
|
||||
pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
|
||||
self.state.lock().unwrap().conf.clone()
|
||||
pub fn get_global_config() -> SafeKeeperConf {
|
||||
TIMELINES_STATE.lock().unwrap().get_conf().clone()
|
||||
}
|
||||
|
||||
pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
|
||||
self.state.lock().unwrap().broker_active_set.clone()
|
||||
pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
|
||||
TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
|
||||
}
|
||||
|
||||
/// Create a new timeline with the given id. If the timeline already exists, returns
|
||||
/// an existing timeline.
|
||||
pub(crate) async fn create(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
server_info: ServerInfo,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
if let Ok(timeline) = state.get(&ttid) {
|
||||
// Timeline already exists, return it.
|
||||
return Ok(timeline);
|
||||
@@ -242,7 +245,7 @@ impl GlobalTimelines {
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
|
||||
let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -258,14 +261,13 @@ impl GlobalTimelines {
|
||||
/// 2) move the directory and load the timeline
|
||||
/// 3) take lock again and insert the timeline into the global map.
|
||||
pub async fn load_temp_timeline(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
tmp_path: &Utf8PathBuf,
|
||||
check_tombstone: bool,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// Check for existence and mark that we're creating it.
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter) = {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
match state.timelines.get(&ttid) {
|
||||
Some(GlobalMapTimeline::CreationInProgress) => {
|
||||
bail!(TimelineError::CreationInProgress(ttid));
|
||||
@@ -293,10 +295,10 @@ impl GlobalTimelines {
|
||||
};
|
||||
|
||||
// Do the actual move and reflect the result in the map.
|
||||
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
|
||||
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await {
|
||||
Ok(timeline) => {
|
||||
let mut timeline_shared_state = timeline.write_shared_state().await;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
assert!(matches!(
|
||||
state.timelines.get(&ttid),
|
||||
Some(GlobalMapTimeline::CreationInProgress)
|
||||
@@ -317,7 +319,7 @@ impl GlobalTimelines {
|
||||
}
|
||||
Err(e) => {
|
||||
// Init failed, remove the marker from the map
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
assert!(matches!(
|
||||
state.timelines.get(&ttid),
|
||||
Some(GlobalMapTimeline::CreationInProgress)
|
||||
@@ -332,10 +334,10 @@ impl GlobalTimelines {
|
||||
async fn install_temp_timeline(
|
||||
ttid: TenantTimelineId,
|
||||
tmp_path: &Utf8PathBuf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
|
||||
let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
|
||||
let tenant_path = get_tenant_dir(conf, &ttid.tenant_id);
|
||||
let timeline_path = get_timeline_dir(conf, &ttid);
|
||||
|
||||
// We must have already checked that timeline doesn't exist in the map,
|
||||
// but there might be existing datadir: if timeline is corrupted it is
|
||||
@@ -380,9 +382,9 @@ impl GlobalTimelines {
|
||||
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
|
||||
/// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
|
||||
/// i.e. loaded in memory and not cancelled.
|
||||
pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
|
||||
pub(crate) fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
|
||||
let tli_res = {
|
||||
let state = self.state.lock().unwrap();
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
state.get(&ttid)
|
||||
};
|
||||
match tli_res {
|
||||
@@ -397,8 +399,8 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Returns all timelines. This is used for background timeline processes.
|
||||
pub fn get_all(&self) -> Vec<Arc<Timeline>> {
|
||||
let global_lock = self.state.lock().unwrap();
|
||||
pub fn get_all() -> Vec<Arc<Timeline>> {
|
||||
let global_lock = TIMELINES_STATE.lock().unwrap();
|
||||
global_lock
|
||||
.timelines
|
||||
.values()
|
||||
@@ -417,8 +419,8 @@ impl GlobalTimelines {
|
||||
|
||||
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
|
||||
/// and that's why it can return cancelled timelines, to retry deleting them.
|
||||
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
|
||||
let global_lock = self.state.lock().unwrap();
|
||||
fn get_all_for_tenant(tenant_id: TenantId) -> Vec<Arc<Timeline>> {
|
||||
let global_lock = TIMELINES_STATE.lock().unwrap();
|
||||
global_lock
|
||||
.timelines
|
||||
.values()
|
||||
@@ -433,12 +435,11 @@ impl GlobalTimelines {
|
||||
/// Cancels timeline, then deletes the corresponding data directory.
|
||||
/// If only_local, doesn't remove WAL segments in remote storage.
|
||||
pub(crate) async fn delete(
|
||||
&self,
|
||||
ttid: &TenantTimelineId,
|
||||
only_local: bool,
|
||||
) -> Result<TimelineDeleteForceResult> {
|
||||
let tli_res = {
|
||||
let state = self.state.lock().unwrap();
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
|
||||
if state.tombstones.contains_key(ttid) {
|
||||
// Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
|
||||
@@ -471,7 +472,7 @@ impl GlobalTimelines {
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeline is not memory, but it may still exist on disk in broken state.
|
||||
let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
|
||||
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
|
||||
let dir_existed = delete_dir(dir_path)?;
|
||||
|
||||
Ok(TimelineDeleteForceResult {
|
||||
@@ -484,7 +485,7 @@ impl GlobalTimelines {
|
||||
// Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones
|
||||
// are used to prevent still-running computes from re-creating the same timeline when they send data,
|
||||
// and to speed up repeated deletion calls by avoiding re-listing objects.
|
||||
self.state.lock().unwrap().delete(*ttid);
|
||||
TIMELINES_STATE.lock().unwrap().delete(*ttid);
|
||||
|
||||
result
|
||||
}
|
||||
@@ -496,18 +497,17 @@ impl GlobalTimelines {
|
||||
///
|
||||
/// If only_local, doesn't remove WAL segments in remote storage.
|
||||
pub async fn delete_force_all_for_tenant(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
only_local: bool,
|
||||
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
|
||||
info!("deleting all timelines for tenant {}", tenant_id);
|
||||
let to_delete = self.get_all_for_tenant(*tenant_id);
|
||||
let to_delete = Self::get_all_for_tenant(*tenant_id);
|
||||
|
||||
let mut err = None;
|
||||
|
||||
let mut deleted = HashMap::new();
|
||||
for tli in &to_delete {
|
||||
match self.delete(&tli.ttid, only_local).await {
|
||||
match Self::delete(&tli.ttid, only_local).await {
|
||||
Ok(result) => {
|
||||
deleted.insert(tli.ttid, result);
|
||||
}
|
||||
@@ -529,15 +529,15 @@ impl GlobalTimelines {
|
||||
// so the directory may be not empty. In this case timelines will have bad state
|
||||
// and timeline background jobs can panic.
|
||||
delete_dir(get_tenant_dir(
|
||||
self.state.lock().unwrap().conf.as_ref(),
|
||||
TIMELINES_STATE.lock().unwrap().get_conf(),
|
||||
tenant_id,
|
||||
))?;
|
||||
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
pub fn housekeeping(&self, tombstone_ttl: &Duration) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
pub fn housekeeping(tombstone_ttl: &Duration) {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
|
||||
// We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
|
||||
// timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::QueryError;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_io_timeout::TimeoutReader;
|
||||
@@ -12,9 +11,9 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{auth::Scope, measured_stream::MeasuredStream};
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::TrafficMetrics;
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines};
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
@@ -23,10 +22,9 @@ use postgres_backend::{AuthType, PostgresBackend};
|
||||
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
|
||||
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
|
||||
pub async fn task_main(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conf: SafeKeeperConf,
|
||||
pg_listener: std::net::TcpListener,
|
||||
allowed_auth_scope: Scope,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Tokio's from_std won't do this for us, per its comment.
|
||||
pg_listener.set_nonblocking(true)?;
|
||||
@@ -39,10 +37,10 @@ pub async fn task_main(
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
let conn_id = issue_connection_id(&mut connection_count);
|
||||
let global_timelines = global_timelines.clone();
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, global_timelines).await {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope).await {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
}
|
||||
@@ -55,10 +53,9 @@ pub async fn task_main(
|
||||
///
|
||||
async fn handle_socket(
|
||||
socket: TcpStream,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conf: SafeKeeperConf,
|
||||
conn_id: ConnectionId,
|
||||
allowed_auth_scope: Scope,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<(), QueryError> {
|
||||
socket.set_nodelay(true)?;
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
@@ -99,13 +96,8 @@ async fn handle_socket(
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(
|
||||
conf,
|
||||
conn_id,
|
||||
Some(traffic_metrics.clone()),
|
||||
auth_pair,
|
||||
global_timelines,
|
||||
);
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::NodeId;
|
||||
|
||||
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;
|
||||
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub(crate) struct Drain {
|
||||
|
||||
@@ -18,9 +18,8 @@ use pageserver_api::controller_api::{
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
|
||||
TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
|
||||
TimelineCreateRequest,
|
||||
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::{mgmt_api, BlockUnblock};
|
||||
@@ -209,27 +208,6 @@ async fn handle_tenant_location_config(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_config_patch(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let mut req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let config_req = json_request::<TenantConfigPatchRequest>(&mut req).await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service.tenant_config_patch(config_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_config_set(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -879,21 +857,6 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_safekeeper_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Infra)?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
let safekeepers = state.service.safekeepers_list().await?;
|
||||
json_response(StatusCode::OK, safekeepers)
|
||||
}
|
||||
|
||||
async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Scrubber)?;
|
||||
|
||||
@@ -1218,7 +1181,7 @@ impl From<ReconcileError> for ApiError {
|
||||
///
|
||||
/// Not used by anything except manual testing.
|
||||
async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Infra)?;
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
@@ -1236,7 +1199,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
|
||||
match res {
|
||||
Ok(b) => json_response(StatusCode::OK, b),
|
||||
Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
|
||||
Err(ApiError::NotFound("unknown instance id".into()))
|
||||
Err(ApiError::NotFound("unknown instance_id".into()))
|
||||
}
|
||||
Err(other) => Err(other.into()),
|
||||
}
|
||||
@@ -1832,21 +1795,6 @@ pub fn make_router(
|
||||
RequestName("control_v1_metadata_health_list_outdated"),
|
||||
)
|
||||
})
|
||||
// Safekeepers
|
||||
.get("/control/v1/safekeeper", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_safekeeper_list,
|
||||
RequestName("control_v1_safekeeper_list"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/safekeeper/:id", |r| {
|
||||
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id", |r| {
|
||||
// id is in the body
|
||||
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(
|
||||
@@ -1899,6 +1847,13 @@ pub fn make_router(
|
||||
.put("/control/v1/step_down", |r| {
|
||||
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
|
||||
})
|
||||
.get("/control/v1/safekeeper/:id", |r| {
|
||||
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id", |r| {
|
||||
// id is in the body
|
||||
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
@@ -1908,13 +1863,6 @@ pub fn make_router(
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
|
||||
})
|
||||
.patch("/v1/tenant/config", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_config_patch,
|
||||
RequestName("v1_tenant_config"),
|
||||
)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
|
||||
})
|
||||
|
||||
@@ -104,7 +104,6 @@ pub(crate) enum DatabaseOperation {
|
||||
ListMetadataHealth,
|
||||
ListMetadataHealthUnhealthy,
|
||||
ListMetadataHealthOutdated,
|
||||
ListSafekeepers,
|
||||
GetLeader,
|
||||
UpdateLeader,
|
||||
SetPreferredAzs,
|
||||
@@ -637,13 +636,6 @@ impl Persistence {
|
||||
.into_boxed(),
|
||||
};
|
||||
|
||||
// Clear generation_pageserver if we are moving into a state where we won't have
|
||||
// any attached pageservers.
|
||||
let input_generation_pageserver = match input_placement_policy {
|
||||
None | Some(PlacementPolicy::Attached(_)) => None,
|
||||
Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
|
||||
};
|
||||
|
||||
#[derive(AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
struct ShardUpdate {
|
||||
@@ -651,7 +643,6 @@ impl Persistence {
|
||||
placement_policy: Option<String>,
|
||||
config: Option<String>,
|
||||
scheduling_policy: Option<String>,
|
||||
generation_pageserver: Option<Option<i64>>,
|
||||
}
|
||||
|
||||
let update = ShardUpdate {
|
||||
@@ -664,7 +655,6 @@ impl Persistence {
|
||||
.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
scheduling_policy: input_scheduling_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
generation_pageserver: input_generation_pageserver,
|
||||
};
|
||||
|
||||
query.set(update).execute(conn)?;
|
||||
@@ -1012,22 +1002,6 @@ impl Persistence {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_safekeepers(&self) -> DatabaseResult<Vec<SafekeeperPersistence>> {
|
||||
let safekeepers: Vec<SafekeeperPersistence> = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::ListNodes,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::safekeepers::table.load::<SafekeeperPersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len());
|
||||
|
||||
Ok(safekeepers)
|
||||
}
|
||||
|
||||
pub(crate) async fn safekeeper_get(
|
||||
&self,
|
||||
id: i64,
|
||||
|
||||
@@ -742,50 +742,6 @@ impl Scheduler {
|
||||
self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default())
|
||||
}
|
||||
|
||||
/// For choosing which AZ to schedule a new shard into, use this. It will return the
|
||||
/// AZ with the lowest median utilization.
|
||||
///
|
||||
/// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
|
||||
/// node, because while tenants start out single sharded, when they grow and undergo
|
||||
/// shard-split, they will occupy space on many nodes within an AZ.
|
||||
///
|
||||
/// We use median rather than total free space or mean utilization, because
|
||||
/// we wish to avoid preferring AZs that have low-load nodes resulting from
|
||||
/// recent replacements.
|
||||
///
|
||||
/// The practical result is that we will pick an AZ based on its median node, and
|
||||
/// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
|
||||
pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
|
||||
if self.nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut scores_by_az = HashMap::new();
|
||||
for (node_id, node) in &self.nodes {
|
||||
let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
|
||||
let score = match &node.may_schedule {
|
||||
MaySchedule::Yes(utilization) => utilization.score(),
|
||||
MaySchedule::No => PageserverUtilization::full().score(),
|
||||
};
|
||||
az_scores.push((node_id, node, score));
|
||||
}
|
||||
|
||||
// Sort by utilization. Also include the node ID to break ties.
|
||||
for scores in scores_by_az.values_mut() {
|
||||
scores.sort_by_key(|i| (i.2, i.0));
|
||||
}
|
||||
|
||||
let mut median_by_az = scores_by_az
|
||||
.iter()
|
||||
.map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
|
||||
.collect::<Vec<_>>();
|
||||
// Sort by utilization. Also include the AZ to break ties.
|
||||
median_by_az.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
// Return the AZ with the lowest median utilization
|
||||
Some(median_by_az.first().unwrap().0.clone())
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
@@ -1131,53 +1087,4 @@ mod tests {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn az_scheduling_for_new_tenant() {
|
||||
let az_a_tag = AvailabilityZone("az-a".to_string());
|
||||
let az_b_tag = AvailabilityZone("az-b".to_string());
|
||||
let nodes = test_utils::make_test_nodes(
|
||||
6,
|
||||
&[
|
||||
az_a_tag.clone(),
|
||||
az_a_tag.clone(),
|
||||
az_a_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
az_b_tag.clone(),
|
||||
],
|
||||
);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
/// Force the utilization of a node in Scheduler's state to a particular
|
||||
/// number of bytes used.
|
||||
fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
|
||||
let mut node = Node::new(
|
||||
node_id,
|
||||
"".to_string(),
|
||||
0,
|
||||
"".to_string(),
|
||||
0,
|
||||
scheduler.nodes.get(&node_id).unwrap().az.clone(),
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(test_utilization::simple(
|
||||
shard_count,
|
||||
0,
|
||||
)));
|
||||
scheduler.node_upsert(&node);
|
||||
}
|
||||
|
||||
// Initial empty state. Scores are tied, scheduler prefers lower AZ ID.
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
|
||||
// Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
|
||||
set_utilization(&mut scheduler, NodeId(1), 1000000);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
|
||||
|
||||
// Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
|
||||
// should prefer the other AZ.
|
||||
set_utilization(&mut scheduler, NodeId(2), 1000000);
|
||||
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,19 +29,6 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
safekeepers (id) {
|
||||
id -> Int8,
|
||||
region_id -> Text,
|
||||
version -> Int8,
|
||||
host -> Text,
|
||||
port -> Int4,
|
||||
active -> Bool,
|
||||
http_port -> Int4,
|
||||
availability_zone_id -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
tenant_shards (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
@@ -58,10 +45,18 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(
|
||||
controllers,
|
||||
metadata_health,
|
||||
nodes,
|
||||
safekeepers,
|
||||
tenant_shards,
|
||||
);
|
||||
diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
|
||||
|
||||
diesel::table! {
|
||||
safekeepers {
|
||||
id -> Int8,
|
||||
region_id -> Text,
|
||||
version -> Int8,
|
||||
instance_id -> Text,
|
||||
host -> Text,
|
||||
port -> Int4,
|
||||
active -> Bool,
|
||||
http_port -> Int4,
|
||||
availability_zone_id -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,8 +52,8 @@ use pageserver_api::{
|
||||
TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest,
|
||||
TimelineArchivalConfigRequest, TopTenantShardsRequest,
|
||||
SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
|
||||
TopTenantShardsRequest,
|
||||
},
|
||||
};
|
||||
use reqwest::StatusCode;
|
||||
@@ -100,8 +100,6 @@ use crate::{
|
||||
|
||||
use context_iterator::TenantShardContextIterator;
|
||||
|
||||
const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
|
||||
@@ -141,7 +139,6 @@ enum TenantOperations {
|
||||
Create,
|
||||
LocationConfig,
|
||||
ConfigSet,
|
||||
ConfigPatch,
|
||||
TimeTravelRemoteStorage,
|
||||
Delete,
|
||||
UpdatePolicy,
|
||||
@@ -516,9 +513,6 @@ struct ShardUpdate {
|
||||
|
||||
/// If this is None, generation is not updated.
|
||||
generation: Option<Generation>,
|
||||
|
||||
/// If this is None, scheduling policy is not updated.
|
||||
scheduling_policy: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
enum StopReconciliationsReason {
|
||||
@@ -1582,7 +1576,6 @@ impl Service {
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Attached(0),
|
||||
None,
|
||||
),
|
||||
);
|
||||
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
|
||||
@@ -2110,16 +2103,6 @@ impl Service {
|
||||
)
|
||||
};
|
||||
|
||||
let preferred_az_id = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
// Idempotency: take the existing value if the tenant already exists
|
||||
if let Some(shard) = locked.tenants.get(create_ids.first().unwrap()) {
|
||||
shard.preferred_az().cloned()
|
||||
} else {
|
||||
locked.scheduler.get_az_for_new_tenant()
|
||||
}
|
||||
};
|
||||
|
||||
// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
|
||||
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
|
||||
// during the creation, rather than risking leaving orphan objects in S3.
|
||||
@@ -2139,7 +2122,7 @@ impl Service {
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
preferred_az_id: preferred_az_id.as_ref().map(|az| az.to_string()),
|
||||
preferred_az_id: None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -2175,7 +2158,6 @@ impl Service {
|
||||
&create_req.shard_parameters,
|
||||
create_req.config.clone(),
|
||||
placement_policy.clone(),
|
||||
preferred_az_id.as_ref(),
|
||||
&mut schedule_context,
|
||||
)
|
||||
.await;
|
||||
@@ -2189,6 +2171,44 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
let preferred_azs = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
response_shards
|
||||
.iter()
|
||||
.filter_map(|resp| {
|
||||
let az_id = locked
|
||||
.nodes
|
||||
.get(&resp.node_id)
|
||||
.map(|n| n.get_availability_zone_id().clone())?;
|
||||
|
||||
Some((resp.shard_id, az_id))
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
// Note that we persist the preferred AZ for the new shards separately.
|
||||
// In theory, we could "peek" the scheduler to determine where the shard will
|
||||
// land, but the subsequent "real" call into the scheduler might select a different
|
||||
// node. Hence, we do this awkward update to keep things consistent.
|
||||
let updated = self
|
||||
.persistence
|
||||
.set_tenant_shard_preferred_azs(preferred_azs)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Failed to persist preferred az ids: {err}"
|
||||
))
|
||||
})?;
|
||||
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
for (tid, az_id) in updated {
|
||||
if let Some(shard) = locked.tenants.get_mut(&tid) {
|
||||
shard.set_preferred_az(az_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we failed to schedule shards, then they are still created in the controller,
|
||||
// but we return an error to the requester to avoid a silent failure when someone
|
||||
// tries to e.g. create a tenant whose placement policy requires more nodes than
|
||||
@@ -2219,7 +2239,6 @@ impl Service {
|
||||
|
||||
/// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
|
||||
/// case of a new tenant and a pre-existing one.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn do_initial_shard_scheduling(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -2227,7 +2246,6 @@ impl Service {
|
||||
shard_params: &ShardParameters,
|
||||
config: TenantConfig,
|
||||
placement_policy: PlacementPolicy,
|
||||
preferred_az_id: Option<&AvailabilityZone>,
|
||||
schedule_context: &mut ScheduleContext,
|
||||
) -> InitialShardScheduleOutcome {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
@@ -2238,6 +2256,10 @@ impl Service {
|
||||
Entry::Occupied(mut entry) => {
|
||||
tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
|
||||
|
||||
// TODO: schedule() should take an anti-affinity expression that pushes
|
||||
// attached and secondary locations (independently) away frorm those
|
||||
// pageservers also holding a shard for this tenant.
|
||||
|
||||
if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
|
||||
return InitialShardScheduleOutcome::ShardScheduleError(err);
|
||||
}
|
||||
@@ -2261,7 +2283,6 @@ impl Service {
|
||||
tenant_shard_id,
|
||||
ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
|
||||
placement_policy,
|
||||
preferred_az_id.cloned(),
|
||||
));
|
||||
|
||||
state.generation = initial_generation;
|
||||
@@ -2355,23 +2376,6 @@ impl Service {
|
||||
}
|
||||
};
|
||||
|
||||
// Ordinarily we do not update scheduling policy, but when making major changes
|
||||
// like detaching or demoting to secondary-only, we need to force the scheduling
|
||||
// mode to Active, or the caller's expected outcome (detach it) will not happen.
|
||||
let scheduling_policy = match req.config.mode {
|
||||
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
|
||||
// Special case: when making major changes like detaching or demoting to secondary-only,
|
||||
// we need to force the scheduling mode to Active, or nothing will happen.
|
||||
Some(ShardSchedulingPolicy::Active)
|
||||
}
|
||||
LocationConfigMode::AttachedMulti
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
// While attached, continue to respect whatever the existing scheduling mode is.
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let mut create = true;
|
||||
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
// Saw an existing shard: this is not a creation
|
||||
@@ -2397,7 +2401,6 @@ impl Service {
|
||||
placement_policy: placement_policy.clone(),
|
||||
tenant_config: req.config.tenant_conf.clone(),
|
||||
generation: set_generation,
|
||||
scheduling_policy,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2494,7 +2497,6 @@ impl Service {
|
||||
placement_policy,
|
||||
tenant_config,
|
||||
generation,
|
||||
scheduling_policy,
|
||||
} in &updates
|
||||
{
|
||||
self.persistence
|
||||
@@ -2503,7 +2505,7 @@ impl Service {
|
||||
Some(placement_policy.clone()),
|
||||
Some(tenant_config.clone()),
|
||||
*generation,
|
||||
*scheduling_policy,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -2519,7 +2521,6 @@ impl Service {
|
||||
placement_policy,
|
||||
tenant_config,
|
||||
generation: update_generation,
|
||||
scheduling_policy,
|
||||
} in updates
|
||||
{
|
||||
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
|
||||
@@ -2538,10 +2539,6 @@ impl Service {
|
||||
shard.generation = Some(generation);
|
||||
}
|
||||
|
||||
if let Some(scheduling_policy) = scheduling_policy {
|
||||
shard.set_scheduling_policy(scheduling_policy);
|
||||
}
|
||||
|
||||
shard.schedule(scheduler, &mut schedule_context)?;
|
||||
|
||||
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
|
||||
@@ -2578,55 +2575,6 @@ impl Service {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_config_patch(
|
||||
&self,
|
||||
req: TenantConfigPatchRequest,
|
||||
) -> Result<(), ApiError> {
|
||||
let _tenant_lock = trace_exclusive_lock(
|
||||
&self.tenant_op_locks,
|
||||
req.tenant_id,
|
||||
TenantOperations::ConfigPatch,
|
||||
)
|
||||
.await;
|
||||
|
||||
let tenant_id = req.tenant_id;
|
||||
let patch = req.config;
|
||||
|
||||
let base = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let shards = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(req.tenant_id));
|
||||
|
||||
let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable();
|
||||
|
||||
let first = match configs.peek() {
|
||||
Some(first) => (*first).clone(),
|
||||
None => {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
if !configs.all_equal() {
|
||||
tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id);
|
||||
// This can't happen because we atomically update the database records
|
||||
// of all shards to the new value in [`Self::set_tenant_config_and_reconcile`].
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Tenant configs for {} are mismatched",
|
||||
req.tenant_id
|
||||
)));
|
||||
}
|
||||
|
||||
first
|
||||
};
|
||||
|
||||
let updated_config = base.apply_patch(patch);
|
||||
self.set_tenant_config_and_reconcile(tenant_id, updated_config)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
|
||||
// We require an exclusive lock, because we are updating persistent and in-memory state
|
||||
let _tenant_lock = trace_exclusive_lock(
|
||||
@@ -2636,32 +2584,12 @@ impl Service {
|
||||
)
|
||||
.await;
|
||||
|
||||
let tenant_exists = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut r = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(req.tenant_id));
|
||||
r.next().is_some()
|
||||
};
|
||||
let tenant_id = req.tenant_id;
|
||||
let config = req.config;
|
||||
|
||||
if !tenant_exists {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
|
||||
));
|
||||
}
|
||||
|
||||
self.set_tenant_config_and_reconcile(req.tenant_id, req.config)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn set_tenant_config_and_reconcile(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
config: TenantConfig,
|
||||
) -> Result<(), ApiError> {
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Tenant(tenant_id),
|
||||
TenantFilter::Tenant(req.tenant_id),
|
||||
None,
|
||||
Some(config.clone()),
|
||||
None,
|
||||
@@ -3064,17 +2992,9 @@ impl Service {
|
||||
|
||||
let TenantPolicyRequest {
|
||||
placement,
|
||||
mut scheduling,
|
||||
scheduling,
|
||||
} = req;
|
||||
|
||||
if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement {
|
||||
// When someone configures a tenant to detach, we force the scheduling policy to enable
|
||||
// this to take effect.
|
||||
if scheduling.is_none() {
|
||||
scheduling = Some(ShardSchedulingPolicy::Active);
|
||||
}
|
||||
}
|
||||
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Tenant(tenant_id),
|
||||
@@ -4229,8 +4149,7 @@ impl Service {
|
||||
},
|
||||
);
|
||||
|
||||
let mut child_state =
|
||||
TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
|
||||
let mut child_state = TenantShard::new(child, child_shard, policy.clone());
|
||||
child_state.intent = IntentState::single(scheduler, Some(pageserver));
|
||||
child_state.observed = ObservedState {
|
||||
locations: child_observed,
|
||||
@@ -6774,7 +6693,7 @@ impl Service {
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
|
||||
@@ -7027,7 +6946,7 @@ impl Service {
|
||||
}
|
||||
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
}
|
||||
|
||||
@@ -7159,12 +7078,6 @@ impl Service {
|
||||
global_observed
|
||||
}
|
||||
|
||||
pub(crate) async fn safekeepers_list(
|
||||
&self,
|
||||
) -> Result<Vec<crate::persistence::SafekeeperPersistence>, DatabaseError> {
|
||||
self.persistence.list_safekeepers().await
|
||||
}
|
||||
|
||||
pub(crate) async fn get_safekeeper(
|
||||
&self,
|
||||
id: i64,
|
||||
|
||||
@@ -472,7 +472,6 @@ impl TenantShard {
|
||||
tenant_shard_id: TenantShardId,
|
||||
shard: ShardIdentity,
|
||||
policy: PlacementPolicy,
|
||||
preferred_az_id: Option<AvailabilityZone>,
|
||||
) -> Self {
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
@@ -496,7 +495,7 @@ impl TenantShard {
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: ShardSchedulingPolicy::default(),
|
||||
preferred_az_id,
|
||||
preferred_az_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1572,7 +1571,6 @@ pub(crate) mod tests {
|
||||
)
|
||||
.unwrap(),
|
||||
policy,
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1599,7 +1597,7 @@ pub(crate) mod tests {
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantShard::new(
|
||||
let mut ts = TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
@@ -1608,8 +1606,13 @@ pub(crate) mod tests {
|
||||
)
|
||||
.unwrap(),
|
||||
policy.clone(),
|
||||
preferred_az.clone(),
|
||||
)
|
||||
);
|
||||
|
||||
if let Some(az) = &preferred_az {
|
||||
ts.set_preferred_az(az.clone());
|
||||
}
|
||||
|
||||
ts
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -533,9 +533,8 @@ async fn list_timeline_blobs_impl(
|
||||
}
|
||||
|
||||
pub(crate) struct RemoteTenantManifestInfo {
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) manifest: TenantManifest,
|
||||
pub(crate) listing_object: ListingObject,
|
||||
pub(crate) latest_generation: Option<Generation>,
|
||||
pub(crate) manifests: Vec<(Generation, ListingObject)>,
|
||||
}
|
||||
|
||||
pub(crate) enum ListTenantManifestResult {
|
||||
@@ -544,10 +543,7 @@ pub(crate) enum ListTenantManifestResult {
|
||||
#[allow(dead_code)]
|
||||
unknown_keys: Vec<ListingObject>,
|
||||
},
|
||||
NoErrors {
|
||||
latest_generation: Option<RemoteTenantManifestInfo>,
|
||||
manifests: Vec<(Generation, ListingObject)>,
|
||||
},
|
||||
NoErrors(RemoteTenantManifestInfo),
|
||||
}
|
||||
|
||||
/// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
|
||||
@@ -596,6 +592,14 @@ pub(crate) async fn list_tenant_manifests(
|
||||
unknown_keys.push(obj);
|
||||
}
|
||||
|
||||
if manifests.is_empty() {
|
||||
tracing::debug!("No manifest for timeline.");
|
||||
|
||||
return Ok(ListTenantManifestResult::WithErrors {
|
||||
errors,
|
||||
unknown_keys,
|
||||
});
|
||||
}
|
||||
if !unknown_keys.is_empty() {
|
||||
errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));
|
||||
|
||||
@@ -605,15 +609,6 @@ pub(crate) async fn list_tenant_manifests(
|
||||
});
|
||||
}
|
||||
|
||||
if manifests.is_empty() {
|
||||
tracing::debug!("No manifest for timeline.");
|
||||
|
||||
return Ok(ListTenantManifestResult::NoErrors {
|
||||
latest_generation: None,
|
||||
manifests,
|
||||
});
|
||||
}
|
||||
|
||||
// Find the manifest with the highest generation
|
||||
let (latest_generation, latest_listing_object) = manifests
|
||||
.iter()
|
||||
@@ -621,8 +616,6 @@ pub(crate) async fn list_tenant_manifests(
|
||||
.map(|(g, obj)| (*g, obj.clone()))
|
||||
.unwrap();
|
||||
|
||||
manifests.retain(|(gen, _obj)| gen != &latest_generation);
|
||||
|
||||
let manifest_bytes =
|
||||
match download_object_with_retries(remote_client, &latest_listing_object.key).await {
|
||||
Ok(bytes) => bytes,
|
||||
@@ -641,15 +634,13 @@ pub(crate) async fn list_tenant_manifests(
|
||||
};
|
||||
|
||||
match TenantManifest::from_json_bytes(&manifest_bytes) {
|
||||
Ok(manifest) => {
|
||||
return Ok(ListTenantManifestResult::NoErrors {
|
||||
latest_generation: Some(RemoteTenantManifestInfo {
|
||||
generation: latest_generation,
|
||||
manifest,
|
||||
listing_object: latest_listing_object,
|
||||
}),
|
||||
manifests,
|
||||
});
|
||||
Ok(_manifest) => {
|
||||
return Ok(ListTenantManifestResult::NoErrors(
|
||||
RemoteTenantManifestInfo {
|
||||
latest_generation: Some(latest_generation),
|
||||
manifests,
|
||||
},
|
||||
));
|
||||
}
|
||||
Err(parse_error) => errors.push((
|
||||
latest_listing_object.key.get_path().as_str().to_owned(),
|
||||
|
||||
@@ -459,10 +459,12 @@ pub async fn get_timeline_objects(
|
||||
Ok(list.keys)
|
||||
}
|
||||
|
||||
const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
/// Drain a buffer of keys into DeleteObjects requests
|
||||
///
|
||||
/// If `drain` is true, drains keys completely; otherwise stops when <
|
||||
/// `max_keys_per_delete`` keys are left.
|
||||
/// MAX_KEYS_PER_DELETE keys are left.
|
||||
/// `num_deleted` returns number of deleted keys.
|
||||
async fn do_delete(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
@@ -472,10 +474,9 @@ async fn do_delete(
|
||||
progress_tracker: &mut DeletionProgressTracker,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancel = CancellationToken::new();
|
||||
let max_keys_per_delete = remote_client.max_keys_per_delete();
|
||||
while (!keys.is_empty() && drain) || (keys.len() >= max_keys_per_delete) {
|
||||
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
|
||||
let request_keys =
|
||||
keys.split_off(keys.len() - (std::cmp::min(max_keys_per_delete, keys.len())));
|
||||
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
|
||||
|
||||
let request_keys: Vec<RemotePath> = request_keys.into_iter().map(|o| o.key).collect();
|
||||
|
||||
@@ -616,7 +617,7 @@ pub async fn purge_garbage(
|
||||
}
|
||||
|
||||
objects_to_delete.append(&mut object_list);
|
||||
if objects_to_delete.len() >= remote_client.max_keys_per_delete() {
|
||||
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
|
||||
do_delete(
|
||||
&remote_client,
|
||||
&mut objects_to_delete,
|
||||
|
||||
@@ -86,8 +86,6 @@ enum Command {
|
||||
/// For safekeeper node_kind only, json list of timelines and their lsn info
|
||||
#[arg(long, default_value = None)]
|
||||
timeline_lsns: Option<String>,
|
||||
#[arg(long, default_value_t = false)]
|
||||
verbose: bool,
|
||||
},
|
||||
TenantSnapshot {
|
||||
#[arg(long = "tenant-id")]
|
||||
@@ -168,7 +166,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
dump_db_connstr,
|
||||
dump_db_table,
|
||||
timeline_lsns,
|
||||
verbose,
|
||||
} => {
|
||||
if let NodeKind::Safekeeper = node_kind {
|
||||
let db_or_list = match (timeline_lsns, dump_db_connstr) {
|
||||
@@ -206,7 +203,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
tenant_ids,
|
||||
json,
|
||||
post_to_storcon,
|
||||
verbose,
|
||||
cli.exit_code,
|
||||
)
|
||||
.await
|
||||
@@ -317,7 +313,6 @@ pub async fn run_cron_job(
|
||||
Vec::new(),
|
||||
true,
|
||||
post_to_storcon,
|
||||
false, // default to non-verbose mode
|
||||
exit_code,
|
||||
)
|
||||
.await?;
|
||||
@@ -367,13 +362,12 @@ pub async fn scan_pageserver_metadata_cmd(
|
||||
tenant_shard_ids: Vec<TenantShardId>,
|
||||
json: bool,
|
||||
post_to_storcon: bool,
|
||||
verbose: bool,
|
||||
exit_code: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
if controller_client.is_none() && post_to_storcon {
|
||||
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
|
||||
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
|
||||
@@ -4,13 +4,11 @@ use std::time::Duration;
|
||||
|
||||
use crate::checks::{
|
||||
list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
|
||||
RemoteTenantManifestInfo,
|
||||
};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
|
||||
use pageserver::tenant::remote_timeline_client::{
|
||||
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
|
||||
};
|
||||
@@ -529,7 +527,7 @@ async fn gc_tenant_manifests(
|
||||
target: &RootTarget,
|
||||
mode: GcMode,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> anyhow::Result<(GcSummary, Option<RemoteTenantManifestInfo>)> {
|
||||
) -> anyhow::Result<GcSummary> {
|
||||
let mut gc_summary = GcSummary::default();
|
||||
match list_tenant_manifests(remote_client, tenant_shard_id, target).await? {
|
||||
ListTenantManifestResult::WithErrors {
|
||||
@@ -539,35 +537,33 @@ async fn gc_tenant_manifests(
|
||||
for (_key, error) in errors {
|
||||
tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}");
|
||||
}
|
||||
Ok((gc_summary, None))
|
||||
}
|
||||
ListTenantManifestResult::NoErrors {
|
||||
latest_generation,
|
||||
mut manifests,
|
||||
} => {
|
||||
let Some(latest_generation) = latest_generation else {
|
||||
return Ok((gc_summary, None));
|
||||
ListTenantManifestResult::NoErrors(mut manifest_info) => {
|
||||
let Some(latest_gen) = manifest_info.latest_generation else {
|
||||
return Ok(gc_summary);
|
||||
};
|
||||
manifests.sort_by_key(|(generation, _obj)| *generation);
|
||||
manifest_info
|
||||
.manifests
|
||||
.sort_by_key(|(generation, _obj)| *generation);
|
||||
// skip the two latest generations (they don't neccessarily have to be 1 apart from each other)
|
||||
let candidates = manifests.iter().rev().skip(2);
|
||||
let candidates = manifest_info.manifests.iter().rev().skip(2);
|
||||
for (_generation, key) in candidates {
|
||||
maybe_delete_tenant_manifest(
|
||||
remote_client,
|
||||
&min_age,
|
||||
latest_generation.generation,
|
||||
latest_gen,
|
||||
key,
|
||||
mode,
|
||||
&mut gc_summary,
|
||||
)
|
||||
.instrument(
|
||||
info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key),
|
||||
info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
Ok((gc_summary, Some(latest_generation)))
|
||||
}
|
||||
}
|
||||
Ok(gc_summary)
|
||||
}
|
||||
|
||||
async fn gc_timeline(
|
||||
@@ -577,7 +573,6 @@ async fn gc_timeline(
|
||||
mode: GcMode,
|
||||
ttid: TenantShardTimelineId,
|
||||
accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
|
||||
tenant_manifest_info: Arc<Option<RemoteTenantManifestInfo>>,
|
||||
) -> anyhow::Result<GcSummary> {
|
||||
let mut summary = GcSummary::default();
|
||||
let data = list_timeline_blobs(remote_client, ttid, target).await?;
|
||||
@@ -602,60 +597,6 @@ async fn gc_timeline(
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(tenant_manifest_info) = &*tenant_manifest_info {
|
||||
// TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead.
|
||||
let maybe_offloaded = tenant_manifest_info
|
||||
.manifest
|
||||
.offloaded_timelines
|
||||
.iter()
|
||||
.find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id);
|
||||
if let Some(offloaded) = maybe_offloaded {
|
||||
let warnings = validate_index_part_with_offloaded(index_part, offloaded);
|
||||
let warn = if warnings.is_empty() {
|
||||
false
|
||||
} else {
|
||||
// Verify that the manifest hasn't changed. If it has, a potential racing change could have been cause for our troubles.
|
||||
match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? {
|
||||
ListTenantManifestResult::WithErrors {
|
||||
errors,
|
||||
unknown_keys: _,
|
||||
} => {
|
||||
for (_key, error) in errors {
|
||||
tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}");
|
||||
}
|
||||
true
|
||||
}
|
||||
ListTenantManifestResult::NoErrors {
|
||||
latest_generation,
|
||||
manifests: _,
|
||||
} => {
|
||||
if let Some(new_latest_gen) = latest_generation {
|
||||
let manifest_changed = (
|
||||
new_latest_gen.generation,
|
||||
new_latest_gen.listing_object.last_modified,
|
||||
) == (
|
||||
tenant_manifest_info.generation,
|
||||
tenant_manifest_info.listing_object.last_modified,
|
||||
);
|
||||
if manifest_changed {
|
||||
tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len());
|
||||
}
|
||||
manifest_changed
|
||||
} else {
|
||||
// The latest generation is gone. This timeline is in the progress of being deleted?
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
if warn {
|
||||
for warning in warnings {
|
||||
tracing::warn!(%ttid, "{}", warning);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
accumulator.lock().unwrap().update(ttid, index_part);
|
||||
|
||||
for key in candidates {
|
||||
@@ -667,35 +608,6 @@ async fn gc_timeline(
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
fn validate_index_part_with_offloaded(
|
||||
index_part: &IndexPart,
|
||||
offloaded: &OffloadedTimelineManifest,
|
||||
) -> Vec<String> {
|
||||
let mut warnings = Vec::new();
|
||||
if let Some(archived_at_index_part) = index_part.archived_at {
|
||||
if archived_at_index_part
|
||||
.signed_duration_since(offloaded.archived_at)
|
||||
.num_seconds()
|
||||
!= 0
|
||||
{
|
||||
warnings.push(format!(
|
||||
"index-part archived_at={} differs from manifest archived_at={}",
|
||||
archived_at_index_part, offloaded.archived_at
|
||||
));
|
||||
}
|
||||
} else {
|
||||
warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string());
|
||||
}
|
||||
if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id {
|
||||
warnings.push(format!(
|
||||
"index-part anestor={:?} differs from manifest ancestor={:?}",
|
||||
index_part.metadata.ancestor_timeline(),
|
||||
offloaded.ancestor_timeline_id
|
||||
));
|
||||
}
|
||||
warnings
|
||||
}
|
||||
|
||||
/// Physical garbage collection: removing unused S3 objects.
|
||||
///
|
||||
/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
|
||||
@@ -738,38 +650,29 @@ pub async fn pageserver_physical_gc(
|
||||
let target_ref = ⌖
|
||||
let remote_client_ref = &remote_client;
|
||||
async move {
|
||||
let gc_manifest_result = gc_tenant_manifests(
|
||||
let summaries_from_manifests = match gc_tenant_manifests(
|
||||
remote_client_ref,
|
||||
min_age,
|
||||
target_ref,
|
||||
mode,
|
||||
tenant_shard_id,
|
||||
)
|
||||
.await;
|
||||
let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result {
|
||||
Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest),
|
||||
.await
|
||||
{
|
||||
Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary(
|
||||
gc_summary,
|
||||
))],
|
||||
Err(e) => {
|
||||
tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}");
|
||||
(GcSummary::default(), None)
|
||||
Vec::new()
|
||||
}
|
||||
};
|
||||
let tenant_manifest_arc = Arc::new(tenant_manifest_opt);
|
||||
let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary(
|
||||
summary_from_manifest,
|
||||
));
|
||||
stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
|
||||
.await
|
||||
.map(|stream| {
|
||||
stream
|
||||
.zip(futures::stream::iter(std::iter::repeat(
|
||||
tenant_manifest_arc,
|
||||
)))
|
||||
.map(|(ttid_res, tenant_manifest_arc)| {
|
||||
ttid_res.map(move |ttid| {
|
||||
GcSummaryOrContent::Content((ttid, tenant_manifest_arc))
|
||||
})
|
||||
})
|
||||
.chain(futures::stream::iter([summary_from_manifest].into_iter()))
|
||||
.map_ok(GcSummaryOrContent::Content)
|
||||
.chain(futures::stream::iter(summaries_from_manifests.into_iter()))
|
||||
})
|
||||
}
|
||||
});
|
||||
@@ -781,17 +684,14 @@ pub async fn pageserver_physical_gc(
|
||||
// Drain futures for per-shard GC, populating accumulator as a side effect
|
||||
{
|
||||
let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
|
||||
GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => {
|
||||
futures::future::Either::Left(gc_timeline(
|
||||
&remote_client,
|
||||
&min_age,
|
||||
&target,
|
||||
mode,
|
||||
ttid,
|
||||
&accumulator,
|
||||
tenant_manifest_arc,
|
||||
))
|
||||
}
|
||||
GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline(
|
||||
&remote_client,
|
||||
&min_age,
|
||||
&target,
|
||||
mode,
|
||||
ttid,
|
||||
&accumulator,
|
||||
)),
|
||||
GcSummaryOrContent::GcSummary(gc_summary) => {
|
||||
futures::future::Either::Right(futures::future::ok(gc_summary))
|
||||
}
|
||||
|
||||
@@ -21,12 +21,8 @@ pub struct MetadataSummary {
|
||||
tenant_count: usize,
|
||||
timeline_count: usize,
|
||||
timeline_shard_count: usize,
|
||||
/// Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON.
|
||||
/// The key is generated using `TenantShardTimelineId::to_string()`.
|
||||
with_errors: HashMap<String, Vec<String>>,
|
||||
/// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON.
|
||||
/// The key is generated using `TenantShardTimelineId::to_string()`.
|
||||
with_warnings: HashMap<String, Vec<String>>,
|
||||
with_errors: HashSet<TenantShardTimelineId>,
|
||||
with_warnings: HashSet<TenantShardTimelineId>,
|
||||
with_orphans: HashSet<TenantShardTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
@@ -56,12 +52,7 @@ impl MetadataSummary {
|
||||
}
|
||||
}
|
||||
|
||||
fn update_analysis(
|
||||
&mut self,
|
||||
id: &TenantShardTimelineId,
|
||||
analysis: &TimelineAnalysis,
|
||||
verbose: bool,
|
||||
) {
|
||||
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
|
||||
if analysis.is_healthy() {
|
||||
self.healthy_tenant_shards.insert(id.tenant_shard_id);
|
||||
} else {
|
||||
@@ -70,17 +61,11 @@ impl MetadataSummary {
|
||||
}
|
||||
|
||||
if !analysis.errors.is_empty() {
|
||||
let entry = self.with_errors.entry(id.to_string()).or_default();
|
||||
if verbose {
|
||||
entry.extend(analysis.errors.iter().cloned());
|
||||
}
|
||||
self.with_errors.insert(*id);
|
||||
}
|
||||
|
||||
if !analysis.warnings.is_empty() {
|
||||
let entry = self.with_warnings.entry(id.to_string()).or_default();
|
||||
if verbose {
|
||||
entry.extend(analysis.warnings.iter().cloned());
|
||||
}
|
||||
self.with_warnings.insert(*id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +120,6 @@ Index versions: {version_summary}
|
||||
pub async fn scan_pageserver_metadata(
|
||||
bucket_config: BucketConfig,
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
verbose: bool,
|
||||
) -> anyhow::Result<MetadataSummary> {
|
||||
let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
|
||||
|
||||
@@ -180,7 +164,6 @@ pub async fn scan_pageserver_metadata(
|
||||
mut tenant_objects: TenantObjectListing,
|
||||
timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
|
||||
highest_shard_count: ShardCount,
|
||||
verbose: bool,
|
||||
) {
|
||||
summary.tenant_count += 1;
|
||||
|
||||
@@ -220,7 +203,7 @@ pub async fn scan_pageserver_metadata(
|
||||
Some(data),
|
||||
)
|
||||
.await;
|
||||
summary.update_analysis(&ttid, &analysis, verbose);
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
|
||||
timeline_ids.insert(ttid.timeline_id);
|
||||
} else {
|
||||
@@ -288,6 +271,10 @@ pub async fn scan_pageserver_metadata(
|
||||
summary.update_data(&data);
|
||||
|
||||
match tenant_id {
|
||||
None => {
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
Some(prev_tenant_id) => {
|
||||
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
|
||||
// New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
|
||||
@@ -300,7 +287,6 @@ pub async fn scan_pageserver_metadata(
|
||||
tenant_objects,
|
||||
timelines,
|
||||
highest_shard_count,
|
||||
verbose,
|
||||
)
|
||||
.instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
|
||||
.await;
|
||||
@@ -310,10 +296,6 @@ pub async fn scan_pageserver_metadata(
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
}
|
||||
|
||||
match &data.blob_data {
|
||||
@@ -344,7 +326,6 @@ pub async fn scan_pageserver_metadata(
|
||||
tenant_objects,
|
||||
tenant_timeline_results,
|
||||
highest_shard_count,
|
||||
verbose,
|
||||
)
|
||||
.instrument(info_span!("analyze-tenant", tenant = %tenant_id))
|
||||
.await;
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
# How to run the `pg_regress` tests on a cloud Neon instance.
|
||||
|
||||
* Create a Neon project on staging.
|
||||
* Grant the superuser privileges to the DB user.
|
||||
* (Optional) create a branch for testing
|
||||
* Configure the endpoint by updating the control-plane database with the following settings:
|
||||
* `Timeone`: `America/Los_Angeles`
|
||||
* `DateStyle`: `Postgres,MDY`
|
||||
* `compute_query_id`: `off`
|
||||
* Checkout the actual `Neon` sources
|
||||
* Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
|
||||
```bash
|
||||
$ cd vendor/postgres-v17
|
||||
$ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
|
||||
```
|
||||
* Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
|
||||
* Set the environment variable `PG_VERSION` to the version of your project.
|
||||
* Run
|
||||
```bash
|
||||
$ pytest -m remote_cluster -k cloud_regress
|
||||
```
|
||||
@@ -5,15 +5,68 @@ Run the regression tests on the cloud instance of Neon
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import RemotePostgres
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def setup(remote_pg: RemotePostgres):
|
||||
"""
|
||||
Setup and teardown of the tests
|
||||
"""
|
||||
with psycopg2.connect(remote_pg.connstr()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
log.info("Creating the extension")
|
||||
cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
|
||||
conn.commit()
|
||||
# TODO: Migrate to branches and remove this code
|
||||
log.info("Looking for subscriptions in the regress database")
|
||||
cur.execute(
|
||||
"SELECT subname FROM pg_catalog.pg_subscription WHERE "
|
||||
"subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
|
||||
)
|
||||
if cur.rowcount > 0:
|
||||
with psycopg2.connect(
|
||||
dbname="regression",
|
||||
host=remote_pg.default_options["host"],
|
||||
user=remote_pg.default_options["user"],
|
||||
password=remote_pg.default_options["password"],
|
||||
) as regress_conn:
|
||||
with regress_conn.cursor() as regress_cur:
|
||||
for sub in cur:
|
||||
regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
|
||||
regress_cur.execute(
|
||||
f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
|
||||
)
|
||||
regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
|
||||
regress_conn.commit()
|
||||
|
||||
yield
|
||||
# TODO: Migrate to branches and remove this code
|
||||
log.info("Looking for extra roles...")
|
||||
with psycopg2.connect(remote_pg.connstr()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
|
||||
)
|
||||
roles: list[Any] = []
|
||||
for role in cur:
|
||||
log.info("Role found: %s", role[0])
|
||||
roles.append(role[0])
|
||||
for role in roles:
|
||||
cur.execute(f"DROP ROLE {role}")
|
||||
conn.commit()
|
||||
|
||||
|
||||
@pytest.mark.timeout(7200)
|
||||
@pytest.mark.remote_cluster
|
||||
def test_cloud_regress(
|
||||
setup,
|
||||
remote_pg: RemotePostgres,
|
||||
pg_version: PgVersion,
|
||||
pg_distrib_dir: Path,
|
||||
|
||||
@@ -7,25 +7,24 @@ from pytest_httpserver import HTTPServer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
from ssl import SSLContext
|
||||
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
|
||||
ListenAddress = tuple[str, int]
|
||||
# TODO: mypy fails with:
|
||||
# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined]
|
||||
# from fixtures.neon_fixtures import PortDistributor
|
||||
|
||||
# compared to the fixtures from pytest_httpserver with same names, these are
|
||||
# always function scoped, so you can check and stop the server in tests.
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def httpserver_ssl_context() -> Iterator[SSLContext | None]:
|
||||
yield None
|
||||
def httpserver_ssl_context():
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def make_httpserver(
|
||||
httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None
|
||||
) -> Iterator[HTTPServer]:
|
||||
def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]:
|
||||
host, port = httpserver_listen_address
|
||||
if not host:
|
||||
host = HTTPServer.DEFAULT_LISTEN_HOST
|
||||
@@ -48,6 +47,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]:
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress:
|
||||
def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]:
|
||||
port = port_distributor.get_port()
|
||||
return ("localhost", port)
|
||||
|
||||
@@ -178,7 +178,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
counter("pageserver_timeline_wal_records_received"),
|
||||
counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
|
||||
*histogram("pageserver_page_service_batch_size"),
|
||||
*histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
# "pageserver_directory_entries_count", -- only used if above a certain threshold
|
||||
# "pageserver_broken_tenants_count" -- used only for broken
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user