mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-01 17:50:38 +00:00
Compare commits
10 Commits
sergey/ext
...
nicer_evic
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
81425123ee | ||
|
|
d364f5936b | ||
|
|
ae1b2f78b3 | ||
|
|
834e94f5e3 | ||
|
|
751c93c8f7 | ||
|
|
68b3e68642 | ||
|
|
4aa528ce45 | ||
|
|
bdd502aff7 | ||
|
|
5015194b40 | ||
|
|
5aef192bf2 |
48
.github/actions/allure-report/action.yml
vendored
48
.github/actions/allure-report/action.yml
vendored
@@ -15,32 +15,10 @@ outputs:
|
||||
report-url:
|
||||
description: 'Allure report URL'
|
||||
value: ${{ steps.generate-report.outputs.report-url }}
|
||||
report-json-url:
|
||||
description: 'Allure report JSON URL'
|
||||
value: ${{ steps.generate-report.outputs.report-json-url }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
|
||||
steps:
|
||||
# We're using some of env variables quite offen, so let's set them once.
|
||||
#
|
||||
# It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
|
||||
#
|
||||
# - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
|
||||
# - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
|
||||
#
|
||||
- name: Set common environment variables
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV
|
||||
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
|
||||
echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
|
||||
env:
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
|
||||
- name: Validate input parameters
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
@@ -98,14 +76,16 @@ runs:
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.21.0
|
||||
ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
|
||||
ALLURE_VERSION: 2.19.0
|
||||
ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
|
||||
|
||||
- name: Upload Allure results
|
||||
if: ${{ inputs.action == 'store' }}
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
@@ -124,7 +104,7 @@ runs:
|
||||
EOF
|
||||
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
|
||||
TEST_SELECTION=${{ inputs.test_selection }}
|
||||
BUILD_TYPE=${BUILD_TYPE}
|
||||
BUILD_TYPE=${{ inputs.build_type }}
|
||||
EOF
|
||||
|
||||
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
|
||||
@@ -133,12 +113,13 @@ runs:
|
||||
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
|
||||
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
|
||||
|
||||
# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
|
||||
# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
|
||||
- name: Acquire Allure lock
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
run: |
|
||||
LOCK_TIMEOUT=300 # seconds
|
||||
@@ -168,6 +149,8 @@ runs:
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUCKET: neon-github-public-dev
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Get previously uploaded data for this run
|
||||
@@ -203,24 +186,24 @@ runs:
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ${TEST_OUTPUT}/allure/index.html
|
||||
cat <<EOF > ./index.html
|
||||
<!DOCTYPE html>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<title>Redirecting to ${REPORT_URL}</title>
|
||||
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
|
||||
EOF
|
||||
aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
|
||||
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Release Allure lock
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
|
||||
BUCKET: neon-github-public-dev
|
||||
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
|
||||
@@ -229,16 +212,11 @@ runs:
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
rm -rf ${TEST_OUTPUT}/allure
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
|
||||
12
.github/actions/run-python-test-set/action.yml
vendored
12
.github/actions/run-python-test-set/action.yml
vendored
@@ -44,10 +44,6 @@ inputs:
|
||||
description: 'Secret access key'
|
||||
required: false
|
||||
default: ''
|
||||
rerun_flaky:
|
||||
description: 'Whether to rerun flaky tests'
|
||||
required: false
|
||||
default: 'false'
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
@@ -105,7 +101,6 @@ runs:
|
||||
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
|
||||
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
|
||||
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
|
||||
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# PLATFORM will be embedded in the perf test report
|
||||
@@ -148,13 +143,6 @@ runs:
|
||||
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [ "${RERUN_FLAKY}" == "true" ]; then
|
||||
mkdir -p $TEST_OUTPUT
|
||||
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
|
||||
|
||||
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
|
||||
10
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
10
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
@@ -8,16 +8,6 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
10
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
10
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -8,16 +8,6 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
10
.github/ansible/prod.us-east-2.hosts.yaml
vendored
10
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -8,16 +8,6 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
10
.github/ansible/prod.us-west-2.hosts.yaml
vendored
10
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -8,16 +8,6 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
7
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
7
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -8,16 +8,11 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
7
.github/ansible/staging.us-east-2.hosts.yaml
vendored
7
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -8,16 +8,11 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
threshold: "20m"
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
@@ -30,9 +30,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: dev
|
||||
neon_region: eu-west-1
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: dev
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: eu-west-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -15,9 +15,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
podLabels:
|
||||
neon_service: proxy
|
||||
neon_env: dev
|
||||
neon_region: us-east-2
|
||||
zenith_service: proxy
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
service:
|
||||
type: LoadBalancer
|
||||
|
||||
@@ -15,9 +15,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram-legacy
|
||||
neon_env: dev
|
||||
neon_region: us-east-2
|
||||
zenith_service: proxy-scram-legacy
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -23,7 +23,6 @@ settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
extraDomains: [ "*.us-east-2.postgres.zenith.tech" ]
|
||||
sentryEnvironment: "staging"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
|
||||
@@ -31,9 +30,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: dev
|
||||
neon_region: us-east-2
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,9 +31,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: ap-southeast-1
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: ap-southeast-1
|
||||
zenith_region_slug: ap-southeast-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,9 +31,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: eu-central-1
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: eu-central-1
|
||||
zenith_region_slug: eu-central-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -13,9 +13,10 @@ settings:
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy
|
||||
neon_env: production
|
||||
neon_region: us-east-2
|
||||
zenith_service: proxy
|
||||
zenith_env: production
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
service:
|
||||
type: LoadBalancer
|
||||
|
||||
@@ -31,9 +31,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: us-east-2
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,9 +31,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: us-west-2
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: us-west-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,9 +31,10 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: us-west-2
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: us-west-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
4
.github/pull_request_template.md
vendored
4
.github/pull_request_template.md
vendored
@@ -3,12 +3,8 @@
|
||||
## Issue ticket number and link
|
||||
|
||||
## Checklist before requesting a review
|
||||
|
||||
- [ ] I have performed a self-review of my code.
|
||||
- [ ] If it is a core feature, I have added thorough tests.
|
||||
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
|
||||
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
|
||||
|
||||
## Checklist before merging
|
||||
|
||||
- [ ] Do not forget to reformat commit message to not include the above checklist
|
||||
|
||||
112
.github/workflows/build_and_test.yml
vendored
112
.github/workflows/build_and_test.yml
vendored
@@ -184,10 +184,10 @@ jobs:
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
CARGO_FLAGS="--locked $CARGO_FEATURES"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
||||
@@ -240,18 +240,11 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
|
||||
${cov_prefix} cargo test $CARGO_FLAGS
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -275,7 +268,7 @@ jobs:
|
||||
mkdir -p /tmp/neon/test_bin/
|
||||
|
||||
test_exe_paths=$(
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
for bin in $test_exe_paths; do
|
||||
@@ -335,9 +328,6 @@ jobs:
|
||||
real_s3_region: us-west-2
|
||||
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
|
||||
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
|
||||
rerun_flaky: true
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
@@ -374,88 +364,42 @@ jobs:
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
merge-allure-report:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ regress-tests, benchmarks ]
|
||||
if: ${{ !cancelled() }}
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
|
||||
- name: Create Allure report (debug)
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report-debug
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: debug
|
||||
|
||||
- name: Create Allure report (release)
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report-release
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: release
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
if: >
|
||||
!cancelled() &&
|
||||
github.event_name == 'pull_request' && (
|
||||
steps.create-allure-report-debug.outputs.report-url ||
|
||||
steps.create-allure-report-release.outputs.report-url
|
||||
)
|
||||
with:
|
||||
script: |
|
||||
const reports = [{
|
||||
buildType: "debug",
|
||||
reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
|
||||
jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
|
||||
}, {
|
||||
buildType: "release",
|
||||
reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
|
||||
jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}",
|
||||
}]
|
||||
|
||||
const script = require("./scripts/pr-comment-test-report.js")
|
||||
await script({
|
||||
github,
|
||||
context,
|
||||
fetch,
|
||||
reports,
|
||||
})
|
||||
build_type: ${{ matrix.build_type }}
|
||||
|
||||
- name: Store Allure test stat in the DB
|
||||
if: >
|
||||
!cancelled() && (
|
||||
steps.create-allure-report-debug.outputs.report-url ||
|
||||
steps.create-allure-report-release.outputs.report-url
|
||||
)
|
||||
if: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build_type }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
|
||||
REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
|
||||
REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
run: |
|
||||
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
|
||||
./scripts/pysync
|
||||
|
||||
for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
|
||||
if [ -z "$report_url" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
|
||||
BUILD_TYPE=debug
|
||||
else
|
||||
BUILD_TYPE=release
|
||||
fi
|
||||
|
||||
curl --fail --output suites.json "${report_url}"
|
||||
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
|
||||
done
|
||||
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
|
||||
|
||||
coverage-report:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -608,7 +552,7 @@ jobs:
|
||||
neon-image-depot:
|
||||
# For testing this will run side-by-side for a few merges.
|
||||
# This action is not really optimized yet, but gets the job done
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
needs: [ tag ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
permissions:
|
||||
@@ -947,16 +891,6 @@ jobs:
|
||||
needs: [ push-docker-hub, tag, regress-tests ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
|
||||
4
.github/workflows/neon_extra_builds.yml
vendored
4
.github/workflows/neon_extra_builds.yml
vendored
@@ -53,14 +53,14 @@ jobs:
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
|
||||
1
.github/workflows/release.yml
vendored
1
.github/workflows/release.yml
vendored
@@ -31,3 +31,4 @@ jobs:
|
||||
head: releases/${{ steps.date.outputs.date }}
|
||||
base: release
|
||||
title: Release ${{ steps.date.outputs.date }}
|
||||
team_reviewers: release
|
||||
|
||||
33
Cargo.lock
generated
33
Cargo.lock
generated
@@ -2484,6 +2484,7 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"ubyte",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
@@ -3086,7 +3087,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"test-context",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
@@ -3890,27 +3890,6 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-context"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures",
|
||||
"test-context-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-context-macros"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.16.0"
|
||||
@@ -4439,6 +4418,15 @@ version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
|
||||
|
||||
[[package]]
|
||||
name = "ubyte"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c81f0dae7d286ad0d9366d7679a77934cfc3cf3a8d67e82669794412b2368fe6"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uname"
|
||||
version = "0.1.1"
|
||||
@@ -4557,7 +4545,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"rand",
|
||||
"regex",
|
||||
"routerify",
|
||||
"sentry",
|
||||
"serde",
|
||||
|
||||
@@ -97,7 +97,6 @@ strum_macros = "0.24"
|
||||
svg_fmt = "0.4.1"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
|
||||
@@ -40,8 +40,6 @@ pacman -S base-devel readline zlib libseccomp openssl clang \
|
||||
postgresql-libs cmake postgresql protobuf
|
||||
```
|
||||
|
||||
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
|
||||
@@ -133,7 +133,6 @@ fn main() -> Result<()> {
|
||||
.settings
|
||||
.find("neon.pageserver_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
@@ -154,7 +153,6 @@ fn main() -> Result<()> {
|
||||
tenant,
|
||||
timeline,
|
||||
pageserver_connstr,
|
||||
storage_auth_token,
|
||||
metrics: ComputeMetrics::default(),
|
||||
state: RwLock::new(ComputeState::new()),
|
||||
};
|
||||
@@ -203,14 +201,13 @@ fn main() -> Result<()> {
|
||||
if delay_exit {
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
info!("shutting down");
|
||||
}
|
||||
|
||||
info!("shutting down tracing");
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit.
|
||||
tracing_utils::shutdown_tracing();
|
||||
|
||||
info!("shutting down");
|
||||
exit(exit_code.unwrap_or(1))
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
|
||||
@@ -45,7 +44,6 @@ pub struct ComputeNode {
|
||||
pub tenant: String,
|
||||
pub timeline: String,
|
||||
pub pageserver_connstr: String,
|
||||
pub storage_auth_token: Option<String>,
|
||||
pub metrics: ComputeMetrics,
|
||||
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
|
||||
/// to allow HTTP API server to serve status requests, while configuration
|
||||
@@ -128,18 +126,7 @@ impl ComputeNode {
|
||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token);
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||
@@ -176,11 +163,6 @@ impl ComputeNode {
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
.args(["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
})
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
@@ -258,11 +240,6 @@ impl ComputeNode {
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(["-D", &self.pgdata])
|
||||
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
})
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
|
||||
@@ -74,9 +74,18 @@ impl GenericOption {
|
||||
/// Represent `GenericOption` as configuration option.
|
||||
pub fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
// TODO: check in the console DB that we don't have these settings
|
||||
// set for any non-deleted project and drop this override.
|
||||
let name = match self.name.as_str() {
|
||||
"safekeepers" => "neon.safekeepers",
|
||||
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
|
||||
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
|
||||
it => it,
|
||||
};
|
||||
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
|
||||
_ => format!("{} = {}", self.name, val),
|
||||
"string" => format!("{} = '{}'", name, escape_conf_value(val)),
|
||||
_ => format!("{} = {}", name, val),
|
||||
}
|
||||
} else {
|
||||
self.name.to_owned()
|
||||
|
||||
@@ -24,8 +24,6 @@ pub struct ComputeSpec {
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
pub startup_tracing_context: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -87,14 +88,16 @@ impl ComputeControlPlane {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test: false,
|
||||
timeline_id,
|
||||
lsn,
|
||||
tenant_id,
|
||||
uses_wal_proposer: false,
|
||||
pg_version,
|
||||
});
|
||||
|
||||
node.create_pgdata()?;
|
||||
node.setup_pg_conf()?;
|
||||
node.setup_pg_conf(self.env.pageserver.pg_auth_type)?;
|
||||
|
||||
self.nodes
|
||||
.insert((tenant_id, node.name.clone()), Arc::clone(&node));
|
||||
@@ -111,9 +114,11 @@ pub struct PostgresNode {
|
||||
name: String,
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenant_id: TenantId,
|
||||
uses_wal_proposer: bool,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
@@ -147,6 +152,7 @@ impl PostgresNode {
|
||||
let port: u16 = conf.parse_field("port", &context)?;
|
||||
let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
|
||||
let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
|
||||
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
|
||||
|
||||
// Read postgres version from PG_VERSION file to determine which postgres version binary to use.
|
||||
// If it doesn't exist, assume broken data directory and use default pg version.
|
||||
@@ -166,9 +172,11 @@ impl PostgresNode {
|
||||
name,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timeline_id,
|
||||
lsn: recovery_target_lsn,
|
||||
tenant_id,
|
||||
uses_wal_proposer,
|
||||
pg_version,
|
||||
})
|
||||
}
|
||||
@@ -270,7 +278,7 @@ impl PostgresNode {
|
||||
|
||||
// Write postgresql.conf with default configuration
|
||||
// and PG_VERSION file to the data directory of a new node.
|
||||
fn setup_pg_conf(&self) -> Result<()> {
|
||||
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
|
||||
let mut conf = PostgresConf::new();
|
||||
conf.append("max_wal_senders", "10");
|
||||
conf.append("wal_log_hints", "off");
|
||||
@@ -294,12 +302,29 @@ impl PostgresNode {
|
||||
let config = &self.pageserver.pg_connection_config;
|
||||
let (host, port) = (config.host(), config.port());
|
||||
|
||||
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
format!("postgresql://no_user@{host}:{port}")
|
||||
// Set up authentication
|
||||
//
|
||||
// $NEON_AUTH_TOKEN will be replaced with value from environment
|
||||
// variable during compute pg startup. It is done this way because
|
||||
// otherwise user will be able to retrieve the value using SHOW
|
||||
// command or pg_settings
|
||||
let password = if let AuthType::NeonJWT = auth_type {
|
||||
"$NEON_AUTH_TOKEN"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
// NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
// Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
|
||||
// We parse this string and build it back with token from env var, and for simplicity rebuild
|
||||
// uses only needed variables namely host, port, user, password.
|
||||
format!("postgresql://no_user:{password}@{host}:{port}")
|
||||
};
|
||||
conf.append("shared_preload_libraries", "neon");
|
||||
conf.append_line("");
|
||||
conf.append("neon.pageserver_connstring", &pageserver_connstr);
|
||||
if let AuthType::NeonJWT = auth_type {
|
||||
conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
|
||||
}
|
||||
conf.append("neon.tenant_id", &self.tenant_id.to_string());
|
||||
conf.append("neon.timeline_id", &self.timeline_id.to_string());
|
||||
if let Some(lsn) = self.lsn {
|
||||
@@ -360,7 +385,7 @@ impl PostgresNode {
|
||||
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if !self.env.safekeepers.is_empty() {
|
||||
} else if self.uses_wal_proposer {
|
||||
// LSN 0 means that it is bootstrap and we need to download just
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
@@ -399,7 +424,7 @@ impl PostgresNode {
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
|
||||
let mut cmd = Command::new(&pg_ctl_path);
|
||||
let mut cmd = Command::new(pg_ctl_path);
|
||||
cmd.args(
|
||||
[
|
||||
&[
|
||||
@@ -422,15 +447,11 @@ impl PostgresNode {
|
||||
"DYLD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
|
||||
);
|
||||
|
||||
// Pass authentication token used for the connections to pageserver and safekeepers
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("NEON_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let pg_ctl = cmd
|
||||
.output()
|
||||
.context(format!("{} failed", pg_ctl_path.display()))?;
|
||||
let pg_ctl = cmd.output().context("pg_ctl failed")?;
|
||||
if !pg_ctl.status.success() {
|
||||
anyhow::bail!(
|
||||
"pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
|
||||
@@ -475,6 +496,10 @@ impl PostgresNode {
|
||||
self.pg_ctl(&["start"], auth_token)
|
||||
}
|
||||
|
||||
pub fn restart(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
self.pg_ctl(&["restart"], auth_token)
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
// If we are going to destroy data directory,
|
||||
// use immediate shutdown mode, otherwise,
|
||||
@@ -505,4 +530,26 @@ impl PostgresNode {
|
||||
"postgres"
|
||||
)
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
assert!(output.status.success(), "whoami failed");
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -156,7 +156,7 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
&format!("safekeeper {id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
|
||||
@@ -160,7 +160,6 @@ services:
|
||||
build:
|
||||
context: ./compute_wrapper/
|
||||
args:
|
||||
- REPOSITORY=${REPOSITORY:-neondatabase}
|
||||
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
|
||||
- TAG=${TAG:-latest}
|
||||
- http_proxy=$http_proxy
|
||||
|
||||
@@ -106,22 +106,20 @@ Their authentication is just plain PostgreSQL authentication and out of scope fo
|
||||
There is no administrative API except those provided by PostgreSQL.
|
||||
|
||||
#### Outgoing connections
|
||||
Compute connects to Pageserver for getting pages. The connection string is
|
||||
configured by the `neon.pageserver_connstring` PostgreSQL GUC,
|
||||
e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN`
|
||||
environment variable is set, it is used as the password for the connection. (The
|
||||
pageserver uses JWT tokens for authentication, so the password is really a
|
||||
token.)
|
||||
Compute connects to Pageserver for getting pages.
|
||||
The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
|
||||
The environment variable inside the connection string is substituted with
|
||||
the JWT token.
|
||||
|
||||
Compute connects to Safekeepers to write and commit data. The list of safekeeper
|
||||
addresses is given in the `neon.safekeepers` GUC. The connections to the
|
||||
safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
|
||||
variable, if set.
|
||||
Compute connects to Safekeepers to write and commit data.
|
||||
The token is the same for all safekeepers.
|
||||
It's stored in an environment variable, whose name is configured
|
||||
by the `neon.safekeeper_token_env` PostgreSQL GUC.
|
||||
If the GUC is unset, no token is passed.
|
||||
|
||||
The `compute_ctl` binary that runs before the PostgreSQL server, and launches
|
||||
PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the
|
||||
initial "base backup" dump, to initialize the PostgreSQL data directory. It also
|
||||
uses `$NEON_AUTH_TOKEN` as the password for the connection.
|
||||
Note that both tokens can be (and typically are) the same;
|
||||
the scope is the tenant and the token is usually passed through the
|
||||
`$NEON_AUTH_TOKEN` environment variable.
|
||||
|
||||
### Pageserver
|
||||
#### Overview
|
||||
|
||||
@@ -37,9 +37,9 @@ You can specify version of neon cluster using following environment values.
|
||||
- PG_VERSION: postgres version for compute (default is 14)
|
||||
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
|
||||
```
|
||||
$ cd docker-compose/
|
||||
$ cd docker-compose/docker-compose.yml
|
||||
$ docker-compose down # remove the conainers if exists
|
||||
$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version
|
||||
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
|
||||
Creating network "dockercompose_default" with the default driver
|
||||
Creating docker-compose_storage_broker_1 ... done
|
||||
(...omit...)
|
||||
|
||||
@@ -1,269 +0,0 @@
|
||||
# Deleting pageserver part of tenants data from s3
|
||||
|
||||
Created on 08.03.23
|
||||
|
||||
## Motivation
|
||||
|
||||
Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC).
|
||||
|
||||
This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident)
|
||||
|
||||
## Summary
|
||||
|
||||
TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons.
|
||||
|
||||
The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision).
|
||||
|
||||
## Components
|
||||
|
||||
pageserver, control-plane
|
||||
|
||||
## Requirements
|
||||
|
||||
Deletion should successfully finish (eventually) without leaving dangling files in presense of:
|
||||
|
||||
- component restarts
|
||||
- component outage
|
||||
- pageserver loss
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
Before the options are discussed, note that deletion can be quite long process. For deletion from s3 the obvious choice is [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows to batch deletion of up to 1k objects in one API call. So deletion operation linearly depends on number of layer files.
|
||||
|
||||
Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`. So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation.
|
||||
|
||||
The case when there are multiple pageservers handling the same tenants is largely out of scope of the RFC. We still consider case with migration from one PS to another, but do not consider case when tenant exists on multiple pageservers for extended period of time. The case with multiple pageservers can be reduced to case with one pageservers by calling detach on all pageservers except the last one, for it actual delete needs to be called.
|
||||
|
||||
For simplicity lets look into deleting tenants. Differences in deletion process between tenants and timelines are mentioned in paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines)
|
||||
|
||||
### 1. Pageserver owns deletion machinery
|
||||
|
||||
#### The sequence
|
||||
|
||||
TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Lets see the sequence.
|
||||
|
||||
Happy path:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant CP as Control Plane
|
||||
participant PS as Pageserver
|
||||
participant S3
|
||||
|
||||
CP->>PS: Delete tenant
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
PS->>PS: Create deleted mark file locally
|
||||
PS->>CP: Accepted
|
||||
PS->>PS: delete local files other than deleted mark
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: False
|
||||
end
|
||||
PS->>S3: Delete mark file
|
||||
PS->>PS: Delete local mark file
|
||||
|
||||
loop Poll for status
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: True or False
|
||||
end
|
||||
```
|
||||
|
||||
Why two mark files?
|
||||
Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach.
|
||||
|
||||
Why local mark file is needed?
|
||||
|
||||
If we dont have one, we have two choices, delete local data before deleting the remote part or do that after.
|
||||
|
||||
If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants).
|
||||
|
||||
If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote.
|
||||
|
||||
Thus we need local record of tenant being deleted as well.
|
||||
|
||||
##### Handle pageserver crashes
|
||||
|
||||
Lets explore sequences with various crash points.
|
||||
|
||||
Pageserver crashes before `deleted` mark file is persisted in s3:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant CP as Control Plane
|
||||
participant PS as Pageserver
|
||||
participant S3
|
||||
|
||||
CP->>PS: Delete tenant
|
||||
note over PS: Crash point 1.
|
||||
CP->>PS: Retry delete request
|
||||
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
PS->>PS: Create deleted mark file locally
|
||||
|
||||
PS->>CP: Accepted
|
||||
|
||||
PS->>PS: delete local files other than deleted mark
|
||||
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: False
|
||||
end
|
||||
PS->>S3: Delete mark file
|
||||
PS->>PS: Delete local mark file
|
||||
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: True
|
||||
```
|
||||
|
||||
Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant CP as Control Plane
|
||||
participant PS as Pageserver
|
||||
participant S3
|
||||
|
||||
CP->>PS: Delete tenant
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
|
||||
note over PS: Crash point 2.
|
||||
note over PS: During startup we reconcile <br> with remote and see <br> whether the remote mark exists
|
||||
alt Remote mark exists
|
||||
PS->>PS: create local mark if its missing
|
||||
PS->>PS: delete local files other than deleted mark
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
end
|
||||
|
||||
note over CP: Eventually console should <br> retry delete request
|
||||
|
||||
CP->>PS: Retry delete tenant
|
||||
PS->>CP: Not modified
|
||||
else Mark is missing
|
||||
note over PS: Continue to operate the tenant as if deletion didnt happen
|
||||
|
||||
note over CP: Eventually console should <br> retry delete request
|
||||
|
||||
CP->>PS: Retry delete tenant
|
||||
PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
|
||||
PS->>CP: Delete tenant
|
||||
end
|
||||
|
||||
PS->>PS: Continue with layer file deletions
|
||||
loop Delete layers for each timeline
|
||||
PS->>S3: delete(..)
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: False
|
||||
end
|
||||
|
||||
PS->>S3: Delete mark file
|
||||
PS->>PS: Delete local mark file
|
||||
|
||||
CP->>PS: Finished?
|
||||
PS->>CP: True
|
||||
```
|
||||
|
||||
Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response.
|
||||
|
||||
If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success.
|
||||
|
||||
The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404.
|
||||
|
||||
##### Differences between tenants and timelines
|
||||
|
||||
For timeline the sequence is the same with the following differences:
|
||||
|
||||
- remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json
|
||||
- local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible
|
||||
|
||||
##### Handle pageserver loss
|
||||
|
||||
If pageseserver is lost then the deleted tenant should be attached to different pageserver and delete request needs to be retried against new pageserver. Then attach logic is shared with one described for pageserver restarts (local deletion mark wont be available so needs to be created).
|
||||
|
||||
##### Restrictions for tenant that is in progress of being deleted
|
||||
|
||||
I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status.
|
||||
|
||||
#### Summary
|
||||
|
||||
Pros:
|
||||
|
||||
- Storage is not dependent on control plane. Storage can be restarted even if control plane is not working.
|
||||
- Allows for easier dogfooding, console can use Neon backed database as primary operational data store. If storage depends on control plane and control plane depends on storage we're stuck.
|
||||
- No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract.
|
||||
- No need to pass list of alive timelines to attach call. This will be solved by pageserver observing deleted flag. See
|
||||
|
||||
Cons:
|
||||
|
||||
- Logic is a tricky, needs good testing
|
||||
- Anything else?
|
||||
|
||||
### 2. Control plane owns deletion machinery
|
||||
|
||||
In this case the only action performed on pageserver is removal of local files.
|
||||
|
||||
Everything else is done by control plane. The steps are as follows:
|
||||
|
||||
1. Control plane marks tenant as "delete pending" in its database
|
||||
2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind
|
||||
3. When no files are left marks deletion as completed
|
||||
|
||||
In case of restart it selects all tenants marked as "delete pending" and continues the deletion.
|
||||
|
||||
For tenants it is simple. For timelines there are caveats.
|
||||
|
||||
Assume that the same workflow is used for timelines.
|
||||
|
||||
If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state.
|
||||
|
||||
Available options:
|
||||
|
||||
- require list of alive timelines to be passed to attach call
|
||||
- use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks.
|
||||
|
||||
With first option the following problem becomes apparent:
|
||||
|
||||
Who is the source of truth regarding timeline liveness?
|
||||
|
||||
Imagine:
|
||||
PS1 fails.
|
||||
PS2 gets assigned the tenant.
|
||||
New branch gets created
|
||||
PS1 starts up (is it possible or we just recycle it?)
|
||||
PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
|
||||
|
||||
So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane.
|
||||
|
||||
### Summary
|
||||
|
||||
Cons:
|
||||
|
||||
- Potential thundering herd-like problem during storage restart (requests to control plane)
|
||||
- Potential increase in storage startup time (additional request to control plane)
|
||||
- Storage startup starts to depend on console
|
||||
- Erroneous attach call can attach tenant in half deleted state
|
||||
|
||||
Pros:
|
||||
|
||||
- Easier to reason about if you dont have to account for pageserver restarts
|
||||
|
||||
### Extra notes
|
||||
|
||||
There was a concern that having deletion code in pageserver is a littlebit scary, but we need to have this code somewhere. So to me it is equally scary to have that in whatever place it ends up at.
|
||||
|
||||
Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary.
|
||||
|
||||
## Decision
|
||||
|
||||
After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete.
|
||||
|
||||
To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes.
|
||||
|
||||
With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo.
|
||||
|
||||
So the decision is to proceed with pageserver centric approach.
|
||||
@@ -349,7 +349,7 @@ pub enum InMemoryLayerInfo {
|
||||
pub enum HistoricLayerInfo {
|
||||
Delta {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
layer_file_size: Option<u64>,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
@@ -360,7 +360,7 @@ pub enum HistoricLayerInfo {
|
||||
},
|
||||
Image {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
layer_file_size: Option<u64>,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
|
||||
@@ -767,7 +767,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
let err_to_send_and_errcode = match &end {
|
||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
||||
Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)),
|
||||
// Note: CopyFail in duplex copy is somewhat unexpected (at least to
|
||||
// PG walsender; evidently and per my docs reading client should
|
||||
// finish it with CopyDone). It is not a problem to recover from it
|
||||
|
||||
@@ -936,40 +936,35 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||
/// Standby status update / Hot standby feedback messages.
|
||||
// Neon extension of postgres replication protocol
|
||||
// See NEON_STATUS_UPDATE_TAG_BYTE
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct PageserverFeedback {
|
||||
/// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub struct ReplicationFeedback {
|
||||
// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub current_timeline_size: u64,
|
||||
/// LSN last received and ingested by the pageserver.
|
||||
pub last_received_lsn: u64,
|
||||
/// LSN up to which data is persisted by the pageserver to its local disc.
|
||||
pub disk_consistent_lsn: u64,
|
||||
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
|
||||
/// consider WAL before it can be removed.
|
||||
pub remote_consistent_lsn: u64,
|
||||
pub replytime: SystemTime,
|
||||
// Parts of StandbyStatusUpdate we resend to compute via safekeeper
|
||||
pub ps_writelsn: u64,
|
||||
pub ps_applylsn: u64,
|
||||
pub ps_flushlsn: u64,
|
||||
pub ps_replytime: SystemTime,
|
||||
}
|
||||
|
||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||
// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
|
||||
// Do not remove previously available fields because this might be backwards incompatible.
|
||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
|
||||
impl PageserverFeedback {
|
||||
pub fn empty() -> PageserverFeedback {
|
||||
PageserverFeedback {
|
||||
impl ReplicationFeedback {
|
||||
pub fn empty() -> ReplicationFeedback {
|
||||
ReplicationFeedback {
|
||||
current_timeline_size: 0,
|
||||
last_received_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
disk_consistent_lsn: 0,
|
||||
replytime: SystemTime::now(),
|
||||
ps_writelsn: 0,
|
||||
ps_applylsn: 0,
|
||||
ps_flushlsn: 0,
|
||||
ps_replytime: SystemTime::now(),
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize PageserverFeedback using custom format
|
||||
// Serialize ReplicationFeedback using custom format
|
||||
// to support protocol extensibility.
|
||||
//
|
||||
// Following layout is used:
|
||||
@@ -979,26 +974,24 @@ impl PageserverFeedback {
|
||||
// null-terminated string - key,
|
||||
// uint32 - value length in bytes
|
||||
// value itself
|
||||
//
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.last_received_lsn);
|
||||
buf.put_u64(self.ps_writelsn);
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.disk_consistent_lsn);
|
||||
buf.put_u64(self.ps_flushlsn);
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.remote_consistent_lsn);
|
||||
buf.put_u64(self.ps_applylsn);
|
||||
|
||||
let timestamp = self
|
||||
.replytime
|
||||
.ps_replytime
|
||||
.duration_since(*PG_EPOCH)
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
@@ -1008,10 +1001,9 @@ impl PageserverFeedback {
|
||||
buf.put_i64(timestamp);
|
||||
}
|
||||
|
||||
// Deserialize PageserverFeedback message
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn parse(mut buf: Bytes) -> PageserverFeedback {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Deserialize ReplicationFeedback message
|
||||
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
let nfields = buf.get_u8();
|
||||
for _ in 0..nfields {
|
||||
let key = read_cstr(&mut buf).unwrap();
|
||||
@@ -1024,39 +1016,39 @@ impl PageserverFeedback {
|
||||
b"ps_writelsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.last_received_lsn = buf.get_u64();
|
||||
rf.ps_writelsn = buf.get_u64();
|
||||
}
|
||||
b"ps_flushlsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.disk_consistent_lsn = buf.get_u64();
|
||||
rf.ps_flushlsn = buf.get_u64();
|
||||
}
|
||||
b"ps_applylsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.remote_consistent_lsn = buf.get_u64();
|
||||
rf.ps_applylsn = buf.get_u64();
|
||||
}
|
||||
b"ps_replytime" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
let raw_time = buf.get_i64();
|
||||
if raw_time > 0 {
|
||||
rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
} else {
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
"PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
"ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
String::from_utf8_lossy(key.as_ref())
|
||||
);
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
trace!("PageserverFeedback parsed is {:?}", rf);
|
||||
trace!("ReplicationFeedback parsed is {:?}", rf);
|
||||
rf
|
||||
}
|
||||
}
|
||||
@@ -1067,33 +1059,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_serialization() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
*first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
data.put_slice(b"new_field_one\0");
|
||||
@@ -1101,7 +1093,7 @@ mod tests {
|
||||
data.put_u64(42);
|
||||
|
||||
// Parse serialized data and check that new field is not parsed
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
|
||||
@@ -26,4 +26,3 @@ workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
|
||||
@@ -39,9 +39,6 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
@@ -67,10 +64,6 @@ impl RemotePath {
|
||||
pub fn object_name(&self) -> Option<&str> {
|
||||
self.0.file_name().and_then(|os_str| os_str.to_str())
|
||||
}
|
||||
|
||||
pub fn join(&self, segment: &Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -78,6 +71,9 @@ impl RemotePath {
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;
|
||||
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||
@@ -270,7 +266,6 @@ pub struct S3Config {
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
@@ -280,10 +275,6 @@ impl Debug for S3Config {
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -312,11 +303,6 @@ impl RemoteStorageConfig {
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
|
||||
let max_keys_per_list_response =
|
||||
parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
|
||||
.context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
|
||||
.or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
|
||||
(None, None, None) => return Ok(None),
|
||||
@@ -338,7 +324,6 @@ impl RemoteStorageConfig {
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
||||
parse_toml_string("local_path", local_path)?,
|
||||
|
||||
@@ -73,8 +73,10 @@ impl LocalFs {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
@@ -89,10 +91,7 @@ impl LocalFs {
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -102,7 +102,6 @@ pub struct S3Bucket {
|
||||
client: Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
@@ -165,7 +164,6 @@ impl S3Bucket {
|
||||
Ok(Self {
|
||||
client,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
max_keys_per_list_response: aws_config.max_keys_per_list_response,
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
|
||||
})
|
||||
@@ -275,6 +273,48 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
||||
|
||||
metrics::inc_list_objects();
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(self.prefix_in_bucket.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})?;
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
.contents
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it wont include empty "directories"
|
||||
async fn list_prefixes(
|
||||
@@ -314,7 +354,6 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
@@ -332,7 +371,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
match fetch_response.continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ pub struct UnreliableWrapper {
|
||||
/// Used to identify retries of different unique operation.
|
||||
#[derive(Debug, Hash, Eq, PartialEq)]
|
||||
enum RemoteOp {
|
||||
List,
|
||||
ListPrefixes(Option<RemotePath>),
|
||||
Upload(RemotePath),
|
||||
Download(RemotePath),
|
||||
@@ -74,6 +75,12 @@ impl UnreliableWrapper {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
self.attempt(RemoteOp::List)?;
|
||||
self.inner.list().await
|
||||
}
|
||||
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -1,275 +0,0 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
/// See the client creation in [`create_s3_client`] for details on the required env vars.
|
||||
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
///
|
||||
/// Then, verifies that the client does return correct prefixes when queried:
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
|
||||
/// since current default AWS S3 pagination limit is 1000.
|
||||
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
|
||||
};
|
||||
|
||||
let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix =
|
||||
RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
|
||||
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
enum MaybeEnabledS3 {
|
||||
Enabled(S3WithTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithTestBlobs),
|
||||
}
|
||||
|
||||
struct S3WithTestBlobs {
|
||||
client_with_excessive_pagination: Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
remote_prefixes: HashSet<RemotePath>,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3 {
|
||||
async fn setup() -> Self {
|
||||
utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
|
||||
.context("S3 client creation")
|
||||
.expect("S3 client creation failed");
|
||||
|
||||
let base_prefix_str = "test/";
|
||||
match upload_s3_data(
|
||||
&client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
upload_tasks_count,
|
||||
)
|
||||
.await
|
||||
{
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
Self::Enabled(S3WithTestBlobs {
|
||||
client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
})
|
||||
}
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
|
||||
S3WithTestBlobs {
|
||||
client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn teardown(self) {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
|
||||
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
|
||||
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
|
||||
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
|
||||
let random_prefix_part = std::time::SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.context("random s3 test prefix part calculation")?
|
||||
.as_millis();
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: remote_storage_s3_bucket,
|
||||
bucket_region: remote_storage_s3_region,
|
||||
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(max_keys_per_list_response),
|
||||
}),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
|
||||
let blob_prefix = RemotePath::new(&prefix)
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(
|
||||
Box::new(std::io::Cursor::new(data)),
|
||||
data_len,
|
||||
&blob_path,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -19,7 +19,6 @@ jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
regex.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
51
libs/utils/src/approx_accurate.rs
Normal file
51
libs/utils/src/approx_accurate.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
/// Three-state `max` accumulator.
|
||||
///
|
||||
/// If it accumulates over 0 or many `Some(T)` values, it is `Accurate` maximum of those values.
|
||||
/// If a single `None` value is merged, it becomes `Approximate` variant.
|
||||
///
|
||||
/// Remove when `Layer::file_size` is no longer an `Option`.
|
||||
#[derive(Default, Debug)]
|
||||
pub enum ApproxAccurate<T> {
|
||||
Approximate(T),
|
||||
Accurate(T),
|
||||
#[default]
|
||||
Empty,
|
||||
}
|
||||
|
||||
impl<T: Ord + Copy + Default> ApproxAccurate<T> {
|
||||
/// `max(a, b)` where the approximate is inflicted receiving a `None`, or infected onwards.
|
||||
#[must_use]
|
||||
pub fn max(self, next: Option<T>) -> ApproxAccurate<T> {
|
||||
use ApproxAccurate::*;
|
||||
match (self, next) {
|
||||
(Accurate(a) | Approximate(a), None) => Approximate(a),
|
||||
(Empty, None) => Approximate(T::default()),
|
||||
(Accurate(a), Some(b)) => Accurate(a.max(b)),
|
||||
(Approximate(a), Some(b)) => Approximate(a.max(b)),
|
||||
(Empty, Some(b)) => Accurate(b),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_approximate(&self) -> bool {
|
||||
matches!(self, ApproxAccurate::Approximate(_))
|
||||
}
|
||||
|
||||
pub fn accurate(self) -> Option<T> {
|
||||
use ApproxAccurate::*;
|
||||
match self {
|
||||
Accurate(a) => Some(a),
|
||||
Empty => Some(T::default()),
|
||||
Approximate(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unwrap_accurate_or(self, default: T) -> T {
|
||||
use ApproxAccurate::*;
|
||||
match self {
|
||||
Accurate(a) => a,
|
||||
Approximate(_) => default,
|
||||
// Empty is still accurate, just special case for above `max`
|
||||
Empty => T::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,9 +20,6 @@ pub enum ApiError {
|
||||
#[error("Conflict: {0}")]
|
||||
Conflict(String),
|
||||
|
||||
#[error("Precondition failed: {0}")]
|
||||
PreconditionFailed(&'static str),
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -47,10 +44,6 @@ impl ApiError {
|
||||
ApiError::Conflict(_) => {
|
||||
HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT)
|
||||
}
|
||||
ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status(
|
||||
self.to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -23,7 +23,7 @@ pub enum IdError {
|
||||
struct Id([u8; 16]);
|
||||
|
||||
impl Id {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
Id::from(arr)
|
||||
@@ -112,7 +112,7 @@ impl fmt::Debug for Id {
|
||||
macro_rules! id_newtype {
|
||||
($t:ident) => {
|
||||
impl $t {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t {
|
||||
$t(Id::get_from_buf(buf))
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ pub mod pid_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod approx_accurate;
|
||||
pub mod shutdown;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
@@ -51,9 +52,6 @@ pub mod history_buffer;
|
||||
|
||||
pub mod measured_stream;
|
||||
|
||||
pub mod serde_percent;
|
||||
pub mod serde_regex;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
#[macro_export]
|
||||
macro_rules! failpoint_sleep_millis_async {
|
||||
|
||||
@@ -1,83 +0,0 @@
|
||||
//! A serde::Deserialize type for percentages.
|
||||
//!
|
||||
//! See [`Percent`] for details.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// If the value is not an integer between 0 and 100,
|
||||
/// deserialization fails with a descriptive error.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
|
||||
|
||||
impl Percent {
|
||||
pub fn get(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u8, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let v: u8 = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
if v > 100 {
|
||||
return Err(serde::de::Error::custom(
|
||||
"must be an integer between 0 and 100",
|
||||
));
|
||||
}
|
||||
Ok(v)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Percent;
|
||||
|
||||
#[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)]
|
||||
struct Foo {
|
||||
bar: Percent,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basics() {
|
||||
let input = r#"{ "bar": 50 }"#;
|
||||
let foo: Foo = serde_json::from_str(input).unwrap();
|
||||
assert_eq!(foo.bar.get(), 50);
|
||||
}
|
||||
#[test]
|
||||
fn null_handling() {
|
||||
let input = r#"{ "bar": null }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn zero() {
|
||||
let input = r#"{ "bar": 0 }"#;
|
||||
let foo: Foo = serde_json::from_str(input).unwrap();
|
||||
assert_eq!(foo.bar.get(), 0);
|
||||
}
|
||||
#[test]
|
||||
fn out_of_range_above() {
|
||||
let input = r#"{ "bar": 101 }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn out_of_range_below() {
|
||||
let input = r#"{ "bar": -1 }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn float() {
|
||||
let input = r#"{ "bar": 50.5 }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
#[test]
|
||||
fn string() {
|
||||
let input = r#"{ "bar": "50 %" }"#;
|
||||
let res: Result<Foo, _> = serde_json::from_str(input);
|
||||
assert!(res.is_err());
|
||||
}
|
||||
}
|
||||
@@ -1,60 +0,0 @@
|
||||
//! A `serde::{Deserialize,Serialize}` type for regexes.
|
||||
|
||||
use std::ops::Deref;
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct Regex(
|
||||
#[serde(
|
||||
deserialize_with = "deserialize_regex",
|
||||
serialize_with = "serialize_regex"
|
||||
)]
|
||||
regex::Regex,
|
||||
);
|
||||
|
||||
fn deserialize_regex<'de, D>(deserializer: D) -> Result<regex::Regex, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?;
|
||||
Ok(re)
|
||||
}
|
||||
|
||||
fn serialize_regex<S>(re: ®ex::Regex, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
{
|
||||
serializer.collect_str(re.as_str())
|
||||
}
|
||||
|
||||
impl Deref for Regex {
|
||||
type Target = regex::Regex;
|
||||
|
||||
fn deref(&self) -> ®ex::Regex {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Regex {
|
||||
fn eq(&self, other: &Regex) -> bool {
|
||||
// comparing the automatons would be quite complicated
|
||||
self.as_str() == other.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Regex {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
#[test]
|
||||
fn roundtrip() {
|
||||
let input = r#""foo.*bar""#;
|
||||
let re: super::Regex = serde_json::from_str(input).unwrap();
|
||||
assert!(re.is_match("foo123bar"));
|
||||
assert!(!re.is_match("foo"));
|
||||
let output = serde_json::to_string(&re).unwrap();
|
||||
assert_eq!(output, input);
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,25 @@
|
||||
use signal_hook::flag;
|
||||
use signal_hook::iterator::Signals;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub use signal_hook::consts::{signal::*, TERM_SIGNALS};
|
||||
|
||||
pub fn install_shutdown_handlers() -> anyhow::Result<ShutdownSignals> {
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
|
||||
Ok(ShutdownSignals)
|
||||
}
|
||||
|
||||
pub enum Signal {
|
||||
Quit,
|
||||
Interrupt,
|
||||
@@ -21,7 +39,10 @@ impl Signal {
|
||||
pub struct ShutdownSignals;
|
||||
|
||||
impl ShutdownSignals {
|
||||
pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> {
|
||||
pub fn handle(
|
||||
self,
|
||||
mut handler: impl FnMut(Signal) -> anyhow::Result<()>,
|
||||
) -> anyhow::Result<()> {
|
||||
for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() {
|
||||
let signal = match raw_signal {
|
||||
SIGINT => Signal::Interrupt,
|
||||
|
||||
@@ -56,6 +56,7 @@ tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
ubyte = { version = "0.10.3", features = ["serde"] }
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
|
||||
@@ -8,7 +8,7 @@ use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::disk_usage_eviction_task::launch_disk_usage_global_eviction_task;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
|
||||
@@ -25,9 +25,11 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
|
||||
auth::JwtAuth,
|
||||
logging, project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
@@ -262,6 +264,9 @@ fn start_pageserver(
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
// Install signal handlers
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// Launch broker client
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
|
||||
|
||||
@@ -315,18 +320,8 @@ fn start_pageserver(
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
|
||||
|
||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||
// is still accessible even if background task is not configured as long as remote storage has
|
||||
// been configured.
|
||||
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
|
||||
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
launch_disk_usage_global_eviction_task(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
)?;
|
||||
launch_disk_usage_global_eviction_task(conf, remote_storage.clone())?;
|
||||
}
|
||||
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
@@ -334,15 +329,9 @@ fn start_pageserver(
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let router = http::make_router(
|
||||
conf,
|
||||
launch_ts,
|
||||
http_auth,
|
||||
remote_storage,
|
||||
disk_usage_eviction_state,
|
||||
)?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router = http::make_router(conf, launch_ts, http_auth, remote_storage)?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
@@ -425,7 +414,7 @@ fn start_pageserver(
|
||||
}
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
|
||||
@@ -93,8 +93,6 @@ pub mod defaults {
|
||||
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
|
||||
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -107,8 +105,6 @@ pub mod defaults {
|
||||
#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
|
||||
#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
|
||||
|
||||
#min_resident_size_override = .. # in bytes
|
||||
|
||||
# [remote_storage]
|
||||
|
||||
"###
|
||||
@@ -170,10 +166,6 @@ pub struct PageServerConf {
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||
/// See the comment in `eviction_task` for details.
|
||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
pub metric_collection_interval: Duration,
|
||||
@@ -250,7 +242,7 @@ struct PageServerConfigBuilder {
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||
@@ -299,9 +291,7 @@ impl Default for PageServerConfigBuilder {
|
||||
.expect("cannot parse default keepalive interval")),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(
|
||||
ConfigurableSemaphore::DEFAULT_INITIAL,
|
||||
),
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
metric_collection_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
@@ -406,7 +396,7 @@ impl PageServerConfigBuilder {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
@@ -455,11 +445,6 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?;
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
@@ -507,12 +492,11 @@ impl PageServerConfigBuilder {
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
concurrent_tenant_size_logical_size_queries: self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
metric_collection_interval: self
|
||||
.metric_collection_interval
|
||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
||||
@@ -710,7 +694,8 @@ impl PageServerConf {
|
||||
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
|
||||
let input = parse_toml_string(key, item)?;
|
||||
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
|
||||
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
}),
|
||||
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
@@ -871,8 +856,6 @@ impl PageServerConf {
|
||||
broker_keepalive_interval: Duration::from_secs(5000),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
|
||||
),
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
@@ -966,11 +949,6 @@ impl ConfigurableSemaphore {
|
||||
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the configured amount of permits.
|
||||
pub fn initial_permits(&self) -> NonZeroUsize {
|
||||
self.initial_permits
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConfigurableSemaphore {
|
||||
@@ -1075,8 +1053,6 @@ log_format = 'json'
|
||||
)?,
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
@@ -1138,8 +1114,6 @@ log_format = 'json'
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
@@ -1272,7 +1246,6 @@ broker_endpoint = '{broker_endpoint}'
|
||||
prefix_in_bucket: Some(prefix_in_bucket.clone()),
|
||||
endpoint: Some(endpoint.clone()),
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
max_keys_per_list_response: None,
|
||||
}),
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
//! This module implements the pageserver-global disk-usage-based layer eviction task.
|
||||
//!
|
||||
//! # Mechanics
|
||||
//!
|
||||
//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
|
||||
//! loop that evicts layers in response to a shortage of available bytes
|
||||
//! in the $repo/tenants directory's filesystem.
|
||||
@@ -15,79 +13,80 @@
|
||||
//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
|
||||
//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
|
||||
//!
|
||||
//! # Eviction Policy
|
||||
//!
|
||||
//! There are two thresholds:
|
||||
//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space.
|
||||
//! If the actual usage is higher, the threshold is exceeded.
|
||||
//! `min_avail_bytes` is the absolute available space in bytes.
|
||||
//! If the actual usage is lower, the threshold is exceeded.
|
||||
//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
|
||||
//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
|
||||
//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
|
||||
//! The reservation is to keep the most recently accessed X bytes per tenant resident.
|
||||
//! If we cannot relieve pressure by evicting layers outside of the reservation, we
|
||||
//! start evicting layers that are part of the reservation, LRU first.
|
||||
//!
|
||||
//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
|
||||
//! throughout the code, but, no actual variable carries that name.
|
||||
//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
|
||||
//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
|
||||
//! during page reconstruction.
|
||||
//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
|
||||
//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
|
||||
|
||||
// Implementation notes:
|
||||
// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
|
||||
// reading these fields. We use the Debug impl for semi-structured logging, though.
|
||||
|
||||
//! The iteration evicts layers in LRU fashion.
|
||||
//! It tries first with a reservation of up to `tenant_min_resident_size` bytes of the most recent layers per tenant.
|
||||
//! The layers not part of the per-tenant reservation are evicted least-recently-used first until we're below all thresholds.
|
||||
//! If the per-tenant-reservation strategy doesn't work out, it falls back to global LRU.
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
path::Path,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
ops::ControlFlow,
|
||||
sync::{Arc, Mutex},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::dir::Dir;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sync_wrapper::SyncWrapper;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::serde_percent::Percent;
|
||||
use utils::{approx_accurate::ApproxAccurate, id::TenantId};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, Timeline},
|
||||
tenant::{self, LocalLayerInfoForDiskUsageEviction, Timeline},
|
||||
};
|
||||
|
||||
fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u64, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let v: u64 = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
if v > 100 {
|
||||
return Err(serde::de::Error::custom(
|
||||
"must be an integer between 0 and 100",
|
||||
));
|
||||
}
|
||||
Ok(v)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: Percent,
|
||||
#[serde(deserialize_with = "deserialize_pct_0_to_100")]
|
||||
pub max_usage_pct: u64,
|
||||
pub min_avail_bytes: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[cfg(feature = "testing")]
|
||||
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct State {
|
||||
/// Exclude http requests and background task from running at the same time.
|
||||
mutex: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
pub fn launch_disk_usage_global_eviction_task(
|
||||
conf: &'static PageServerConf,
|
||||
storage: GenericRemoteStorage,
|
||||
state: Arc<State>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
info!("disk usage based eviction task not configured");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let tenants_dir_fd = {
|
||||
let tenants_path = conf.tenants_path();
|
||||
nix::dir::Dir::open(
|
||||
&tenants_path,
|
||||
nix::fcntl::OFlag::O_DIRECTORY,
|
||||
nix::sys::stat::Mode::empty(),
|
||||
)
|
||||
.with_context(|| format!("open tenants_path {tenants_path:?}"))?
|
||||
};
|
||||
|
||||
info!("launching disk usage based eviction task");
|
||||
|
||||
task_mgr::spawn(
|
||||
@@ -99,10 +98,9 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
false,
|
||||
async move {
|
||||
disk_usage_eviction_task(
|
||||
&state,
|
||||
task_config,
|
||||
storage,
|
||||
&conf.tenants_path(),
|
||||
tenants_dir_fd,
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await;
|
||||
@@ -116,12 +114,19 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn disk_usage_eviction_task(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: GenericRemoteStorage,
|
||||
tenants_dir: &Path,
|
||||
tenants_dir_fd: Dir,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
// nix::dir::Dir is Send but not Sync.
|
||||
// One would think that that is sufficient, but rustc complains that the &tenants_dir_fd
|
||||
// that we pass to disk_usage_eviction_iteration below will outlive the .await;
|
||||
// The reason is that the &tenants_dir_fd is not sync because of stdlib-enforced axiom
|
||||
// T: Sync <=> &T: Send
|
||||
// The solution is to use SyncWrapper, which, by owning the tenants_dir_fd, can impl Sync.
|
||||
let mut tenants_dir_fd = SyncWrapper::new(tenants_dir_fd);
|
||||
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
if random_init_delay(task_config.period, &cancel)
|
||||
@@ -140,10 +145,9 @@ async fn disk_usage_eviction_task(
|
||||
|
||||
async {
|
||||
let res = disk_usage_eviction_task_iteration(
|
||||
state,
|
||||
task_config,
|
||||
&storage,
|
||||
tenants_dir,
|
||||
&mut tenants_dir_fd,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
@@ -176,15 +180,14 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_task_iteration(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenants_dir: &Path,
|
||||
tenants_dir_fd: &mut SyncWrapper<Dir>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir_fd, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
let res = disk_usage_eviction_task_iteration_impl(storage, usage_pre, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -194,7 +197,7 @@ async fn disk_usage_eviction_task_iteration(
|
||||
}
|
||||
IterationOutcome::Finished(outcome) => {
|
||||
// Verify with statvfs whether we made any real progress
|
||||
let after = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
let after = filesystem_level_usage::get(tenants_dir_fd, task_config)
|
||||
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
|
||||
.context("get filesystem-level disk usage after evictions")?;
|
||||
|
||||
@@ -229,6 +232,8 @@ pub enum IterationOutcome<U> {
|
||||
Finished(IterationOutcomeFinished<U>),
|
||||
}
|
||||
|
||||
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
|
||||
// We use the Debug impl for logging, so, it's allright.
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct IterationOutcomeFinished<U> {
|
||||
@@ -243,6 +248,8 @@ pub struct IterationOutcomeFinished<U> {
|
||||
assumed: AssumedUsage<U>,
|
||||
}
|
||||
|
||||
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
|
||||
// We use the Debug impl for logging, so, it's allright.
|
||||
#[derive(Debug, Serialize)]
|
||||
#[allow(dead_code)]
|
||||
struct AssumedUsage<U> {
|
||||
@@ -252,6 +259,8 @@ struct AssumedUsage<U> {
|
||||
failed: LayerCount,
|
||||
}
|
||||
|
||||
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
|
||||
// We use the Debug impl for logging, so, it's allright.
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
struct PlannedUsage<U> {
|
||||
@@ -266,18 +275,26 @@ struct LayerCount {
|
||||
count: usize,
|
||||
}
|
||||
|
||||
#[allow(clippy::needless_late_init)]
|
||||
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
let _g = state
|
||||
.mutex
|
||||
static MUTEX: once_cell::sync::Lazy<tokio::sync::Mutex<()>> =
|
||||
once_cell::sync::Lazy::new(|| tokio::sync::Mutex::new(()));
|
||||
|
||||
let _g = MUTEX
|
||||
.try_lock()
|
||||
.map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
|
||||
|
||||
// planned post-eviction usage
|
||||
let mut usage_planned_min_resident_size_respecting = usage_pre;
|
||||
let mut usage_planned_global_lru = None;
|
||||
// achieved post-eviction usage according to internal accounting
|
||||
let mut usage_assumed = usage_pre;
|
||||
// actual usage read after batched evictions
|
||||
|
||||
debug!(?usage_pre, "disk usage");
|
||||
|
||||
if !usage_pre.has_pressure() {
|
||||
@@ -289,46 +306,42 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
EvictionCandidates::Finished(partitioned) => partitioned,
|
||||
};
|
||||
let mut lru_candidates: Vec<(_, LocalLayerInfoForDiskUsageEviction)> = Vec::new();
|
||||
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
candidate.layer.file_size(),
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
candidate.layer.get_tenant_id(),
|
||||
candidate.layer.get_timeline_id(),
|
||||
candidate.layer.filename().file_name(),
|
||||
);
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
{
|
||||
let mut tmp = Vec::new();
|
||||
for (tenant_id, _state) in &tenants {
|
||||
let flow = extend_lru_candidates(
|
||||
Mode::RespectTenantMinResidentSize,
|
||||
*tenant_id,
|
||||
&mut lru_candidates,
|
||||
&mut tmp,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
if let ControlFlow::Break(()) = flow {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
|
||||
assert!(tmp.is_empty(), "tmp has to be fully drained each iteration");
|
||||
}
|
||||
}
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
|
||||
// phase1: select victims to relieve pressure
|
||||
//
|
||||
// Walk through the list of candidates, until we have accumulated enough layers to get
|
||||
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
|
||||
// how much disk space would be used after evicting all the layers up to the current
|
||||
// point in the list. The layers are collected in 'batched', grouped per timeline.
|
||||
//
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts);
|
||||
let mut batched: HashMap<_, Vec<LocalLayerInfoForDiskUsageEviction>> = HashMap::new();
|
||||
for (i, (timeline, layer)) in lru_candidates.into_iter().enumerate() {
|
||||
if !usage_planned_min_resident_size_respecting.has_pressure() {
|
||||
debug!(
|
||||
no_candidates_evicted = i,
|
||||
"took enough candidates for pressure to be relieved"
|
||||
@@ -336,40 +349,66 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
break;
|
||||
}
|
||||
|
||||
if partition == MinResidentSizePartition::Below && warned.is_none() {
|
||||
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
|
||||
warned = Some(usage_planned);
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.file_size());
|
||||
usage_planned_min_resident_size_respecting.add_available_bytes(layer.file_size());
|
||||
|
||||
batched
|
||||
.entry(TimelineKey(candidate.timeline))
|
||||
.entry(TimelineKey(timeline.clone()))
|
||||
.or_default()
|
||||
.push(candidate.layer);
|
||||
.push(layer);
|
||||
}
|
||||
// If we can't relieve pressure while respecting tenant_min_resident_size, fall back to global LRU.
|
||||
if usage_planned_min_resident_size_respecting.has_pressure() {
|
||||
// NB: tests depend on parts of this log message
|
||||
warn!(?usage_pre, ?usage_planned_min_resident_size_respecting, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU");
|
||||
batched.clear();
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut global_lru_candidates = Vec::new();
|
||||
let mut tmp = Vec::new();
|
||||
for (tenant_id, _state) in &tenants {
|
||||
let flow = extend_lru_candidates(
|
||||
Mode::GlobalLru,
|
||||
*tenant_id,
|
||||
&mut global_lru_candidates,
|
||||
&mut tmp,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
let usage_planned = match warned {
|
||||
Some(respecting_tenant_min_resident_size) => PlannedUsage {
|
||||
respecting_tenant_min_resident_size,
|
||||
fallback_to_global_lru: Some(usage_planned),
|
||||
},
|
||||
None => PlannedUsage {
|
||||
respecting_tenant_min_resident_size: usage_planned,
|
||||
fallback_to_global_lru: None,
|
||||
},
|
||||
if let ControlFlow::Break(()) = flow {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
|
||||
assert!(tmp.is_empty(), "tmp has to be fully drained each iteration");
|
||||
}
|
||||
global_lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts);
|
||||
for (timeline, layer) in global_lru_candidates {
|
||||
usage_planned.add_available_bytes(layer.file_size());
|
||||
batched
|
||||
.entry(TimelineKey(timeline.clone()))
|
||||
.or_default()
|
||||
.push(layer);
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
}
|
||||
usage_planned_global_lru = Some(usage_planned);
|
||||
}
|
||||
let usage_planned = PlannedUsage {
|
||||
respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting,
|
||||
fallback_to_global_lru: usage_planned_global_lru,
|
||||
};
|
||||
|
||||
debug!(?usage_planned, "usage planned");
|
||||
|
||||
// phase2: evict victims batched by timeline
|
||||
|
||||
// After the loop, `usage_assumed` is the post-eviction usage,
|
||||
// according to internal accounting.
|
||||
let mut usage_assumed = usage_pre;
|
||||
let mut batch = Vec::new();
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
for (timeline, batch) in batched {
|
||||
for (timeline, layers) in batched {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
batch.clear();
|
||||
batch.extend(layers.iter().map(|x| &x.layer).cloned());
|
||||
let batch_size = batch.len();
|
||||
|
||||
debug!(%timeline_id, "evicting batch for timeline");
|
||||
@@ -382,8 +421,8 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
warn!("failed to evict batch: {:#}", e);
|
||||
}
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
assert_eq!(results.len(), layers.len());
|
||||
for (result, layer) in results.into_iter().zip(layers.iter()) {
|
||||
match result {
|
||||
Some(Ok(true)) => {
|
||||
usage_assumed.add_available_bytes(layer.file_size());
|
||||
@@ -426,161 +465,110 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
}))
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
last_activity_ts: SystemTime,
|
||||
/// Different modes of gathering tenant's least recently used layers.
|
||||
#[derive(Debug)]
|
||||
enum Mode {
|
||||
/// Add all but the most recently used `min_resident_size` worth of layers to the candidates
|
||||
/// list.
|
||||
///
|
||||
/// `min_resident_size` defaults to maximum layer file size of the tenant. This ensures that
|
||||
/// the tenant will always have one layer resident. If we cannot compute `min_resident_size`
|
||||
/// accurately because metadata is missing we use hardcoded constant. `min_resident_size` can
|
||||
/// be overridden per tenant for important tenants.
|
||||
RespectTenantMinResidentSize,
|
||||
/// Consider all layer files from all tenants in LRU order.
|
||||
///
|
||||
/// This is done if the `min_resident_size` respecting does not relieve pressure.
|
||||
GlobalLru,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum MinResidentSizePartition {
|
||||
Above,
|
||||
Below,
|
||||
}
|
||||
|
||||
enum EvictionCandidates {
|
||||
Cancelled,
|
||||
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
|
||||
}
|
||||
|
||||
/// Gather the eviction candidates.
|
||||
///
|
||||
/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
|
||||
/// order. A caller that evicts in that order, until pressure is relieved, implements
|
||||
/// the eviction policy outlined in the module comment.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
|
||||
/// Each layer has size 100, and both tenant's min_resident_size is 150.
|
||||
/// The eviction order would be
|
||||
///
|
||||
/// ```text
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
|
||||
/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
|
||||
///
|
||||
/// But, if we need to evict 900 bytes to relieve pressure, we'd evict
|
||||
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
|
||||
/// after exhauting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
async fn collect_eviction_candidates(
|
||||
#[instrument(skip_all, fields(?mode, %tenant_id))]
|
||||
async fn extend_lru_candidates(
|
||||
mode: Mode,
|
||||
tenant_id: TenantId,
|
||||
lru_candidates: &mut Vec<(Arc<Timeline>, LocalLayerInfoForDiskUsageEviction)>,
|
||||
scratch: &mut Vec<(Arc<Timeline>, LocalLayerInfoForDiskUsageEviction)>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
) -> ControlFlow<()> {
|
||||
debug!("begin");
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
let tenant = match tenant::mgr::get_tenant(tenant_id, true).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
};
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
let mut max_layer_size = ApproxAccurate::default();
|
||||
for tl in tenant.list_timelines() {
|
||||
if !tl.is_active() {
|
||||
continue;
|
||||
}
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction();
|
||||
debug!(timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
||||
scratch.extend(
|
||||
info.resident_layers
|
||||
.into_iter()
|
||||
.map(|layer_infos| (tl.clone(), layer_infos)),
|
||||
);
|
||||
max_layer_size = max_layer_size.max(info.max_layer_size.accurate());
|
||||
|
||||
for (tenant_id, _state) in &tenants {
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// collect layers from all timelines in this tenant
|
||||
//
|
||||
// If one of the timelines becomes `!is_active()` during the iteration,
|
||||
// for example because we're shutting down, then `max_layer_size` can be too small.
|
||||
// That's OK. This code only runs under a disk pressure situation, and being
|
||||
// a little unfair to tenants during shutdown in such a situation is tolerable.
|
||||
let mut tenant_candidates = Vec::new();
|
||||
let mut max_layer_size = 0;
|
||||
for tl in tenant.list_timelines() {
|
||||
if !tl.is_active() {
|
||||
continue;
|
||||
}
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction();
|
||||
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
||||
tenant_candidates.extend(
|
||||
info.resident_layers
|
||||
.into_iter()
|
||||
.map(|layer_infos| (tl.clone(), layer_infos)),
|
||||
);
|
||||
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
}
|
||||
|
||||
// `min_resident_size` defaults to maximum layer file size of the tenant.
|
||||
// This ensures that each tenant can have at least one layer resident at a given time,
|
||||
// ensuring forward progress for a single Timeline::get in that tenant.
|
||||
// It's a questionable heuristic since, usually, there are many Timeline::get
|
||||
// requests going on for a tenant, and, at least in Neon prod, the median
|
||||
// layer file size is much smaller than the compaction target size.
|
||||
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
|
||||
// That's what's typically used by the various background loops.
|
||||
//
|
||||
// The default can be overriden with a fixed value in the tenant conf.
|
||||
// A default override can be put in the default tenant conf in the pageserver.toml.
|
||||
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
overriden_size=s,
|
||||
"using overridden min resident size for tenant"
|
||||
);
|
||||
s
|
||||
} else {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
max_layer_size,
|
||||
"using max layer size as min_resident_size for tenant",
|
||||
);
|
||||
max_layer_size
|
||||
};
|
||||
|
||||
// Sort layers most-recently-used first, then partition by
|
||||
// cumsum above/below min_resident_size.
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
let file_size = layer_info.file_size();
|
||||
let candidate = EvictionCandidate {
|
||||
timeline,
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
};
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
} else {
|
||||
MinResidentSizePartition::Below
|
||||
};
|
||||
candidates.push((partition, candidate));
|
||||
cumsum += i128::from(file_size);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
let min_resident_size = match mode {
|
||||
Mode::GlobalLru => {
|
||||
lru_candidates.append(scratch);
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Mode::RespectTenantMinResidentSize => match tenant.get_min_resident_size_override() {
|
||||
Some(size) => size,
|
||||
None => {
|
||||
match max_layer_size.accurate() {
|
||||
Some(size) => size,
|
||||
None => {
|
||||
let prod_max_layer_file_size = 332_880_000;
|
||||
// rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers
|
||||
static LAST_WARNED: Mutex<Option<Instant>> = Mutex::new(None);
|
||||
let mut last_warned = LAST_WARNED.lock().unwrap();
|
||||
if last_warned
|
||||
.map(|v| v.elapsed() > Duration::from_secs(60))
|
||||
.unwrap_or(true)
|
||||
{
|
||||
warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value");
|
||||
*last_warned = Some(Instant::now());
|
||||
}
|
||||
prod_max_layer_file_size
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
scratch.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts);
|
||||
|
||||
let mut current: u64 = scratch.iter().map(|(_, layer)| layer.file_size()).sum();
|
||||
for (tl, layer) in scratch.drain(..) {
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
if current <= min_resident_size {
|
||||
break;
|
||||
}
|
||||
current -= layer.file_size();
|
||||
debug!(?layer, "adding layer to lru_candidates");
|
||||
lru_candidates.push((tl, layer));
|
||||
}
|
||||
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
struct TimelineKey(Arc<Timeline>);
|
||||
@@ -608,14 +596,17 @@ impl std::ops::Deref for TimelineKey {
|
||||
}
|
||||
|
||||
mod filesystem_level_usage {
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use crate::statvfs::Statvfs;
|
||||
use nix::{
|
||||
dir::Dir,
|
||||
sys::statvfs::{self, Statvfs},
|
||||
};
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
use super::DiskUsageEvictionTaskConfig;
|
||||
|
||||
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
|
||||
// We use the Debug impl for logging, so, it's allright.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[allow(dead_code)]
|
||||
pub struct Usage<'a> {
|
||||
@@ -637,10 +628,7 @@ mod filesystem_level_usage {
|
||||
"min_avail_bytes",
|
||||
self.avail_bytes < self.config.min_avail_bytes,
|
||||
),
|
||||
(
|
||||
"max_usage_pct",
|
||||
usage_pct > self.config.max_usage_pct.get() as u64,
|
||||
),
|
||||
("max_usage_pct", usage_pct > self.config.max_usage_pct),
|
||||
];
|
||||
|
||||
pressures.into_iter().any(|(_, has_pressure)| has_pressure)
|
||||
@@ -652,21 +640,10 @@ mod filesystem_level_usage {
|
||||
}
|
||||
|
||||
pub fn get<'a>(
|
||||
tenants_dir: &Path,
|
||||
tenants_dir_fd: &mut SyncWrapper<Dir>,
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
) -> anyhow::Result<Usage<'a>> {
|
||||
let mock_config = {
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
config.mock_statvfs.as_ref()
|
||||
}
|
||||
#[cfg(not(feature = "testing"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let stat = Statvfs::get(tenants_dir, mock_config)
|
||||
let stat: Statvfs = statvfs::fstatvfs(tenants_dir_fd.get_mut())
|
||||
.context("statvfs failed, presumably directory got unlinked")?;
|
||||
|
||||
// https://unix.stackexchange.com/a/703650
|
||||
|
||||
@@ -40,7 +40,11 @@ paths:
|
||||
- evict_bytes
|
||||
properties:
|
||||
evict_bytes:
|
||||
type: integer
|
||||
description: Unsigned bytes or a human writable amount of bytes with IEC kibibyte suffixes.
|
||||
oneOf:
|
||||
- type: integer
|
||||
- type: string
|
||||
pattern: '^[0-9]+ ?(([KMGTP]i)?B)?$'
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
@@ -208,19 +212,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"412":
|
||||
description: Tenant is missing
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -389,13 +380,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: detach_ignored
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, allow to detach a tenant which state is ignored.
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
|
||||
@@ -421,12 +405,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -898,9 +876,13 @@ components:
|
||||
type: object
|
||||
properties:
|
||||
tenant_specific_overrides:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
type: object
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
effective_config:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
type: object
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -986,13 +968,6 @@ components:
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
PreconditionFailedError:
|
||||
type: object
|
||||
required:
|
||||
- msg
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
|
||||
@@ -18,7 +18,6 @@ use super::models::{
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::disk_usage_eviction_task;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
@@ -49,7 +48,6 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -57,7 +55,6 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
@@ -68,7 +65,6 @@ impl State {
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_storage,
|
||||
disk_usage_eviction_state,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -135,34 +131,6 @@ impl From<TenantStateError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
|
||||
HasChildren => ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot delete timeline which has child timelines"
|
||||
)),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||
match value {
|
||||
// Report Precondition failed so client can distinguish between
|
||||
// "tenant is missing" case from "timeline is missing"
|
||||
Tenant(TenantStateError::NotFound(..)) => {
|
||||
ApiError::PreconditionFailed("Requested tenant is missing")
|
||||
}
|
||||
Tenant(t) => ApiError::from(t),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
@@ -217,7 +185,7 @@ fn build_timeline_info_common(
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum());
|
||||
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
@@ -416,11 +384,10 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
||||
mgr::detach_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
@@ -484,7 +451,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
current_physical_size += timeline.layer_size_sum();
|
||||
current_physical_size += timeline.layer_size_sum().approximate_is_ok();
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
@@ -922,20 +889,6 @@ async fn update_tenant_config_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
|
||||
#[cfg(feature = "testing")]
|
||||
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
||||
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test");
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
@@ -1085,89 +1038,6 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
|
||||
struct Config {
|
||||
/// How many bytes to evict before reporting that pressure is relieved.
|
||||
evict_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
struct Usage {
|
||||
// remains unchanged after instantiation of the struct
|
||||
config: Config,
|
||||
// updated by `add_available_bytes`
|
||||
freed_bytes: u64,
|
||||
}
|
||||
|
||||
impl crate::disk_usage_eviction_task::Usage for Usage {
|
||||
fn has_pressure(&self) -> bool {
|
||||
self.config.evict_bytes > self.freed_bytes
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
self.freed_bytes += bytes;
|
||||
}
|
||||
}
|
||||
|
||||
let config = json_request::<Config>(&mut r)
|
||||
.await
|
||||
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
|
||||
|
||||
let usage = Usage {
|
||||
config,
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)))
|
||||
};
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let child_cancel = cancel.clone();
|
||||
let _g = cancel.drop_guard();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
"ondemand disk usage eviction",
|
||||
false,
|
||||
async move {
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state,
|
||||
&storage,
|
||||
usage,
|
||||
&child_cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
||||
|
||||
let _ = tx.send(res);
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1180,7 +1050,6 @@ pub fn make_router(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
@@ -1225,8 +1094,7 @@ pub fn make_router(
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_storage, disk_usage_eviction_state)
|
||||
.context("Failed to initialize router state")?,
|
||||
State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
|
||||
.put(
|
||||
@@ -1317,3 +1185,151 @@ pub fn make_router(
|
||||
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
|
||||
#[cfg(feature = "testing")]
|
||||
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
use std::str::FromStr;
|
||||
let tenant_id = get_request_param(&r, "tenant_id")?;
|
||||
let tenant_id = TenantId::from_str(tenant_id).map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test");
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
|
||||
struct Config {
|
||||
/// How many bytes to evict before reporting that pressure is relieved.
|
||||
#[serde(
|
||||
deserialize_with = "deserialize_bytes",
|
||||
serialize_with = "serialize_bytes"
|
||||
)]
|
||||
evict_bytes: u64,
|
||||
}
|
||||
|
||||
fn serialize_bytes<S: serde::Serializer>(x: &u64, ser: S) -> Result<S::Ok, S::Error> {
|
||||
use ubyte::ByteUnit;
|
||||
|
||||
let x = ByteUnit::from(*x);
|
||||
|
||||
// ByteUnit has a nice lossy serialization format as it's Display
|
||||
ser.collect_str(&x)
|
||||
}
|
||||
|
||||
fn deserialize_bytes<'d, D: serde::Deserializer<'d>>(des: D) -> Result<u64, D::Error> {
|
||||
struct Visitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for Visitor {
|
||||
type Value = u64;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
formatter.write_str("positive nsigned number of bytes or positive number of bytes with SI/IEC suffix in a string")
|
||||
}
|
||||
|
||||
fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
if v == 0 {
|
||||
Err(serde::de::Error::invalid_value(
|
||||
serde::de::Unexpected::Unsigned(v),
|
||||
&self,
|
||||
))
|
||||
} else {
|
||||
Ok(v)
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> std::result::Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
use std::str::FromStr;
|
||||
let bytes = ubyte::ByteUnit::from_str(v).map_err(serde::de::Error::custom)?;
|
||||
let bytes = u64::from(bytes);
|
||||
self.visit_u64(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
des.deserialize_any(Visitor)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
struct Usage {
|
||||
// remains unchanged after instantiation of the struct
|
||||
config: Config,
|
||||
// updated by `add_available_bytes`
|
||||
#[serde(serialize_with = "serialize_bytes")]
|
||||
freed_bytes: u64,
|
||||
}
|
||||
|
||||
impl crate::disk_usage_eviction_task::Usage for Usage {
|
||||
fn has_pressure(&self) -> bool {
|
||||
self.config.evict_bytes > self.freed_bytes
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
self.freed_bytes += bytes;
|
||||
}
|
||||
}
|
||||
|
||||
let config = json_request::<Config>(&mut r)
|
||||
.await
|
||||
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
|
||||
|
||||
let usage = Usage {
|
||||
config,
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)))
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let child_cancel = cancel.clone();
|
||||
let _g = cancel.drop_guard();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
"ondemand disk usage eviction",
|
||||
false,
|
||||
async move {
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&storage,
|
||||
usage,
|
||||
&child_cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
||||
|
||||
let _ = tx.send(res);
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@ pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod repository;
|
||||
pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod trace;
|
||||
|
||||
@@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration {
|
||||
}
|
||||
|
||||
pub fn observe(&self, observed_value: Duration) {
|
||||
if observed_value < self.threshold {
|
||||
if self.threshold < observed_value {
|
||||
self.counter
|
||||
.as_ref()
|
||||
.expect("nobody calls this function after `remove_from_vec`")
|
||||
@@ -586,6 +586,7 @@ pub struct TimelineMetrics {
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
pub init_logical_size_histo: StorageTimeMetrics,
|
||||
pub logical_size_histo: StorageTimeMetrics,
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
@@ -618,6 +619,8 @@ impl TimelineMetrics {
|
||||
let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
|
||||
let create_images_time_histo =
|
||||
StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
|
||||
let init_logical_size_histo =
|
||||
StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id);
|
||||
let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
|
||||
let load_layer_map_histo =
|
||||
StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
|
||||
@@ -654,6 +657,7 @@ impl TimelineMetrics {
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
init_logical_size_histo,
|
||||
logical_size_histo,
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
|
||||
@@ -27,7 +27,6 @@ use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::pin::pin;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@@ -467,7 +466,8 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
let copyin_reader = StreamReader::new(copyin_stream(pgb));
|
||||
tokio::pin!(copyin_reader);
|
||||
timeline
|
||||
.import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
|
||||
.await?;
|
||||
@@ -512,7 +512,8 @@ impl PageServerHandler {
|
||||
info!("importing wal");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
let copyin_reader = StreamReader::new(copyin_stream(pgb));
|
||||
tokio::pin!(copyin_reader);
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
|
||||
@@ -1,150 +0,0 @@
|
||||
//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
pub enum Statvfs {
|
||||
Real(nix::sys::statvfs::Statvfs),
|
||||
Mock(mock::Statvfs),
|
||||
}
|
||||
|
||||
// NB: on macOS, the block count type of struct statvfs is u32.
|
||||
// The workaround seems to be to use the non-standard statfs64 call.
|
||||
// Sincce it should only be a problem on > 2TiB disks, let's ignore
|
||||
// the problem for now and upcast to u64.
|
||||
impl Statvfs {
|
||||
pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
|
||||
if let Some(mocked) = mocked {
|
||||
Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
|
||||
} else {
|
||||
Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
|
||||
}
|
||||
}
|
||||
|
||||
// NB: allow() because the block count type is u32 on macOS.
|
||||
#[allow(clippy::useless_conversion)]
|
||||
pub fn blocks(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
|
||||
Statvfs::Mock(stat) => stat.blocks,
|
||||
}
|
||||
}
|
||||
|
||||
// NB: allow() because the block count type is u32 on macOS.
|
||||
#[allow(clippy::useless_conversion)]
|
||||
pub fn blocks_available(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
|
||||
Statvfs::Mock(stat) => stat.blocks_available,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn fragment_size(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => stat.fragment_size(),
|
||||
Statvfs::Mock(stat) => stat.fragment_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn block_size(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => stat.block_size(),
|
||||
Statvfs::Mock(stat) => stat.block_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod mock {
|
||||
use anyhow::Context;
|
||||
use regex::Regex;
|
||||
use std::path::Path;
|
||||
use tracing::log::info;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum Behavior {
|
||||
Success {
|
||||
blocksize: u64,
|
||||
total_blocks: u64,
|
||||
name_filter: Option<utils::serde_regex::Regex>,
|
||||
},
|
||||
Failure {
|
||||
mocked_error: MockedError,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[allow(clippy::upper_case_acronyms)]
|
||||
pub enum MockedError {
|
||||
EIO,
|
||||
}
|
||||
|
||||
impl From<MockedError> for nix::Error {
|
||||
fn from(e: MockedError) -> Self {
|
||||
match e {
|
||||
MockedError::EIO => nix::Error::EIO,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
|
||||
info!("running mocked statvfs");
|
||||
|
||||
match behavior {
|
||||
Behavior::Success {
|
||||
blocksize,
|
||||
total_blocks,
|
||||
ref name_filter,
|
||||
} => {
|
||||
let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
|
||||
|
||||
// round it up to the nearest block multiple
|
||||
let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
|
||||
|
||||
if used_blocks > *total_blocks {
|
||||
panic!(
|
||||
"mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}"
|
||||
);
|
||||
}
|
||||
|
||||
let avail_blocks = total_blocks - used_blocks;
|
||||
|
||||
Ok(Statvfs {
|
||||
blocks: *total_blocks,
|
||||
blocks_available: avail_blocks,
|
||||
fragment_size: *blocksize,
|
||||
block_size: *blocksize,
|
||||
})
|
||||
}
|
||||
Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
|
||||
}
|
||||
}
|
||||
|
||||
fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
|
||||
let mut total = 0;
|
||||
for entry in walkdir::WalkDir::new(path) {
|
||||
let entry = entry?;
|
||||
if !entry.file_type().is_file() {
|
||||
continue;
|
||||
}
|
||||
if !name_filter
|
||||
.as_ref()
|
||||
.map(|filter| filter.is_match(entry.file_name().to_str().unwrap()))
|
||||
.unwrap_or(true)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
total += entry
|
||||
.metadata()
|
||||
.with_context(|| format!("get metadata of {:?}", entry.path()))?
|
||||
.len();
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
pub struct Statvfs {
|
||||
pub blocks: u64,
|
||||
pub blocks_available: u64,
|
||||
pub fragment_size: u64,
|
||||
pub block_size: u64,
|
||||
}
|
||||
}
|
||||
@@ -484,25 +484,13 @@ pub async fn shutdown_tasks(
|
||||
for task in victim_tasks {
|
||||
let join_handle = {
|
||||
let mut task_mut = task.mutable.lock().unwrap();
|
||||
task_mut.join_handle.take()
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
let join_handle = task_mut.join_handle.take();
|
||||
drop(task_mut);
|
||||
join_handle
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
let completed = tokio::select! {
|
||||
_ = &mut join_handle => { true },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
false
|
||||
}
|
||||
};
|
||||
if !completed {
|
||||
// we never handled this return value, but:
|
||||
// - we don't deschedule which would lead to is_cancelled
|
||||
// - panics are already logged (is_panicked)
|
||||
// - task errors are already logged in the wrapper
|
||||
let _ = join_handle.await;
|
||||
}
|
||||
if let Some(join_handle) = join_handle {
|
||||
let _ = join_handle.await;
|
||||
} else {
|
||||
// Possibly one of:
|
||||
// * The task had not even fully started yet.
|
||||
|
||||
@@ -46,7 +46,6 @@ use std::time::{Duration, Instant};
|
||||
use self::config::TenantConf;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir;
|
||||
@@ -143,8 +142,6 @@ pub struct Tenant {
|
||||
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
}
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
@@ -434,16 +431,6 @@ remote:
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
#[error("HasChildren")]
|
||||
HasChildren,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
@@ -491,7 +478,7 @@ impl Tenant {
|
||||
|
||||
let dummy_timeline = self.create_timeline_data(
|
||||
timeline_id,
|
||||
up_to_date_metadata,
|
||||
up_to_date_metadata.clone(),
|
||||
ancestor.clone(),
|
||||
remote_client,
|
||||
)?;
|
||||
@@ -516,7 +503,7 @@ impl Tenant {
|
||||
let broken_timeline = self
|
||||
.create_timeline_data(
|
||||
timeline_id,
|
||||
up_to_date_metadata,
|
||||
up_to_date_metadata.clone(),
|
||||
ancestor.clone(),
|
||||
None,
|
||||
)
|
||||
@@ -1155,7 +1142,7 @@ impl Tenant {
|
||||
);
|
||||
self.prepare_timeline(
|
||||
new_timeline_id,
|
||||
&new_metadata,
|
||||
new_metadata,
|
||||
timeline_uninit_mark,
|
||||
true,
|
||||
None,
|
||||
@@ -1320,7 +1307,7 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
let timeline = {
|
||||
@@ -1332,13 +1319,13 @@ impl Tenant {
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
|
||||
if children_exist {
|
||||
return Err(DeleteTimelineError::HasChildren);
|
||||
}
|
||||
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
let timeline = Arc::clone(timeline_entry.get());
|
||||
@@ -1720,7 +1707,7 @@ impl Tenant {
|
||||
fn create_timeline_data(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
new_metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
@@ -1791,7 +1778,6 @@ impl Tenant {
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2181,25 +2167,13 @@ impl Tenant {
|
||||
let new_timeline = self
|
||||
.prepare_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
metadata,
|
||||
timeline_uninit_mark,
|
||||
false,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)?
|
||||
.initialize_with_lock(&mut timelines, true, true)?;
|
||||
drop(timelines);
|
||||
|
||||
// Root timeline gets its layers during creation and uploads them along with the metadata.
|
||||
// A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
|
||||
// We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
|
||||
// could get incorrect information and remove more layers, than needed.
|
||||
// See also https://github.com/neondatabase/neon/issues/3865
|
||||
if let Some(remote_client) = new_timeline.remote_client.as_ref() {
|
||||
remote_client
|
||||
.schedule_index_upload_for_metadata_update(&metadata)
|
||||
.context("branch initial metadata upload")?;
|
||||
}
|
||||
|
||||
info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
|
||||
|
||||
Ok(new_timeline)
|
||||
@@ -2262,7 +2236,7 @@ impl Tenant {
|
||||
pg_version,
|
||||
);
|
||||
let raw_timeline =
|
||||
self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?;
|
||||
self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?;
|
||||
|
||||
let tenant_id = raw_timeline.owning_tenant.tenant_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
@@ -2316,7 +2290,7 @@ impl Tenant {
|
||||
fn prepare_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
new_metadata: TimelineMetadata,
|
||||
uninit_mark: TimelineUninitMark,
|
||||
init_layers: bool,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
@@ -2330,7 +2304,7 @@ impl Tenant {
|
||||
tenant_id,
|
||||
new_timeline_id,
|
||||
);
|
||||
remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
|
||||
remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
|
||||
Some(remote_client)
|
||||
} else {
|
||||
None
|
||||
@@ -2369,12 +2343,17 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_path: &Path,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
new_metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_data = self
|
||||
.create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client)
|
||||
.create_timeline_data(
|
||||
new_timeline_id,
|
||||
new_metadata.clone(),
|
||||
ancestor,
|
||||
remote_client,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
@@ -2386,7 +2365,7 @@ impl Tenant {
|
||||
self.conf,
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
new_metadata,
|
||||
&new_metadata,
|
||||
true,
|
||||
)
|
||||
.context("Failed to create timeline metadata")?;
|
||||
|
||||
@@ -315,26 +315,21 @@ pub async fn get_tenant(
|
||||
.get(&tenant_id)
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
tracing::warn!(
|
||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
||||
tenant.current_state()
|
||||
);
|
||||
Err(TenantStateError::NotActive(tenant_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
Tenant(#[from] TenantStateError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] crate::tenant::DeleteTimelineError),
|
||||
}
|
||||
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
) -> Result<(), TenantStateError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
tenant.delete_timeline(timeline_id, ctx).await?;
|
||||
Ok(())
|
||||
@@ -355,35 +350,17 @@ pub enum TenantStateError {
|
||||
pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} removal")
|
||||
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
|
||||
})?;
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let removal_result =
|
||||
remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
info!("Detaching an ignored tenant");
|
||||
local_files_cleanup_operation(tenant_id)
|
||||
.await
|
||||
.with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
removal_result
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn load_tenant(
|
||||
|
||||
@@ -210,6 +210,7 @@ pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
@@ -346,7 +347,7 @@ impl RemoteTimelineClient {
|
||||
.layer_metadata
|
||||
.values()
|
||||
// If we don't have the file size for the layer, don't account for it in the metric.
|
||||
.map(|ilmd| ilmd.file_size)
|
||||
.map(|ilmd| ilmd.file_size.unwrap_or(0))
|
||||
.sum()
|
||||
} else {
|
||||
0
|
||||
@@ -419,6 +420,34 @@ impl RemoteTimelineClient {
|
||||
.await?
|
||||
};
|
||||
|
||||
// Update the metadata for given layer file. The remote index file
|
||||
// might be missing some information for the file; this allows us
|
||||
// to fill in the missing details.
|
||||
if layer_metadata.file_size().is_none() {
|
||||
let new_metadata = LayerFileMetadata::new(downloaded_size);
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
|
||||
if upgraded.merge(&new_metadata) {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
}
|
||||
// If we don't do an index file upload inbetween here and restart,
|
||||
// the value will go back down after pageserver restart, since we will
|
||||
// have lost this data point.
|
||||
// But, we upload index part fairly frequently, and restart pageserver rarely.
|
||||
// So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
|
||||
self.metrics
|
||||
.remote_physical_size_gauge()
|
||||
.add(downloaded_size);
|
||||
} else {
|
||||
// The file should exist, since we just downloaded it.
|
||||
warn!(
|
||||
"downloaded file {:?} not found in local copy of the index file",
|
||||
layer_file_name
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
|
||||
REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);
|
||||
|
||||
@@ -521,6 +550,13 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// The file size can be missing for files that were created before we tracked that
|
||||
// in the metadata, but it should be present for any new files we create.
|
||||
ensure!(
|
||||
layer_metadata.file_size().is_some(),
|
||||
"file size not initialized in metadata"
|
||||
);
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(layer_file_name.clone(), layer_metadata.clone());
|
||||
|
||||
@@ -21,7 +21,7 @@ use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
|
||||
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
@@ -113,11 +113,16 @@ pub async fn download_layer_file<'a>(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
|
||||
)));
|
||||
match layer_metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
)));
|
||||
}
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
@@ -256,12 +261,14 @@ pub(super) async fn download_index_part(
|
||||
)
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part = index_part.remove_unclean_layer_file_names();
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -19,7 +20,7 @@ use utils::lsn::Lsn;
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[cfg_attr(test, derive(Default))]
|
||||
pub struct LayerFileMetadata {
|
||||
file_size: u64,
|
||||
file_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
@@ -32,16 +33,36 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
|
||||
|
||||
impl LayerFileMetadata {
|
||||
pub fn new(file_size: u64) -> Self {
|
||||
LayerFileMetadata { file_size }
|
||||
LayerFileMetadata {
|
||||
file_size: Some(file_size),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn file_size(&self) -> u64 {
|
||||
/// This is used to initialize the metadata for remote layers, for which
|
||||
/// the metadata was missing from the index part file.
|
||||
pub const MISSING: Self = LayerFileMetadata { file_size: None };
|
||||
|
||||
pub fn file_size(&self) -> Option<u64> {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
|
||||
/// other value.
|
||||
///
|
||||
/// This is called on the possibly outdated version. Returns true if any changes
|
||||
/// were made.
|
||||
pub fn merge(&mut self, other: &Self) -> bool {
|
||||
let mut changed = false;
|
||||
|
||||
if self.file_size != other.file_size {
|
||||
self.file_size = other.file_size.or(self.file_size);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
changed
|
||||
}
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
/// In-memory representation of an `index_part.json` file
|
||||
///
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
@@ -50,7 +71,10 @@ impl LayerFileMetadata {
|
||||
/// remember to add a test case for the changed version.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
pub struct IndexPartImpl<L>
|
||||
where
|
||||
L: std::hash::Hash + PartialEq + Eq,
|
||||
{
|
||||
/// Debugging aid describing the version of this type.
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
@@ -58,13 +82,14 @@ pub struct IndexPart {
|
||||
/// Layer names, which are stored on the remote storage.
|
||||
///
|
||||
/// Additional metadata can might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<LayerFileName>,
|
||||
pub timeline_layers: HashSet<L>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
|
||||
#[serde(default = "HashMap::default")]
|
||||
pub layer_metadata: HashMap<L, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
|
||||
@@ -73,6 +98,101 @@ pub struct IndexPart {
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
pub type IndexPart = IndexPartImpl<LayerFileName>;
|
||||
|
||||
pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum UncleanLayerFileName {
|
||||
Clean(LayerFileName),
|
||||
BackupFile(String),
|
||||
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_string(UncleanLayerFileNameVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
struct UncleanLayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
|
||||
type Value = UncleanLayerFileName;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(
|
||||
formatter,
|
||||
"a string that is a valid LayerFileName or '.old' backup file name"
|
||||
)
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
let maybe_clean: Result<LayerFileName, _> = v.parse();
|
||||
match maybe_clean {
|
||||
Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
|
||||
Err(e) => {
|
||||
if v.ends_with(".old") || v == "metadata_backup" {
|
||||
Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
|
||||
} else {
|
||||
Err(E::custom(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UncleanLayerFileName {
|
||||
fn into_clean(self) -> Option<LayerFileName> {
|
||||
match self {
|
||||
UncleanLayerFileName::Clean(clean) => Some(clean),
|
||||
UncleanLayerFileName::BackupFile(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPartUnclean {
|
||||
pub fn remove_unclean_layer_file_names(self) -> IndexPart {
|
||||
let IndexPartUnclean {
|
||||
version,
|
||||
timeline_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
} = self;
|
||||
|
||||
IndexPart {
|
||||
version,
|
||||
timeline_layers: timeline_layers
|
||||
.into_iter()
|
||||
.filter_map(|unclean_file_name| match unclean_file_name {
|
||||
UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
|
||||
UncleanLayerFileName::BackupFile(backup_file_name) => {
|
||||
// For details see https://github.com/neondatabase/neon/issues/3024
|
||||
warn!(
|
||||
"got backup file on the remote storage, ignoring it {backup_file_name}"
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
layer_metadata: layer_metadata
|
||||
.into_iter()
|
||||
.filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
|
||||
.collect(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
@@ -112,7 +232,7 @@ impl IndexPart {
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct IndexLayerMetadata {
|
||||
pub(super) file_size: u64,
|
||||
pub(super) file_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
@@ -127,6 +247,27 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn v0_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
@@ -146,19 +287,21 @@ mod tests {
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example)
|
||||
.unwrap()
|
||||
.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
@@ -182,64 +325,20 @@ mod tests {
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_layers_are_parsed() {
|
||||
let empty_layers_json = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":[],
|
||||
"layer_metadata":{},
|
||||
"disk_consistent_lsn":"0/2532648",
|
||||
"metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 1,
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::new(),
|
||||
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [
|
||||
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
|
||||
38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
|
||||
210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
|
||||
240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,
|
||||
]
|
||||
.to_vec(),
|
||||
};
|
||||
|
||||
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
|
||||
|
||||
assert_eq!(empty_layers_parsed, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,9 +64,13 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
})?
|
||||
.len();
|
||||
|
||||
let metadata_size = known_metadata.file_size();
|
||||
if metadata_size != fs_size {
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
// FIXME: this looks bad
|
||||
if let Some(metadata_size) = known_metadata.file_size() {
|
||||
if metadata_size != fs_size {
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
}
|
||||
} else {
|
||||
// this is a silly state we would like to avoid
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size).with_context(|| {
|
||||
|
||||
@@ -6,7 +6,6 @@ use std::sync::Arc;
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::sync::oneshot::error::RecvError;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
@@ -353,10 +352,6 @@ async fn fill_logical_sizes(
|
||||
// our advantage with `?` error handling.
|
||||
let mut joinset = tokio::task::JoinSet::new();
|
||||
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
// be sure to cancel all spawned tasks if we are dropped
|
||||
let _dg = cancel.clone().drop_guard();
|
||||
|
||||
// For each point that would benefit from having a logical size available,
|
||||
// spawn a Task to fetch it, unless we have it cached already.
|
||||
for seg in segments.iter() {
|
||||
@@ -378,7 +373,6 @@ async fn fill_logical_sizes(
|
||||
timeline,
|
||||
lsn,
|
||||
ctx,
|
||||
cancel.child_token(),
|
||||
));
|
||||
}
|
||||
e.insert(cached_size);
|
||||
@@ -483,14 +477,13 @@ async fn calculate_logical_size(
|
||||
timeline: Arc<crate::tenant::Timeline>,
|
||||
lsn: utils::lsn::Lsn,
|
||||
ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
||||
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
.await
|
||||
.expect("global semaphore should not had been closed");
|
||||
|
||||
let size_res = timeline
|
||||
.spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
|
||||
.spawn_ondemand_logical_size_calculation(lsn, ctx)
|
||||
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
|
||||
.await?;
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||
|
||||
@@ -385,7 +385,7 @@ pub trait PersistentLayer: Layer {
|
||||
///
|
||||
/// Should not change over the lifetime of the layer object because
|
||||
/// current_physical_size is computed as the som of this value.
|
||||
fn file_size(&self) -> u64;
|
||||
fn file_size(&self) -> Option<u64>;
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
||||
|
||||
|
||||
@@ -444,8 +444,8 @@ impl PersistentLayer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
Some(self.file_size)
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
@@ -456,7 +456,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.file_size,
|
||||
layer_file_size: Some(self.file_size),
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: false,
|
||||
|
||||
@@ -258,15 +258,6 @@ impl serde::Serialize for LayerFileName {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for LayerFileName {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_string(LayerFileNameVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
struct LayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
|
||||
|
||||
@@ -258,8 +258,8 @@ impl PersistentLayer for ImageLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
Some(self.file_size)
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
@@ -268,7 +268,7 @@ impl PersistentLayer for ImageLayer {
|
||||
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name,
|
||||
layer_file_size: self.file_size,
|
||||
layer_file_size: Some(self.file_size),
|
||||
lsn_start: lsn_range.start,
|
||||
remote: false,
|
||||
access_stats: self.access_stats.as_api_model(reset),
|
||||
|
||||
@@ -167,7 +167,7 @@ impl PersistentLayer for RemoteLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
self.layer_metadata.file_size()
|
||||
}
|
||||
|
||||
|
||||
@@ -244,12 +244,14 @@ pub(crate) async fn random_init_delay(
|
||||
) -> Result<(), Cancelled> {
|
||||
use rand::Rng;
|
||||
|
||||
if period == Duration::ZERO {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let d = {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// gen_range asserts that the range cannot be empty, which it could be because period can
|
||||
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
|
||||
let period = std::cmp::max(period, Duration::from_secs(10));
|
||||
|
||||
// semi-ok default as the source of jitter
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
|
||||
@@ -25,7 +25,6 @@ use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
@@ -57,6 +56,7 @@ use pageserver_api::reltag::RelTag;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use utils::{
|
||||
approx_accurate::ApproxAccurate,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{AtomicLsn, Lsn, RecordLsn},
|
||||
seqwait::SeqWait,
|
||||
@@ -73,9 +73,6 @@ use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
use walreceiver::spawn_connection_manager_task;
|
||||
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
|
||||
use super::layer_map::BatchedUpdates;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
@@ -221,8 +218,6 @@ pub struct Timeline {
|
||||
download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
|
||||
|
||||
state: watch::Sender<TimelineState>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
}
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
@@ -335,12 +330,27 @@ impl LogicalSize {
|
||||
.fetch_add(delta, AtomicOrdering::SeqCst);
|
||||
}
|
||||
|
||||
/// Make the value computed by initial logical size computation
|
||||
/// available for re-use. This doesn't contain the incremental part.
|
||||
fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
|
||||
match self.initial_part_end {
|
||||
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
|
||||
_ => None,
|
||||
/// Returns the initialized (already calculated) value, if any.
|
||||
fn initialized_size(&self) -> Option<u64> {
|
||||
self.initial_logical_size.get().copied()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returned by [`Timeline::layer_size_sum`]
|
||||
pub enum LayerSizeSum {
|
||||
/// The result is accurate.
|
||||
Accurate(u64),
|
||||
// We don't know the layer file size of one or more layers.
|
||||
// They contribute to the sum with a value of 0.
|
||||
// Hence, the sum is a lower bound for the actualy layer file size sum.
|
||||
ApproximateLowerBound(u64),
|
||||
}
|
||||
|
||||
impl LayerSizeSum {
|
||||
pub fn approximate_is_ok(self) -> u64 {
|
||||
match self {
|
||||
LayerSizeSum::Accurate(v) => v,
|
||||
LayerSizeSum::ApproximateLowerBound(v) => v,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -542,13 +552,20 @@ impl Timeline {
|
||||
/// The sum of the file size of all historic layers in the layer map.
|
||||
/// This method makes no distinction between local and remote layers.
|
||||
/// Hence, the result **does not represent local filesystem usage**.
|
||||
pub fn layer_size_sum(&self) -> u64 {
|
||||
pub fn layer_size_sum(&self) -> LayerSizeSum {
|
||||
let layer_map = self.layers.read().unwrap();
|
||||
let mut size = 0;
|
||||
let mut no_size_cnt = 0;
|
||||
for l in layer_map.iter_historic_layers() {
|
||||
size += l.file_size();
|
||||
let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1));
|
||||
size += l_size;
|
||||
no_size_cnt += l_no_size;
|
||||
}
|
||||
if no_size_cnt == 0 {
|
||||
LayerSizeSum::Accurate(size)
|
||||
} else {
|
||||
LayerSizeSum::ApproximateLowerBound(size)
|
||||
}
|
||||
size
|
||||
}
|
||||
|
||||
pub fn get_resident_physical_size(&self) -> u64 {
|
||||
@@ -679,7 +696,8 @@ impl Timeline {
|
||||
|
||||
let mut failed = 0;
|
||||
|
||||
let mut cancelled = pin!(task_mgr::shutdown_watcher());
|
||||
let cancelled = task_mgr::shutdown_watcher();
|
||||
tokio::pin!(cancelled);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -816,11 +834,11 @@ impl Timeline {
|
||||
|
||||
let mut is_exact = true;
|
||||
let size = current_size.size();
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
|
||||
(current_size, self.current_logical_size.initial_part_end)
|
||||
{
|
||||
is_exact = false;
|
||||
self.try_spawn_size_init_task(initial_part_end, ctx);
|
||||
self.try_spawn_size_init_task(init_lsn, ctx);
|
||||
}
|
||||
|
||||
Ok((size, is_exact))
|
||||
@@ -844,7 +862,7 @@ impl Timeline {
|
||||
// S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
|
||||
// we want to stay below that with a big margin. The LSN distance determines how
|
||||
// much WAL the safekeepers need to store.
|
||||
if distance >= self.get_checkpoint_distance().into()
|
||||
if distance >= i128::from(self.get_checkpoint_distance())
|
||||
|| open_layer_size > self.get_checkpoint_distance()
|
||||
|| (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
|
||||
{
|
||||
@@ -1056,12 +1074,14 @@ impl Timeline {
|
||||
use super::layer_map::Replacement;
|
||||
|
||||
if local_layer.is_remote_layer() {
|
||||
// TODO(issue #3851): consider returning an err here instead of false,
|
||||
// which is the same out the match later
|
||||
// TODO: consider returning an err here instead of false, which is the same out the
|
||||
// match later
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let layer_file_size = local_layer.file_size();
|
||||
let layer_file_size = local_layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
|
||||
let local_layer_mtime = local_layer
|
||||
.local_path()
|
||||
@@ -1127,9 +1147,6 @@ impl Timeline {
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.observe(delta);
|
||||
info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
|
||||
} else {
|
||||
info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
|
||||
}
|
||||
|
||||
true
|
||||
@@ -1151,6 +1168,8 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: update metrics for how
|
||||
|
||||
Ok(replaced)
|
||||
}
|
||||
}
|
||||
@@ -1206,7 +1225,7 @@ impl Timeline {
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
metadata: &TimelineMetadata,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -1291,10 +1310,6 @@ impl Timeline {
|
||||
download_all_remote_layers_task_info: RwLock::new(None),
|
||||
|
||||
state,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
};
|
||||
result.repartition_threshold = result.get_checkpoint_distance() / 10;
|
||||
result
|
||||
@@ -1533,12 +1548,7 @@ impl Timeline {
|
||||
.layer_metadata
|
||||
.get(remote_layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
remote_layer_name.file_name()
|
||||
)
|
||||
})?;
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
|
||||
// Is the local layer's size different from the size stored in the
|
||||
// remote index file?
|
||||
@@ -1554,27 +1564,34 @@ impl Timeline {
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let remote_size = remote_layer_metadata.file_size();
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
if let Some(remote_size) = remote_layer_metadata.file_size() {
|
||||
let metadata = local_layer_path.metadata().with_context(|| {
|
||||
format!(
|
||||
"get file size of local layer {}",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_size = metadata.len();
|
||||
if local_size != remote_size {
|
||||
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
|
||||
if let Err(err) = rename_to_backup(&local_layer_path) {
|
||||
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
updates.remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
updates.remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
debug!(
|
||||
"layer is present locally and file size matches remote, using it: {}",
|
||||
"layer is present locally and remote does not have file size, using it: {}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
continue;
|
||||
@@ -1676,8 +1693,6 @@ impl Timeline {
|
||||
.map(|l| (l.filename(), l))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
// If no writes happen, new branches do not have any layers, only the metadata file.
|
||||
let has_local_layers = !local_layers.is_empty();
|
||||
let local_only_layers = match index_part {
|
||||
Some(index_part) => {
|
||||
info!(
|
||||
@@ -1695,47 +1710,28 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
if has_local_layers {
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them.
|
||||
// Local timeline metadata will get uploaded to remove along witht he layers.
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
// XXX solve this in the type system
|
||||
let layer_path = layer
|
||||
.local_path()
|
||||
.expect("local_only_layers only contains local layers");
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
} else if index_part.is_none() {
|
||||
// No data on the remote storage, no local layers, local metadata file.
|
||||
//
|
||||
// TODO https://github.com/neondatabase/neon/issues/3865
|
||||
// Currently, console does not wait for the timeline data upload to the remote storage
|
||||
// and considers the timeline created, expecting other pageserver nodes to work with it.
|
||||
// Branch metadata upload could get interrupted (e.g pageserver got killed),
|
||||
// hence any locally existing branch metadata with no remote counterpart should be uploaded,
|
||||
// otherwise any other pageserver won't see the branch on `attach`.
|
||||
//
|
||||
// After the issue gets implemented, pageserver should rather remove the branch,
|
||||
// since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
|
||||
// no need to keep the old files.
|
||||
remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
} else {
|
||||
// Local timeline has a metadata file, remote one too, both have no layers to sync.
|
||||
// Are there local files that don't exist remotely? Schedule uploads for them
|
||||
for (layer_name, layer) in &local_only_layers {
|
||||
// XXX solve this in the type system
|
||||
let layer_path = layer
|
||||
.local_path()
|
||||
.expect("local_only_layers only contains local layers");
|
||||
let layer_size = layer_path
|
||||
.metadata()
|
||||
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
|
||||
.len();
|
||||
info!("scheduling {layer_path:?} for upload");
|
||||
remote_client
|
||||
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
|
||||
info!("Done");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn, ctx: &RequestContext) {
|
||||
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
|
||||
.try_acquire_owned()
|
||||
{
|
||||
@@ -1772,11 +1768,8 @@ impl Timeline {
|
||||
false,
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
// no cancellation here, because nothing really waits for this to complete compared
|
||||
// to spawn_ondemand_logical_size_calculation.
|
||||
let cancel = CancellationToken::new();
|
||||
let calculated_size = match self_clone
|
||||
.logical_size_calculation_task(lsn, &background_ctx, cancel)
|
||||
.logical_size_calculation_task(init_lsn, &background_ctx)
|
||||
.await
|
||||
{
|
||||
Ok(s) => s,
|
||||
@@ -1831,7 +1824,6 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -1851,9 +1843,7 @@ impl Timeline {
|
||||
"ondemand logical size calculation",
|
||||
false,
|
||||
async move {
|
||||
let res = self_clone
|
||||
.logical_size_calculation_task(lsn, &ctx, cancel)
|
||||
.await;
|
||||
let res = self_clone.logical_size_calculation_task(lsn, &ctx).await;
|
||||
let _ = sender.send(res).ok();
|
||||
Ok(()) // Receiver is responsible for handling errors
|
||||
},
|
||||
@@ -1864,20 +1854,20 @@ impl Timeline {
|
||||
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
|
||||
async fn logical_size_calculation_task(
|
||||
self: &Arc<Self>,
|
||||
lsn: Lsn,
|
||||
init_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
let mut timeline_state_updates = self.subscribe_for_state_updates();
|
||||
let self_calculation = Arc::clone(self);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut calculation = pin!(async {
|
||||
let calculation = async {
|
||||
let cancel = cancel.child_token();
|
||||
let ctx = ctx.attached_child();
|
||||
self_calculation
|
||||
.calculate_logical_size(lsn, cancel, &ctx)
|
||||
.calculate_logical_size(init_lsn, cancel, &ctx)
|
||||
.await
|
||||
});
|
||||
};
|
||||
let timeline_state_cancellation = async {
|
||||
loop {
|
||||
match timeline_state_updates.changed().await {
|
||||
@@ -1906,6 +1896,7 @@ impl Timeline {
|
||||
"aborted because task_mgr shutdown requested".to_string()
|
||||
};
|
||||
|
||||
tokio::pin!(calculation);
|
||||
loop {
|
||||
tokio::select! {
|
||||
res = &mut calculation => { return res }
|
||||
@@ -1958,12 +1949,21 @@ impl Timeline {
|
||||
// need to return something
|
||||
Ok(0)
|
||||
});
|
||||
// See if we've already done the work for initial size calculation.
|
||||
// This is a short-cut for timelines that are mostly unused.
|
||||
if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
|
||||
return Ok(size);
|
||||
}
|
||||
let timer = self.metrics.logical_size_histo.start_timer();
|
||||
let timer = if up_to_lsn == self.initdb_lsn {
|
||||
if let Some(size) = self.current_logical_size.initialized_size() {
|
||||
if size != 0 {
|
||||
// non-zero size means that the size has already been calculated by this method
|
||||
// after startup. if the logical size is for a new timeline without layers the
|
||||
// size will be zero, and we cannot use that, or this caching strategy until
|
||||
// pageserver restart.
|
||||
return Ok(size);
|
||||
}
|
||||
}
|
||||
|
||||
self.metrics.init_logical_size_histo.start_timer()
|
||||
} else {
|
||||
self.metrics.logical_size_histo.start_timer()
|
||||
};
|
||||
let logical_size = self
|
||||
.get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
|
||||
.await?;
|
||||
@@ -2018,7 +2018,9 @@ impl Timeline {
|
||||
) -> anyhow::Result<()> {
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
let layer_file_size = layer.file_size();
|
||||
let layer_file_size = layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
@@ -4048,7 +4050,7 @@ impl Timeline {
|
||||
|
||||
pub struct DiskUsageEvictionInfo {
|
||||
/// Timeline's largest layer (remote or resident)
|
||||
pub max_layer_size: Option<u64>,
|
||||
pub max_layer_size: ApproxAccurate<u64>,
|
||||
/// Timeline's resident layers
|
||||
pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
|
||||
}
|
||||
@@ -4073,7 +4075,9 @@ impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
|
||||
|
||||
impl LocalLayerInfoForDiskUsageEviction {
|
||||
pub fn file_size(&self) -> u64 {
|
||||
self.layer.file_size()
|
||||
self.layer
|
||||
.file_size()
|
||||
.expect("we know this is a local layer")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4081,12 +4085,11 @@ impl Timeline {
|
||||
pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let layers = self.layers.read().unwrap();
|
||||
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
let mut max_layer_size = ApproxAccurate::default();
|
||||
let mut resident_layers = Vec::new();
|
||||
|
||||
for l in layers.iter_historic_layers() {
|
||||
let file_size = l.file_size();
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
max_layer_size = max_layer_size.max(l.file_size());
|
||||
|
||||
if l.is_remote_layer() {
|
||||
continue;
|
||||
|
||||
@@ -1,20 +1,6 @@
|
||||
//! The per-timeline layer eviction task, which evicts data which has not been accessed for more
|
||||
//! than a given threshold.
|
||||
//!
|
||||
//! Data includes all kinds of caches, namely:
|
||||
//! - (in-memory layers)
|
||||
//! - on-demand downloaded layer files on disk
|
||||
//! - (cached layer file pages)
|
||||
//! - derived data from layer file contents, namely:
|
||||
//! - initial logical size
|
||||
//! - partitioning
|
||||
//! - (other currently missing unknowns)
|
||||
//!
|
||||
//! Items with parentheses are not (yet) touched by this task.
|
||||
//!
|
||||
//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
|
||||
//! The per-timeline layer eviction task.
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -25,27 +11,15 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTimelineState {
|
||||
last_layer_access_imitation: Option<tokio::time::Instant>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTenantState {
|
||||
last_layer_access_imitation: Option<Instant>,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -79,10 +53,9 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;
|
||||
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
|
||||
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
@@ -103,8 +76,7 @@ impl Timeline {
|
||||
async fn eviction_iteration(
|
||||
self: &Arc<Self>,
|
||||
policy: &EvictionPolicy,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> ControlFlow<(), Instant> {
|
||||
debug!("eviction iteration: {policy:?}");
|
||||
match policy {
|
||||
@@ -114,7 +86,7 @@ impl Timeline {
|
||||
}
|
||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
||||
let start = Instant::now();
|
||||
match self.eviction_iteration_threshold(p, cancel, ctx).await {
|
||||
match self.eviction_iteration_threshold(p, cancel).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
@@ -128,40 +100,10 @@ impl Timeline {
|
||||
async fn eviction_iteration_threshold(
|
||||
self: &Arc<Self>,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> ControlFlow<()> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
// If we evict layers but keep cached values derived from those layers, then
|
||||
// we face a storm of on-demand downloads after pageserver restart.
|
||||
// The reason is that the restart empties the caches, and so, the values
|
||||
// need to be re-computed by accessing layers, which we evicted while the
|
||||
// caches were filled.
|
||||
//
|
||||
// Solutions here would be one of the following:
|
||||
// 1. Have a persistent cache.
|
||||
// 2. Count every access to a cached value to the access stats of all layers
|
||||
// that were accessed to compute the value in the first place.
|
||||
// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
|
||||
// get re-computed from layers, thereby counting towards layer access stats.
|
||||
// 4. Make the eviction task imitate the layer accesses that typically hit caches.
|
||||
//
|
||||
// We follow approach (4) here because in Neon prod deployment:
|
||||
// - page cache is quite small => high churn => low hit rate
|
||||
// => eviction gets correct access stats
|
||||
// - value-level caches such as logical size & repatition have a high hit rate,
|
||||
// especially for inactive tenants
|
||||
// => eviction sees zero accesses for these
|
||||
// => they cause the on-demand download storm on pageserver restart
|
||||
//
|
||||
// We should probably move to persistent caches in the future, or avoid
|
||||
// having inactive tenants attached to pageserver in the first place.
|
||||
match self.imitate_layer_accesses(p, cancel, ctx).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
@@ -171,7 +113,6 @@ impl Timeline {
|
||||
not_evictable: usize,
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
|
||||
let mut stats = EvictionStats::default();
|
||||
// Gather layers for eviction.
|
||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
||||
@@ -226,7 +167,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let results = match self
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel.clone())
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel)
|
||||
.await
|
||||
{
|
||||
Err(pre_err) => {
|
||||
@@ -268,144 +209,4 @@ impl Timeline {
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
async fn imitate_layer_accesses(
|
||||
&self,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
// This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
|
||||
// likely, we're shutting down
|
||||
return ControlFlow::Break(());
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
/// Recompute the values which would cause on-demand downloads during restart.
|
||||
async fn imitate_timeline_cached_layer_accesses(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
let lsn = self.get_last_record_lsn();
|
||||
|
||||
// imitiate on-restart initial logical size
|
||||
let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;
|
||||
|
||||
match &size {
|
||||
Ok(_size) => {
|
||||
// good, don't log it to avoid confusion
|
||||
}
|
||||
Err(_) => {
|
||||
// we have known issues for which we already log this on consumption metrics,
|
||||
// gc, and compaction. leave logging out for now.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/2539
|
||||
}
|
||||
}
|
||||
|
||||
// imitiate repartiting on first compactation
|
||||
if let Err(e) = self.collect_keyspace(lsn, ctx).await {
|
||||
// if this failed, we probably failed logical size because these use the same keys
|
||||
if size.is_err() {
|
||||
// ignore, see above comment
|
||||
} else {
|
||||
warn!(
|
||||
"failed to collect keyspace but succeeded in calculating logical size: {e:#}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Imitate the synthetic size calculation done by the consumption_metrics module.
|
||||
async fn imitate_synthetic_size_calculation_worker(
|
||||
&self,
|
||||
tenant: &Arc<Tenant>,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
) {
|
||||
if self.conf.metric_collection_endpoint.is_none() {
|
||||
// We don't start the consumption metrics task if this is not set in the config.
|
||||
// So, no need to imitate the accesses in that case.
|
||||
return;
|
||||
}
|
||||
|
||||
// The consumption metrics are collected on a per-tenant basis, by a single
|
||||
// global background loop.
|
||||
// It limits the number of synthetic size calculations using the global
|
||||
// `concurrent_tenant_size_logical_size_queries` semaphore to not overload
|
||||
// the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs).
|
||||
//
|
||||
// If we used that same semaphore here, then we'd compete for the
|
||||
// same permits, which may impact timeliness of consumption metrics.
|
||||
// That is a no-go, as consumption metrics are much more important
|
||||
// than what we do here.
|
||||
//
|
||||
// So, we have a separate semaphore, initialized to the same
|
||||
// number of permits as the `concurrent_tenant_size_logical_size_queries`.
|
||||
// In the worst, we would have twice the amount of concurrenct size calculations.
|
||||
// But in practice, the `p.threshold` >> `consumption metric interval`, and
|
||||
// we spread out the eviction task using `random_init_delay`.
|
||||
// So, the chance of the worst case is quite low in practice.
|
||||
// It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
|
||||
// So, we must coordinate with other with other eviction tasks of this tenant.
|
||||
let limit = self
|
||||
.conf
|
||||
.eviction_task_immitated_concurrent_logical_size_queries
|
||||
.inner();
|
||||
|
||||
let mut throwaway_cache = HashMap::new();
|
||||
let gather =
|
||||
crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx);
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {}
|
||||
gather_result = gather => {
|
||||
match gather_result {
|
||||
Ok(_) => {},
|
||||
Err(e) => {
|
||||
// We don't care about the result, but, if it failed, we should log it,
|
||||
// since consumption metric might be hitting the cached value and
|
||||
// thus not encountering this error.
|
||||
warn!("failed to imitate synthetic size calculation accesses: {e:#}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -237,7 +237,11 @@ async fn connection_manager_loop_step(
|
||||
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
|
||||
info!("Switching to new connection candidate: {new_candidate:?}");
|
||||
walreceiver_state
|
||||
.change_connection(new_candidate, ctx)
|
||||
.change_connection(
|
||||
new_candidate.safekeeper_id,
|
||||
new_candidate.wal_source_connconf,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
@@ -342,8 +346,6 @@ struct WalConnection {
|
||||
started_at: NaiveDateTime,
|
||||
/// Current safekeeper pageserver is connected to for WAL streaming.
|
||||
sk_id: NodeId,
|
||||
/// Availability zone of the safekeeper.
|
||||
availability_zone: Option<String>,
|
||||
/// Status of the connection.
|
||||
status: WalConnectionStatus,
|
||||
/// WAL streaming task handle.
|
||||
@@ -403,7 +405,12 @@ impl WalreceiverState {
|
||||
}
|
||||
|
||||
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
|
||||
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
|
||||
async fn change_connection(
|
||||
&mut self,
|
||||
new_sk_id: NodeId,
|
||||
new_wal_source_connconf: PgConnectionConfig,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
self.drop_old_connection(true).await;
|
||||
|
||||
let id = self.id;
|
||||
@@ -417,7 +424,7 @@ impl WalreceiverState {
|
||||
async move {
|
||||
super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
new_sk.wal_source_connconf,
|
||||
new_wal_source_connconf,
|
||||
events_sender,
|
||||
cancellation,
|
||||
connect_timeout,
|
||||
@@ -426,16 +433,13 @@ impl WalreceiverState {
|
||||
.await
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(
|
||||
info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
|
||||
)
|
||||
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
||||
});
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
self.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: new_sk.safekeeper_id,
|
||||
availability_zone: new_sk.availability_zone,
|
||||
sk_id: new_sk_id,
|
||||
status: WalConnectionStatus {
|
||||
is_connected: false,
|
||||
has_processed_wal: false,
|
||||
@@ -542,7 +546,6 @@ impl WalreceiverState {
|
||||
/// * if connected safekeeper is not present, pick the candidate
|
||||
/// * if we haven't received any updates for some time, pick the candidate
|
||||
/// * if the candidate commit_lsn is much higher than the current one, pick the candidate
|
||||
/// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate
|
||||
/// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
|
||||
///
|
||||
/// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
|
||||
@@ -556,7 +559,6 @@ impl WalreceiverState {
|
||||
|
||||
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
|
||||
self.select_connection_candidate(Some(connected_sk_node))?;
|
||||
let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone();
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
if let Ok(latest_interaciton) =
|
||||
@@ -567,7 +569,6 @@ impl WalreceiverState {
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
availability_zone: new_availability_zone,
|
||||
reason: ReconnectReason::NoKeepAlives {
|
||||
last_keep_alive: Some(
|
||||
existing_wal_connection.status.latest_connection_update,
|
||||
@@ -593,7 +594,6 @@ impl WalreceiverState {
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
availability_zone: new_availability_zone,
|
||||
reason: ReconnectReason::LaggingWal {
|
||||
current_commit_lsn,
|
||||
new_commit_lsn,
|
||||
@@ -601,20 +601,6 @@ impl WalreceiverState {
|
||||
},
|
||||
});
|
||||
}
|
||||
// If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver,
|
||||
// and the current one is not, switch to the new one.
|
||||
if self.availability_zone.is_some()
|
||||
&& existing_wal_connection.availability_zone
|
||||
!= self.availability_zone
|
||||
&& self.availability_zone == new_availability_zone
|
||||
{
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
availability_zone: new_availability_zone,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
reason: ReconnectReason::SwitchAvailabilityZone,
|
||||
});
|
||||
}
|
||||
}
|
||||
None => debug!(
|
||||
"Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
|
||||
@@ -682,7 +668,6 @@ impl WalreceiverState {
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
availability_zone: new_availability_zone,
|
||||
reason: ReconnectReason::NoWalTimeout {
|
||||
current_lsn,
|
||||
current_commit_lsn,
|
||||
@@ -701,11 +686,10 @@ impl WalreceiverState {
|
||||
self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
|
||||
}
|
||||
None => {
|
||||
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
|
||||
let (new_sk_id, _, new_wal_source_connconf) =
|
||||
self.select_connection_candidate(None)?;
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
availability_zone: new_safekeeper_broker_data.availability_zone.clone(),
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
reason: ReconnectReason::NoExistingConnection,
|
||||
});
|
||||
@@ -810,7 +794,6 @@ impl WalreceiverState {
|
||||
struct NewWalConnectionCandidate {
|
||||
safekeeper_id: NodeId,
|
||||
wal_source_connconf: PgConnectionConfig,
|
||||
availability_zone: Option<String>,
|
||||
// This field is used in `derive(Debug)` only.
|
||||
#[allow(dead_code)]
|
||||
reason: ReconnectReason,
|
||||
@@ -825,7 +808,6 @@ enum ReconnectReason {
|
||||
new_commit_lsn: Lsn,
|
||||
threshold: NonZeroU64,
|
||||
},
|
||||
SwitchAvailabilityZone,
|
||||
NoWalTimeout {
|
||||
current_lsn: Lsn,
|
||||
current_commit_lsn: Lsn,
|
||||
@@ -891,7 +873,6 @@ mod tests {
|
||||
peer_horizon_lsn: 0,
|
||||
local_start_lsn: 0,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
availability_zone: None,
|
||||
},
|
||||
latest_update,
|
||||
}
|
||||
@@ -952,7 +933,6 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
@@ -1115,7 +1095,6 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
@@ -1181,7 +1160,6 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
@@ -1244,7 +1222,6 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
||||
discovered_new_wal: Some(NewCommittedWAL {
|
||||
@@ -1312,74 +1289,4 @@ mod tests {
|
||||
availability_zone: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn switch_to_same_availability_zone() -> anyhow::Result<()> {
|
||||
// Pageserver and one of safekeepers will be in the same availability zone
|
||||
// and pageserver should prefer to connect to it.
|
||||
let test_az = Some("test_az".to_owned());
|
||||
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
state.availability_zone = test_az.clone();
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
let connected_sk_id = NodeId(0);
|
||||
|
||||
let connection_status = WalConnectionStatus {
|
||||
is_connected: true,
|
||||
has_processed_wal: true,
|
||||
latest_connection_update: now,
|
||||
latest_wal_update: now,
|
||||
commit_lsn: Some(current_lsn),
|
||||
streaming_lsn: Some(current_lsn),
|
||||
};
|
||||
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
discovered_new_wal: None,
|
||||
});
|
||||
|
||||
// We have another safekeeper with the same commit_lsn, and it have the same availability zone as
|
||||
// the current pageserver.
|
||||
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
|
||||
same_az_sk.timeline.availability_zone = test_az.clone();
|
||||
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
connected_sk_id,
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(NodeId(1), same_az_sk),
|
||||
]);
|
||||
|
||||
// We expect that pageserver will switch to the safekeeper in the same availability zone,
|
||||
// even if it has the same commit_lsn.
|
||||
let next_candidate = state.next_connection_candidate().expect(
|
||||
"Expected one candidate selected out of multiple valid data options, but got none",
|
||||
);
|
||||
|
||||
assert_eq!(next_candidate.safekeeper_id, NodeId(1));
|
||||
assert_eq!(
|
||||
next_candidate.reason,
|
||||
ReconnectReason::SwitchAvailabilityZone,
|
||||
"Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
next_candidate.wal_source_connconf.host(),
|
||||
&Host::Domain("same_az".to_owned())
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
use std::{
|
||||
error::Error,
|
||||
pin::pin,
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -18,7 +17,7 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::{select, sync::watch, time};
|
||||
use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
@@ -37,7 +36,7 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use pq_proto::PageserverFeedback;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Status of the connection.
|
||||
@@ -188,7 +187,8 @@ pub async fn handle_walreceiver_connection(
|
||||
let query = format!("START_REPLICATION PHYSICAL {startpoint}");
|
||||
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
pin!(physical_stream);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
@@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection(
|
||||
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let last_received_lsn = u64::from(last_lsn);
|
||||
let write_lsn = u64::from(last_lsn);
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
|
||||
let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn());
|
||||
let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
|
||||
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
|
||||
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
|
||||
let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn);
|
||||
let apply_lsn = u64::from(timeline_remote_consistent_lsn);
|
||||
let ts = SystemTime::now();
|
||||
|
||||
// Update the status about what we just received. This is shown in the mgmt API.
|
||||
@@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection(
|
||||
let (timeline_logical_size, _) = timeline
|
||||
.get_current_logical_size(&ctx)
|
||||
.context("Status update creation failed to get current logical size")?;
|
||||
let status_update = PageserverFeedback {
|
||||
let status_update = ReplicationFeedback {
|
||||
current_timeline_size: timeline_logical_size,
|
||||
last_received_lsn,
|
||||
disk_consistent_lsn,
|
||||
remote_consistent_lsn,
|
||||
replytime: ts,
|
||||
ps_writelsn: write_lsn,
|
||||
ps_flushlsn: flush_lsn,
|
||||
ps_applylsn: apply_lsn,
|
||||
ps_replytime: ts,
|
||||
};
|
||||
|
||||
debug!("neon_status_update {status_update:?}");
|
||||
|
||||
@@ -127,21 +127,12 @@ impl UploadQueue {
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
for layer_name in &index_part.timeline_layers {
|
||||
match index_part
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
{
|
||||
Some(layer_metadata) => {
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
None => {
|
||||
anyhow::bail!(
|
||||
"No remote layer metadata found for layer {}",
|
||||
layer_name.file_name()
|
||||
);
|
||||
}
|
||||
}
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
|
||||
@@ -883,7 +883,7 @@ impl PostgresRedoManager {
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
while nresult < usize::from(BLCKSZ) {
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
*/
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <sys/statvfs.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
@@ -35,9 +34,6 @@
|
||||
#include "storage/fd.h"
|
||||
#include "storage/pg_shmem.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/procsignal.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
|
||||
/*
|
||||
* Local file cache is used to temporary store relations pages in local file system.
|
||||
@@ -63,9 +59,6 @@
|
||||
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||
|
||||
#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
|
||||
#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
|
||||
|
||||
typedef struct FileCacheEntry
|
||||
{
|
||||
BufferTag key;
|
||||
@@ -78,7 +71,6 @@ typedef struct FileCacheEntry
|
||||
typedef struct FileCacheControl
|
||||
{
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
dlist_head lru; /* double linked list for LRU replacement algorithm */
|
||||
} FileCacheControl;
|
||||
|
||||
@@ -87,14 +79,12 @@ static int lfc_desc;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static int lfc_free_space_watermark;
|
||||
static char* lfc_path;
|
||||
static FileCacheControl* lfc_ctl;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
|
||||
|
||||
static void
|
||||
lfc_shmem_startup(void)
|
||||
@@ -122,7 +112,6 @@ lfc_shmem_startup(void)
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
lfc_ctl->size = 0;
|
||||
lfc_ctl->used = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
|
||||
/* Remove file cache on restart */
|
||||
@@ -176,7 +165,7 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
}
|
||||
}
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
|
||||
while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
|
||||
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
@@ -186,86 +175,12 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
elog(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
|
||||
lfc_ctl->used -= 1;
|
||||
lfc_ctl->size -= 1;
|
||||
}
|
||||
elog(LOG, "set local file cache limit to %d", new_size);
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Local file system state monitor check available free space.
|
||||
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
|
||||
* but throwing away least recently accessed chunks.
|
||||
* First time low space watermark is reached cache size is divided by two,
|
||||
* second time by four,... Finally we remove all chunks from local cache.
|
||||
*
|
||||
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
|
||||
* We only throw away cached chunks but do not prevent from filling cache by new chunks.
|
||||
*
|
||||
* Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
|
||||
* disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
|
||||
* Calling statvfs each second should not add any noticeable overhead.
|
||||
*/
|
||||
void
|
||||
FileCacheMonitorMain(Datum main_arg)
|
||||
{
|
||||
/*
|
||||
* Choose file system state monitor interval so that space can not be exosted
|
||||
* during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
|
||||
*/
|
||||
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
|
||||
|
||||
/* Establish signal handlers. */
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
/* Periodically dump buffers until terminated. */
|
||||
while (!ShutdownRequestPending)
|
||||
{
|
||||
if (lfc_size_limit != 0)
|
||||
{
|
||||
struct statvfs sfs;
|
||||
if (statvfs(lfc_path, &sfs) < 0)
|
||||
{
|
||||
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
|
||||
{
|
||||
if (lfc_shrinking_factor < 31) {
|
||||
lfc_shrinking_factor += 1;
|
||||
}
|
||||
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
|
||||
}
|
||||
else
|
||||
lfc_shrinking_factor = 0; /* reset to initial value */
|
||||
}
|
||||
}
|
||||
pg_usleep(monitor_interval);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lfc_register_free_space_monitor(void)
|
||||
{
|
||||
BackgroundWorker bgw;
|
||||
memset(&bgw, 0, sizeof(bgw));
|
||||
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
||||
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
|
||||
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
|
||||
snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
|
||||
snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
|
||||
bgw.bgw_restart_time = 5;
|
||||
bgw.bgw_notify_pid = 0;
|
||||
bgw.bgw_main_arg = (Datum) 0;
|
||||
|
||||
RegisterBackgroundWorker(&bgw);
|
||||
}
|
||||
|
||||
void
|
||||
lfc_init(void)
|
||||
{
|
||||
@@ -302,19 +217,6 @@ lfc_init(void)
|
||||
lfc_change_limit_hook,
|
||||
NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.free_space_watermark",
|
||||
"Minimal free space in local file system after reaching which local file cache will be truncated",
|
||||
NULL,
|
||||
&lfc_free_space_watermark,
|
||||
1024, /* 1GB */
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_MB,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.file_cache_path",
|
||||
"Path to local file cache (can be raw device)",
|
||||
NULL,
|
||||
@@ -329,9 +231,6 @@ lfc_init(void)
|
||||
if (lfc_max_size == 0)
|
||||
return;
|
||||
|
||||
if (lfc_free_space_watermark != 0)
|
||||
lfc_register_free_space_monitor();
|
||||
|
||||
prev_shmem_startup_hook = shmem_startup_hook;
|
||||
shmem_startup_hook = lfc_shmem_startup;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
@@ -481,7 +380,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
* there are should be very large number of concurrent IO operations and them are limited by max_connections,
|
||||
* we prefer not to complicate code and use second approach.
|
||||
*/
|
||||
if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
|
||||
if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
@@ -491,10 +390,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
elog(LOG, "Swap file cache page");
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_ctl->used += 1;
|
||||
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
|
||||
}
|
||||
entry->access_count = 1;
|
||||
memset(entry->bitmap, 0, sizeof entry->bitmap);
|
||||
}
|
||||
|
||||
@@ -46,12 +46,8 @@ PGconn *pageserver_conn = NULL;
|
||||
*/
|
||||
WaitEventSet *pageserver_conn_wes = NULL;
|
||||
|
||||
/* GUCs */
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
int32 max_cluster_size;
|
||||
char *page_server_connstring;
|
||||
char *neon_auth_token;
|
||||
char *page_server_connstring_raw;
|
||||
char *safekeeper_token_env;
|
||||
|
||||
int n_unflushed_requests = 0;
|
||||
int flush_every_n_requests = 8;
|
||||
@@ -64,37 +60,10 @@ pageserver_connect(int elevel)
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
|
||||
Assert(!connected);
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so
|
||||
* when we set the password before the connection string, the
|
||||
* connection string can override the password from the env variable.
|
||||
* Seems useful, although we don't currently use that capability
|
||||
* anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = page_server_connstring;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pageserver_conn = PQconnectdbParams(keywords, values, 1);
|
||||
pageserver_conn = PQconnectdb(page_server_connstring);
|
||||
|
||||
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
@@ -156,7 +125,7 @@ pageserver_connect(int elevel)
|
||||
}
|
||||
}
|
||||
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
|
||||
|
||||
connected = true;
|
||||
return true;
|
||||
@@ -385,6 +354,105 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
||||
}
|
||||
|
||||
static char *
|
||||
substitute_pageserver_password(const char *page_server_connstring_raw)
|
||||
{
|
||||
char *host = NULL;
|
||||
char *port = NULL;
|
||||
char *user = NULL;
|
||||
char *auth_token = NULL;
|
||||
char *err = NULL;
|
||||
char *page_server_connstring = NULL;
|
||||
PQconninfoOption *conn_options;
|
||||
PQconninfoOption *conn_option;
|
||||
MemoryContext oldcontext;
|
||||
|
||||
/*
|
||||
* Here we substitute password in connection string with an environment
|
||||
* variable. To simplify things we construct a connection string back with
|
||||
* only known options. In particular: host port user and password. We do
|
||||
* not currently use other options and constructing full connstring in an
|
||||
* URI shape is quite messy.
|
||||
*/
|
||||
|
||||
if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0')
|
||||
return NULL;
|
||||
|
||||
/* extract the auth token from the connection string */
|
||||
conn_options = PQconninfoParse(page_server_connstring_raw, &err);
|
||||
if (conn_options == NULL)
|
||||
{
|
||||
/* The error string is malloc'd, so we must free it explicitly */
|
||||
char *errcopy = err ? pstrdup(err) : "out of memory";
|
||||
|
||||
PQfreemem(err);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid connection string syntax: %s", errcopy)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Trying to populate pageserver connection string with auth token from
|
||||
* environment. We are looking for password in with placeholder value like
|
||||
* $ENV_VAR_NAME, so if password field is present and starts with $ we try
|
||||
* to fetch environment variable value and fail loudly if it is not set.
|
||||
*/
|
||||
for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
|
||||
{
|
||||
if (strcmp(conn_option->keyword, "host") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
host = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "port") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
port = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "user") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
user = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "password") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
{
|
||||
/* ensure that this is a template */
|
||||
if (strncmp(conn_option->val, "$", 1) != 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1])));
|
||||
|
||||
neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
|
||||
auth_token = getenv(&conn_option->val[1]);
|
||||
if (!auth_token)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_log(LOG, "using auth token from environment passed via env");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* allocate connection string in TopMemoryContext to make sure it is not
|
||||
* freed
|
||||
*/
|
||||
oldcontext = CurrentMemoryContext;
|
||||
MemoryContextSwitchTo(TopMemoryContext);
|
||||
page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
PQconninfoFree(conn_options);
|
||||
return page_server_connstring;
|
||||
}
|
||||
|
||||
/*
|
||||
* Module initialization function
|
||||
*/
|
||||
@@ -394,12 +462,21 @@ pg_init_libpagestore(void)
|
||||
DefineCustomStringVariable("neon.pageserver_connstring",
|
||||
"connection string to the page server",
|
||||
NULL,
|
||||
&page_server_connstring,
|
||||
&page_server_connstring_raw,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.safekeeper_token_env",
|
||||
"the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
|
||||
NULL,
|
||||
&safekeeper_token_env,
|
||||
NULL,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.timeline_id",
|
||||
"Neon timeline_id the server is running on",
|
||||
NULL,
|
||||
@@ -456,10 +533,30 @@ pg_init_libpagestore(void)
|
||||
neon_log(PageStoreTrace, "libpagestore already loaded");
|
||||
page_server = &api;
|
||||
|
||||
/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
|
||||
neon_auth_token = getenv("NEON_AUTH_TOKEN");
|
||||
if (neon_auth_token)
|
||||
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
|
||||
/* substitute password in pageserver_connstring */
|
||||
page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
|
||||
|
||||
/* Is there more correct way to pass CustomGUC to postgres code? */
|
||||
neon_timeline_walproposer = neon_timeline;
|
||||
neon_tenant_walproposer = neon_tenant;
|
||||
|
||||
/* retrieve the token for Safekeeper, if present */
|
||||
if (safekeeper_token_env != NULL) {
|
||||
if (safekeeper_token_env[0] != '$') {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s",
|
||||
safekeeper_token_env)));
|
||||
}
|
||||
neon_safekeeper_token_walproposer = getenv(&safekeeper_token_env[1]);
|
||||
if (!neon_safekeeper_token_walproposer) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("cannot get safekeeper auth token, environment variable %s is not set",
|
||||
&safekeeper_token_env[1])));
|
||||
}
|
||||
neon_log(LOG, "using safekeeper auth token from environment variable");
|
||||
}
|
||||
|
||||
if (page_server_connstring && page_server_connstring[0])
|
||||
{
|
||||
|
||||
@@ -51,39 +51,12 @@ walprop_status(WalProposerConn *conn)
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
walprop_connect_start(char *conninfo, char *password)
|
||||
walprop_connect_start(char *conninfo)
|
||||
{
|
||||
WalProposerConn *conn;
|
||||
PGconn *pg_conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the
|
||||
* NEON_AUTH_TOKEN environment variable was set, use that as
|
||||
* the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so
|
||||
* when we set the password before the connection string, the
|
||||
* connection string can override the password from the env variable.
|
||||
* Seems useful, although we don't currently use that capability
|
||||
* anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (password)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = conninfo;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pg_conn = PQconnectStartParams(keywords, values, 1);
|
||||
pg_conn = PQconnectStart(conninfo);
|
||||
|
||||
/*
|
||||
* Allocation of a PQconn can fail, and will return NULL. We want to fully
|
||||
|
||||
@@ -12,11 +12,6 @@
|
||||
#ifndef NEON_H
|
||||
#define NEON_H
|
||||
|
||||
/* GUCs */
|
||||
extern char *neon_auth_token;
|
||||
extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
|
||||
@@ -92,6 +92,14 @@ const int SmgrTrace = DEBUG5;
|
||||
|
||||
page_server_api *page_server;
|
||||
|
||||
/* GUCs */
|
||||
char *page_server_connstring;
|
||||
|
||||
/*with substituted password*/
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
int32 max_cluster_size;
|
||||
|
||||
/* unlogged relation build states */
|
||||
typedef enum
|
||||
{
|
||||
|
||||
@@ -78,6 +78,10 @@ int wal_acceptor_reconnect_timeout;
|
||||
int wal_acceptor_connection_timeout;
|
||||
bool am_wal_proposer;
|
||||
|
||||
char *neon_timeline_walproposer = NULL;
|
||||
char *neon_tenant_walproposer = NULL;
|
||||
char *neon_safekeeper_token_walproposer = NULL;
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
static int n_safekeepers = 0;
|
||||
@@ -510,9 +514,17 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
Safekeeper *sk = &safekeeper[n_safekeepers];
|
||||
int written = 0;
|
||||
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, neon_timeline, neon_tenant);
|
||||
if (neon_safekeeper_token_walproposer != NULL) {
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
"host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, neon_safekeeper_token_walproposer, neon_timeline_walproposer,
|
||||
neon_tenant_walproposer);
|
||||
} else {
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer);
|
||||
}
|
||||
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
}
|
||||
@@ -538,16 +550,16 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
|
||||
greetRequest.pgVersion = PG_VERSION_NUM;
|
||||
pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
|
||||
greetRequest.systemId = systemId;
|
||||
if (!neon_timeline)
|
||||
if (!neon_timeline_walproposer)
|
||||
elog(FATAL, "neon.timeline_id is not provided");
|
||||
if (*neon_timeline != '\0' &&
|
||||
!HexDecodeString(greetRequest.timeline_id, neon_timeline, 16))
|
||||
elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline);
|
||||
if (!neon_tenant)
|
||||
if (*neon_timeline_walproposer != '\0' &&
|
||||
!HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16))
|
||||
elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
|
||||
if (!neon_tenant_walproposer)
|
||||
elog(FATAL, "neon.tenant_id is not provided");
|
||||
if (*neon_tenant != '\0' &&
|
||||
!HexDecodeString(greetRequest.tenant_id, neon_tenant, 16))
|
||||
elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant);
|
||||
if (*neon_tenant_walproposer != '\0' &&
|
||||
!HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16))
|
||||
elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/* FIXME don't use hardcoded timeline id */
|
||||
@@ -688,7 +700,7 @@ ResetConnection(Safekeeper *sk)
|
||||
/*
|
||||
* Try to establish new connection
|
||||
*/
|
||||
sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token);
|
||||
sk->conn = walprop_connect_start((char *) &sk->conninfo);
|
||||
|
||||
/*
|
||||
* "If the result is null, then libpq has been unable to allocate a new
|
||||
@@ -1872,9 +1884,9 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
return sk->state == SS_ACTIVE;
|
||||
}
|
||||
|
||||
/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
|
||||
/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */
|
||||
void
|
||||
ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf)
|
||||
ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf)
|
||||
{
|
||||
uint8 nkeys;
|
||||
int i;
|
||||
@@ -1892,45 +1904,45 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->currentClusterSize = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
|
||||
rf->currentClusterSize);
|
||||
}
|
||||
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
|
||||
else if (strcmp(key, "ps_writelsn") == 0)
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->last_received_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn));
|
||||
rf->ps_writelsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_writelsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
|
||||
else if (strcmp(key, "ps_flushlsn") == 0)
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
|
||||
rf->ps_flushlsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_flushlsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
|
||||
else if (strcmp(key, "ps_applylsn") == 0)
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
|
||||
rf->ps_applylsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_applylsn));
|
||||
}
|
||||
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
|
||||
else if (strcmp(key, "ps_replytime") == 0)
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->replytime = pq_getmsgint64(reply_message);
|
||||
rf->ps_replytime = pq_getmsgint64(reply_message);
|
||||
{
|
||||
char *replyTimeStr;
|
||||
|
||||
/* Copy because timestamptz_to_str returns a static buffer */
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
|
||||
rf->replytime, replyTimeStr);
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
|
||||
rf->ps_replytime, replyTimeStr);
|
||||
|
||||
pfree(replyTimeStr);
|
||||
}
|
||||
@@ -1944,7 +1956,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf
|
||||
* Skip unknown keys to support backward compatibile protocol
|
||||
* changes
|
||||
*/
|
||||
elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
pq_getmsgbytes(reply_message, len);
|
||||
};
|
||||
}
|
||||
@@ -2024,7 +2036,7 @@ GetAcknowledgedByQuorumWALPosition(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* WalproposerShmemSize --- report amount of shared memory space needed
|
||||
* ReplicationFeedbackShmemSize --- report amount of shared memory space needed
|
||||
*/
|
||||
Size
|
||||
WalproposerShmemSize(void)
|
||||
@@ -2054,10 +2066,10 @@ WalproposerShmemInit(void)
|
||||
}
|
||||
|
||||
void
|
||||
replication_feedback_set(PageserverFeedback * rf)
|
||||
replication_feedback_set(ReplicationFeedback * rf)
|
||||
{
|
||||
SpinLockAcquire(&walprop_shared->mutex);
|
||||
memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
|
||||
memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
|
||||
SpinLockRelease(&walprop_shared->mutex);
|
||||
}
|
||||
|
||||
@@ -2065,43 +2077,43 @@ void
|
||||
replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
|
||||
{
|
||||
SpinLockAcquire(&walprop_shared->mutex);
|
||||
*writeLsn = walprop_shared->feedback.last_received_lsn;
|
||||
*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
|
||||
*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
|
||||
*writeLsn = walprop_shared->feedback.ps_writelsn;
|
||||
*flushLsn = walprop_shared->feedback.ps_flushlsn;
|
||||
*applyLsn = walprop_shared->feedback.ps_applylsn;
|
||||
SpinLockRelease(&walprop_shared->mutex);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get PageserverFeedback fields from the most advanced safekeeper
|
||||
* Get ReplicationFeedback fields from the most advanced safekeeper
|
||||
*/
|
||||
static void
|
||||
GetLatestNeonFeedback(PageserverFeedback * rf)
|
||||
GetLatestNeonFeedback(ReplicationFeedback * rf)
|
||||
{
|
||||
int latest_safekeeper = 0;
|
||||
XLogRecPtr last_received_lsn = InvalidXLogRecPtr;
|
||||
XLogRecPtr ps_writelsn = InvalidXLogRecPtr;
|
||||
|
||||
for (int i = 0; i < n_safekeepers; i++)
|
||||
{
|
||||
if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
|
||||
if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn)
|
||||
{
|
||||
latest_safekeeper = i;
|
||||
last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn;
|
||||
ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn;
|
||||
}
|
||||
}
|
||||
|
||||
rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
|
||||
rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
|
||||
rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
|
||||
rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
|
||||
rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime;
|
||||
rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn;
|
||||
rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn;
|
||||
rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
|
||||
rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
|
||||
|
||||
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
|
||||
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
|
||||
" ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
|
||||
rf->currentClusterSize,
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn),
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
|
||||
rf->replytime);
|
||||
LSN_FORMAT_ARGS(rf->ps_writelsn),
|
||||
LSN_FORMAT_ARGS(rf->ps_flushlsn),
|
||||
LSN_FORMAT_ARGS(rf->ps_applylsn),
|
||||
rf->ps_replytime);
|
||||
|
||||
replication_feedback_set(rf);
|
||||
}
|
||||
@@ -2115,16 +2127,16 @@ HandleSafekeeperResponse(void)
|
||||
XLogRecPtr minFlushLsn;
|
||||
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
|
||||
diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
|
||||
diskConsistentLsn = quorumFeedback.rf.ps_flushlsn;
|
||||
|
||||
if (!syncSafekeepers)
|
||||
{
|
||||
/* Get PageserverFeedback fields from the most advanced safekeeper */
|
||||
/* Get ReplicationFeedback fields from the most advanced safekeeper */
|
||||
GetLatestNeonFeedback(&quorumFeedback.rf);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
}
|
||||
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
|
||||
{
|
||||
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn)
|
||||
@@ -2142,7 +2154,7 @@ HandleSafekeeperResponse(void)
|
||||
* apply_lsn - This is what processed and durably saved at*
|
||||
* pageserver.
|
||||
*/
|
||||
quorumFeedback.rf.disk_consistent_lsn,
|
||||
quorumFeedback.rf.ps_flushlsn,
|
||||
GetCurrentTimestamp(), false);
|
||||
}
|
||||
|
||||
@@ -2326,7 +2338,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
|
||||
ParsePageserverFeedbackMessage(&s, &msg->rf);
|
||||
ParseReplicationFeedbackMessage(&s, &msg->rf);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
@@ -2462,7 +2474,7 @@ backpressure_lag_impl(void)
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
#define MB ((XLogRecPtr)1024 * 1024)
|
||||
|
||||
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
LSN_FORMAT_ARGS(myFlushLsn),
|
||||
LSN_FORMAT_ARGS(writePtr),
|
||||
LSN_FORMAT_ARGS(flushPtr),
|
||||
|
||||
@@ -39,6 +39,10 @@ typedef struct WalProposerConn WalProposerConn;
|
||||
struct WalMessage;
|
||||
typedef struct WalMessage WalMessage;
|
||||
|
||||
extern char *neon_timeline_walproposer;
|
||||
extern char *neon_tenant_walproposer;
|
||||
extern char *neon_safekeeper_token_walproposer;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
@@ -280,21 +284,21 @@ typedef struct HotStandbyFeedback
|
||||
FullTransactionId catalog_xmin;
|
||||
} HotStandbyFeedback;
|
||||
|
||||
typedef struct PageserverFeedback
|
||||
typedef struct ReplicationFeedback
|
||||
{
|
||||
/* current size of the timeline on pageserver */
|
||||
uint64 currentClusterSize;
|
||||
/* standby_status_update fields that safekeeper received from pageserver */
|
||||
XLogRecPtr last_received_lsn;
|
||||
XLogRecPtr disk_consistent_lsn;
|
||||
XLogRecPtr remote_consistent_lsn;
|
||||
TimestampTz replytime;
|
||||
} PageserverFeedback;
|
||||
XLogRecPtr ps_writelsn;
|
||||
XLogRecPtr ps_flushlsn;
|
||||
XLogRecPtr ps_applylsn;
|
||||
TimestampTz ps_replytime;
|
||||
} ReplicationFeedback;
|
||||
|
||||
typedef struct WalproposerShmemState
|
||||
{
|
||||
slock_t mutex;
|
||||
PageserverFeedback feedback;
|
||||
ReplicationFeedback feedback;
|
||||
term_t mineLastElectedTerm;
|
||||
pg_atomic_uint64 backpressureThrottlingTime;
|
||||
} WalproposerShmemState;
|
||||
@@ -320,10 +324,10 @@ typedef struct AppendResponse
|
||||
/* Feedback recieved from pageserver includes standby_status_update fields */
|
||||
/* and custom neon feedback. */
|
||||
/* This part of the message is extensible. */
|
||||
PageserverFeedback rf;
|
||||
ReplicationFeedback rf;
|
||||
} AppendResponse;
|
||||
|
||||
/* PageserverFeedback is extensible part of the message that is parsed separately */
|
||||
/* ReplicationFeedback is extensible part of the message that is parsed separately */
|
||||
/* Other fields are fixed part */
|
||||
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
|
||||
|
||||
@@ -383,13 +387,13 @@ extern void WalProposerSync(int argc, char *argv[]);
|
||||
extern void WalProposerMain(Datum main_arg);
|
||||
extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
extern void WalProposerPoll(void);
|
||||
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
|
||||
PageserverFeedback *rf);
|
||||
extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
|
||||
ReplicationFeedback *rf);
|
||||
extern void StartProposerReplication(StartReplicationCmd *cmd);
|
||||
|
||||
extern Size WalproposerShmemSize(void);
|
||||
extern bool WalproposerShmemInit(void);
|
||||
extern void replication_feedback_set(PageserverFeedback *rf);
|
||||
extern void replication_feedback_set(ReplicationFeedback *rf);
|
||||
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
/* libpqwalproposer hooks & helper type */
|
||||
@@ -454,7 +458,7 @@ extern char *walprop_error_message(WalProposerConn *conn);
|
||||
extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
|
||||
|
||||
/* Re-exported PQconnectStart */
|
||||
extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
|
||||
extern WalProposerConn * walprop_connect_start(char *conninfo);
|
||||
|
||||
/* Re-exported PQconectPoll */
|
||||
extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
|
||||
|
||||
38
poetry.lock
generated
38
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
@@ -79,35 +79,37 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "allure-pytest"
|
||||
version = "2.13.1"
|
||||
version = "2.10.0"
|
||||
description = "Allure pytest integration"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
|
||||
{file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
|
||||
{file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
|
||||
{file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
allure-python-commons = "2.13.1"
|
||||
allure-python-commons = "2.10.0"
|
||||
pytest = ">=4.5.0"
|
||||
six = ">=1.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "allure-python-commons"
|
||||
version = "2.13.1"
|
||||
version = "2.10.0"
|
||||
description = "Common module for integrate allure with python-based frameworks"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
{file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
|
||||
{file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
|
||||
{file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
|
||||
{file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
attrs = ">=16.0.0"
|
||||
pluggy = ">=0.4.0"
|
||||
six = ">=1.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "async-timeout"
|
||||
@@ -1930,22 +1932,6 @@ pytest = [
|
||||
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-rerunfailures"
|
||||
version = "11.1.2"
|
||||
description = "pytest plugin to re-run tests to eliminate flaky failures"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
|
||||
{file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
packaging = ">=17.1"
|
||||
pytest = ">=5.3"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-timeout"
|
||||
version = "2.1.0"
|
||||
@@ -2611,4 +2597,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
|
||||
content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"
|
||||
|
||||
@@ -140,7 +140,7 @@ async fn auth_quirks(
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
#[tracing::instrument(fields(allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
&mut self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
|
||||
@@ -53,7 +53,7 @@ pub async fn password_hack(
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
creds.project = Some(payload.project);
|
||||
creds.project = Some(payload.project.into());
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(payload.password);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use crate::error::UserFacingError;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::collections::HashSet;
|
||||
use std::borrow::Cow;
|
||||
use thiserror::Error;
|
||||
use tracing::info;
|
||||
|
||||
@@ -19,10 +19,11 @@ pub enum ClientCredsParseError {
|
||||
InconsistentProjectNames { domain: String, option: String },
|
||||
|
||||
#[error(
|
||||
"Common name inferred from SNI ('{}') is not known",
|
||||
.cn,
|
||||
"SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
|
||||
SNI should be formatted as '<project-name>.{}'.",
|
||||
.sni, .cn, .cn,
|
||||
)]
|
||||
UnknownCommonName { cn: String },
|
||||
InconsistentSni { sni: String, cn: String },
|
||||
|
||||
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
|
||||
MalformedProjectName(String),
|
||||
@@ -36,7 +37,7 @@ impl UserFacingError for ClientCredsParseError {}
|
||||
pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||
pub project: Option<String>,
|
||||
pub project: Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
impl ClientCredentials<'_> {
|
||||
@@ -50,7 +51,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
pub fn parse(
|
||||
params: &'a StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_names: Option<HashSet<String>>,
|
||||
common_name: Option<&str>,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
@@ -59,43 +60,37 @@ impl<'a> ClientCredentials<'a> {
|
||||
let user = get_param("user")?;
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let project_option = params
|
||||
.options_raw()
|
||||
.and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
|
||||
.map(|name| name.to_string());
|
||||
let project_option = params.options_raw().and_then(|mut options| {
|
||||
options
|
||||
.find_map(|opt| opt.strip_prefix("project="))
|
||||
.map(Cow::Borrowed)
|
||||
});
|
||||
|
||||
let project_from_domain = if let Some(sni_str) = sni {
|
||||
if let Some(cn) = common_names {
|
||||
let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
|
||||
|
||||
let project = common_name_from_sni
|
||||
.and_then(|domain| {
|
||||
if cn.contains(domain) {
|
||||
subdomain_from_sni(sni_str, domain)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
// Alternative project name is in fact a subdomain from SNI.
|
||||
// NOTE: we do not consider SNI if `common_name` is missing.
|
||||
let project_domain = sni
|
||||
.zip(common_name)
|
||||
.map(|(sni, cn)| {
|
||||
subdomain_from_sni(sni, cn)
|
||||
.ok_or_else(|| InconsistentSni {
|
||||
sni: sni.into(),
|
||||
cn: cn.into(),
|
||||
})
|
||||
.ok_or_else(|| UnknownCommonName {
|
||||
cn: common_name_from_sni.unwrap_or("").into(),
|
||||
})?;
|
||||
.map(Cow::<'static, str>::Owned)
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
Some(project)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let project = match (project_option, project_from_domain) {
|
||||
let project = match (project_option, project_domain) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
(Some(option), Some(domain)) if option != domain => {
|
||||
Some(Err(InconsistentProjectNames { domain, option }))
|
||||
Some(Err(InconsistentProjectNames {
|
||||
domain: domain.into(),
|
||||
option: option.into(),
|
||||
}))
|
||||
}
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
|
||||
false => Err(MalformedProjectName(name)),
|
||||
false => Err(MalformedProjectName(name.into())),
|
||||
true => Ok(name),
|
||||
}),
|
||||
}
|
||||
@@ -154,9 +149,9 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("foo.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
|
||||
@@ -182,41 +177,24 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
|
||||
|
||||
let sni = Some("baz.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_multi_common_names() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.a.com");
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_projects_different() {
|
||||
let options =
|
||||
StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
|
||||
|
||||
let sni = Some("second.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -231,12 +209,13 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
let common_name = Some("example.com");
|
||||
|
||||
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
match err {
|
||||
UnknownCommonName { cn } => {
|
||||
assert_eq!(cn, "localhost");
|
||||
InconsistentSni { sni, cn } => {
|
||||
assert_eq!(sni, "project.localhost");
|
||||
assert_eq!(cn, "example.com");
|
||||
}
|
||||
_ => panic!("bad error: {err:?}"),
|
||||
}
|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
use crate::auth;
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use rustls::sign;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use std::{str::FromStr, sync::Arc, time::Duration};
|
||||
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
@@ -22,7 +16,7 @@ pub struct MetricCollectionConfig {
|
||||
|
||||
pub struct TlsConfig {
|
||||
pub config: Arc<rustls::ServerConfig>,
|
||||
pub common_names: Option<HashSet<String>>,
|
||||
pub common_name: Option<String>,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
@@ -32,33 +26,28 @@ impl TlsConfig {
|
||||
}
|
||||
|
||||
/// Configure TLS for the main endpoint.
|
||||
pub fn configure_tls(
|
||||
key_path: &str,
|
||||
cert_path: &str,
|
||||
certs_dir: Option<&String>,
|
||||
) -> anyhow::Result<TlsConfig> {
|
||||
let mut cert_resolver = CertResolver::new();
|
||||
pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
|
||||
// add default certificate
|
||||
cert_resolver.add_cert(key_path, cert_path)?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().map(rustls::PrivateKey).unwrap()
|
||||
};
|
||||
|
||||
// add extra certificates
|
||||
if let Some(certs_dir) = certs_dir {
|
||||
for entry in std::fs::read_dir(certs_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
let key_path = path.join("key.pem");
|
||||
let cert_path = path.join("cert.pem");
|
||||
if key_path.exists() && cert_path.exists() {
|
||||
cert_resolver
|
||||
.add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let common_names = cert_resolver.get_common_names();
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.context(format!(
|
||||
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.into_iter()
|
||||
.map(rustls::Certificate)
|
||||
.collect()
|
||||
};
|
||||
|
||||
let config = rustls::ServerConfig::builder()
|
||||
.with_safe_default_cipher_suites()
|
||||
@@ -66,105 +55,27 @@ pub fn configure_tls(
|
||||
// allow TLS 1.2 to be compatible with older client libraries
|
||||
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
|
||||
.with_no_client_auth()
|
||||
.with_cert_resolver(Arc::new(cert_resolver))
|
||||
.with_single_cert(cert_chain, key)?
|
||||
.into();
|
||||
|
||||
// determine common name from tls-cert (-c server.crt param).
|
||||
// used in asserting project name formatting invariant.
|
||||
let common_name = {
|
||||
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
|
||||
.context(format!(
|
||||
"Failed to parse PEM object from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.1;
|
||||
let common_name = pem.parse_x509()?.subject().to_string();
|
||||
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
|
||||
};
|
||||
|
||||
Ok(TlsConfig {
|
||||
config,
|
||||
common_names: Some(common_names),
|
||||
common_name,
|
||||
})
|
||||
}
|
||||
|
||||
struct CertResolver {
|
||||
certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
|
||||
}
|
||||
|
||||
impl CertResolver {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
certs: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
|
||||
let priv_key = {
|
||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().map(rustls::PrivateKey).unwrap()
|
||||
};
|
||||
|
||||
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.context(format!(
|
||||
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.into_iter()
|
||||
.map(rustls::Certificate)
|
||||
.collect()
|
||||
};
|
||||
|
||||
let common_name = {
|
||||
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
|
||||
.context(format!(
|
||||
"Failed to parse PEM object from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.1;
|
||||
let common_name = pem.parse_x509()?.subject().to_string();
|
||||
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
|
||||
}
|
||||
.context(format!(
|
||||
"Failed to parse common name from certificate at '{cert_path}'."
|
||||
))?;
|
||||
|
||||
self.certs.insert(
|
||||
common_name,
|
||||
Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_common_names(&self) -> HashSet<String> {
|
||||
self.certs.keys().map(|s| s.to_string()).collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl rustls::server::ResolvesServerCert for CertResolver {
|
||||
fn resolve(
|
||||
&self,
|
||||
_client_hello: rustls::server::ClientHello,
|
||||
) -> Option<Arc<rustls::sign::CertifiedKey>> {
|
||||
// loop here and cut off more and more subdomains until we find
|
||||
// a match to get a proper wildcard support. OTOH, we now do not
|
||||
// use nested domains, so keep this simple for now.
|
||||
//
|
||||
// With the current coding foo.com will match *.foo.com and that
|
||||
// repeats behavior of the old code.
|
||||
if let Some(mut sni_name) = _client_hello.server_name() {
|
||||
loop {
|
||||
if let Some(cert) = self.certs.get(sni_name) {
|
||||
return Some(cert.clone());
|
||||
}
|
||||
if let Some((_, rest)) = sni_name.split_once('.') {
|
||||
sni_name = rest;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper for cmdline cache options parsing.
|
||||
pub struct CacheOptions {
|
||||
/// Max number of entries.
|
||||
|
||||
@@ -132,11 +132,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
|
||||
args.get_one::<String>("tls-key"),
|
||||
args.get_one::<String>("tls-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
|
||||
key_path,
|
||||
cert_path,
|
||||
args.get_one::<String>("certs-dir"),
|
||||
)?),
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
|
||||
};
|
||||
@@ -258,12 +254,6 @@ fn cli() -> clap::Command {
|
||||
.alias("ssl-cert") // backwards compatibility
|
||||
.help("path to TLS cert for client postgres connections"),
|
||||
)
|
||||
// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
|
||||
.arg(
|
||||
Arg::new("certs-dir")
|
||||
.long("certs-dir")
|
||||
.help("path to directory with TLS certificates for client postgres connections"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("metric-collection-endpoint")
|
||||
.long("metric-collection-endpoint")
|
||||
|
||||
@@ -98,7 +98,7 @@ pub async fn task_main(
|
||||
}
|
||||
|
||||
// TODO(tech debt): unite this with its twin below.
|
||||
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
pub async fn handle_ws_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -124,11 +124,11 @@ pub async fn handle_ws_client(
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -140,7 +140,7 @@ pub async fn handle_ws_client(
|
||||
.await
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
async fn handle_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -163,11 +163,11 @@ async fn handle_client(
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let sni = stream.get_ref().sni_hostname();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_names))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
|
||||
@@ -54,11 +54,9 @@ fn generate_tls_config<'a>(
|
||||
.with_single_cert(vec![cert], key)?
|
||||
.into();
|
||||
|
||||
let common_names = Some([common_name.to_owned()].iter().cloned().collect());
|
||||
|
||||
TlsConfig {
|
||||
config,
|
||||
common_names,
|
||||
common_name: Some(common_name.to_string()),
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^2.2.3"
|
||||
pytest-order = "^1.0.1"
|
||||
allure-pytest = "^2.13.1"
|
||||
allure-pytest = "^2.10.0"
|
||||
pytest-asyncio = "^0.19.0"
|
||||
toml = "^0.10.2"
|
||||
psutil = "^5.9.4"
|
||||
@@ -34,7 +34,6 @@ types-psutil = "^5.9.5.4"
|
||||
types-toml = "^0.10.8"
|
||||
pytest-httpserver = "^1.0.6"
|
||||
aiohttp = "3.7.4"
|
||||
pytest-rerunfailures = "^11.1.2"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^23.1.0"
|
||||
@@ -70,9 +69,6 @@ strict = true
|
||||
module = [
|
||||
"asyncpg.*",
|
||||
"pg8000.*",
|
||||
"allure.*",
|
||||
"allure_commons.*",
|
||||
"allure_pytest.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.68.2"
|
||||
channel = "1.66.1"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -5,7 +5,6 @@ use anyhow::{bail, Context, Result};
|
||||
use clap::Parser;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use toml_edit::Document;
|
||||
use utils::signals::ShutdownSignals;
|
||||
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
@@ -40,7 +39,7 @@ use utils::{
|
||||
logging::{self, LogFormat},
|
||||
project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
tcp_listener,
|
||||
signals, tcp_listener,
|
||||
};
|
||||
|
||||
const PID_FILE_NAME: &str = "safekeeper.pid";
|
||||
@@ -217,6 +216,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||
|
||||
@@ -274,12 +274,15 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
// TODO: put more thoughts into handling of failed threads
|
||||
// We should catch & die if they are in trouble.
|
||||
// We probably should restart them.
|
||||
|
||||
// On any shutdown signal, log receival and exit. Additionally, handling
|
||||
// SIGQUIT prevents coredump.
|
||||
ShutdownSignals::handle(|signal| {
|
||||
info!("received {}, terminating", signal.name());
|
||||
// NOTE: we still have to handle signals like SIGQUIT to prevent coredumps
|
||||
signals.handle(|signal| {
|
||||
// TODO: implement graceful shutdown with joining threads etc
|
||||
info!(
|
||||
"received {}, terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(0);
|
||||
})
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user