Compare commits

..

3 Commits

Author SHA1 Message Date
Heikki Linnakangas
7eefad691c Improve the scaling, add Play/Stop buttons 2023-03-22 19:40:54 +02:00
Heikki Linnakangas
fe59a063ea WIP: Collect and draw layer trace 2023-03-22 19:40:54 +02:00
Heikki Linnakangas
ae8e5b3a8e Add test from PR #3673 2023-03-22 19:40:54 +02:00
145 changed files with 2356 additions and 5279 deletions

View File

@@ -15,32 +15,10 @@ outputs:
report-url:
description: 'Allure report URL'
value: ${{ steps.generate-report.outputs.report-url }}
report-json-url:
description: 'Allure report JSON URL'
value: ${{ steps.generate-report.outputs.report-json-url }}
runs:
using: "composite"
steps:
# We're using some of env variables quite offen, so let's set them once.
#
# It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
#
# - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
# - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
#
- name: Set common environment variables
shell: bash -euxo pipefail {0}
run: |
echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
env:
BUILD_TYPE: ${{ inputs.build_type }}
BUCKET: neon-github-public-dev
TEST_OUTPUT: /tmp/test_output
- name: Validate input parameters
shell: bash -euxo pipefail {0}
run: |
@@ -98,14 +76,16 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.21.0
ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
ALLURE_VERSION: 2.19.0
ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
- name: Upload Allure results
if: ${{ inputs.action == 'store' }}
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
shell: bash -euxo pipefail {0}
run: |
@@ -124,7 +104,7 @@ runs:
EOF
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
TEST_SELECTION=${{ inputs.test_selection }}
BUILD_TYPE=${BUILD_TYPE}
BUILD_TYPE=${{ inputs.build_type }}
EOF
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
@@ -133,12 +113,13 @@ runs:
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
- name: Acquire Allure lock
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
LOCK_TIMEOUT=300 # seconds
@@ -168,6 +149,8 @@ runs:
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
run: |
# Get previously uploaded data for this run
@@ -203,24 +186,24 @@ runs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
# Generate redirect
cat <<EOF > ${TEST_OUTPUT}/allure/index.html
cat <<EOF > ./index.html
<!DOCTYPE html>
<meta charset="utf-8">
<title>Redirecting to ${REPORT_URL}</title>
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
EOF
aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
- name: Release Allure lock
if: ${{ inputs.action == 'generate' && always() }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
@@ -229,16 +212,11 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- name: Cleanup
if: always()
shell: bash -euxo pipefail {0}
run: |
rm -rf ${TEST_OUTPUT}/allure
- uses: actions/github-script@v6
if: ${{ inputs.action == 'generate' && always() }}
env:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
BUILD_TYPE: ${{ inputs.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |

View File

@@ -14,12 +14,6 @@ inputs:
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
provisioner:
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'
outputs:
dsn:
@@ -37,10 +31,6 @@ runs:
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
fi
project=$(curl \
"https://${API_HOST}/api/v2/projects" \
--fail \
@@ -52,9 +42,6 @@ runs:
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"provisioner\": \"${PROVISIONER}\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": { }
}
}")
@@ -75,6 +62,3 @@ runs:
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
PROVISIONER: ${{ inputs.provisioner }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}

View File

@@ -44,10 +44,6 @@ inputs:
description: 'Secret access key'
required: false
default: ''
rerun_flaky:
description: 'Whether to rerun flaky tests'
required: false
default: 'false'
runs:
using: "composite"
@@ -105,7 +101,6 @@ runs:
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -148,13 +143,6 @@ runs:
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
if [ "${RERUN_FLAKY}" == "true" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then

View File

@@ -8,16 +8,6 @@ storage:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
metric_collection_interval: 10min
disk_usage_based_eviction:
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
min_avail_bytes: 0
period: "10s"
tenant_config:
eviction_policy:
kind: "LayerAccessThreshold"
period: "10m"
threshold: &default_eviction_threshold "24h"
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -8,16 +8,6 @@ storage:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
metric_collection_interval: 10min
disk_usage_based_eviction:
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
min_avail_bytes: 0
period: "10s"
tenant_config:
eviction_policy:
kind: "LayerAccessThreshold"
period: "10m"
threshold: &default_eviction_threshold "24h"
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -8,16 +8,6 @@ storage:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
metric_collection_interval: 10min
disk_usage_based_eviction:
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
min_avail_bytes: 0
period: "10s"
tenant_config:
eviction_policy:
kind: "LayerAccessThreshold"
period: "10m"
threshold: &default_eviction_threshold "24h"
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -8,16 +8,6 @@ storage:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
metric_collection_interval: 10min
disk_usage_based_eviction:
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
min_avail_bytes: 0
period: "10s"
tenant_config:
eviction_policy:
kind: "LayerAccessThreshold"
period: "10m"
threshold: &default_eviction_threshold "24h"
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -3,8 +3,6 @@
# fetch params from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type)
DISK_SIZE=$(df -B1 /storage | tail -1 | awk '{print $2}')
# store fqdn hostname in var
HOST=$(hostname -f)
@@ -20,9 +18,7 @@ cat <<EOF | tee /tmp/payload
"http_host": "${HOST}",
"http_port": 9898,
"active": false,
"availability_zone_id": "${AZ_ID}",
"disk_size": ${DISK_SIZE},
"instance_type": "${INSTANCE_TYPE}"
"availability_zone_id": "${AZ_ID}"
}
EOF

View File

@@ -8,16 +8,11 @@ storage:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
metric_collection_interval: 10min
disk_usage_based_eviction:
max_usage_pct: 80
min_avail_bytes: 0
period: "10s"
tenant_config:
eviction_policy:
kind: "LayerAccessThreshold"
period: "20m"
threshold: &default_eviction_threshold "20m"
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
threshold: "20m"
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -8,16 +8,11 @@ storage:
pg_distrib_dir: /usr/local
metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
metric_collection_interval: 10min
disk_usage_based_eviction:
max_usage_pct: 80
min_avail_bytes: 0
period: "10s"
tenant_config:
eviction_policy:
kind: "LayerAccessThreshold"
period: "20m"
threshold: &default_eviction_threshold "20m"
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
threshold: "20m"
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"

View File

@@ -30,9 +30,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: dev
neon_region: eu-west-1
zenith_service: proxy-scram
zenith_env: dev
zenith_region: eu-west-1
zenith_region_slug: eu-west-1
exposedService:
annotations:

View File

@@ -15,9 +15,10 @@ settings:
# -- Additional labels for neon-proxy-link pods
podLabels:
neon_service: proxy
neon_env: dev
neon_region: us-east-2
zenith_service: proxy
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
service:
type: LoadBalancer

View File

@@ -15,9 +15,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram-legacy
neon_env: dev
neon_region: us-east-2
zenith_service: proxy-scram-legacy
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:

View File

@@ -23,7 +23,6 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
sentryEnvironment: "staging"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
@@ -31,9 +30,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: dev
neon_region: us-east-2
zenith_service: proxy-scram
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:

View File

@@ -31,9 +31,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: prod
neon_region: ap-southeast-1
zenith_service: proxy-scram
zenith_env: prod
zenith_region: ap-southeast-1
zenith_region_slug: ap-southeast-1
exposedService:
annotations:

View File

@@ -31,9 +31,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: prod
neon_region: eu-central-1
zenith_service: proxy-scram
zenith_env: prod
zenith_region: eu-central-1
zenith_region_slug: eu-central-1
exposedService:
annotations:

View File

@@ -13,9 +13,10 @@ settings:
# -- Additional labels for zenith-proxy pods
podLabels:
neon_service: proxy
neon_env: production
neon_region: us-east-2
zenith_service: proxy
zenith_env: production
zenith_region: us-east-2
zenith_region_slug: us-east-2
service:
type: LoadBalancer

View File

@@ -31,9 +31,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: prod
neon_region: us-east-2
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:

View File

@@ -31,9 +31,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: prod
neon_region: us-west-2
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-west-2
zenith_region_slug: us-west-2
exposedService:
annotations:

View File

@@ -31,9 +31,10 @@ settings:
# -- Additional labels for neon-proxy pods
podLabels:
neon_service: proxy-scram
neon_env: prod
neon_region: us-west-2
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-west-2
zenith_region_slug: us-west-2
exposedService:
annotations:

View File

@@ -3,12 +3,8 @@
## Issue ticket number and link
## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging
- [ ] Do not forget to reformat commit message to not include the above checklist

View File

@@ -111,7 +111,6 @@ jobs:
strategy:
fail-fast: false
matrix:
# neon-captest-freetier: Run pgbench with freetier-limited compute
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
@@ -121,9 +120,6 @@ jobs:
db_size: [ 10gb ]
runner: [ us-east-2 ]
include:
- platform: neon-captest-freetier
db_size: 3gb
runner: us-east-2
- platform: neon-captest-prefetch
db_size: 50gb
runner: us-east-2
@@ -164,14 +160,13 @@ jobs:
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch", "neon-captest-freetier"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
- name: Set up Connection String
id: set-up-connstr
@@ -180,7 +175,7 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neon-captest-new | neon-captest-prefetch | neon-captest-freetier)
neon-captest-new | neon-captest-prefetch)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -190,7 +185,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', neon-captest-freetier, 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

View File

@@ -184,10 +184,10 @@ jobs:
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
CARGO_FLAGS="--locked $CARGO_FEATURES"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
@@ -240,18 +240,11 @@ jobs:
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
- name: Run cargo test
run: |
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
${cov_prefix} cargo test $CARGO_FLAGS
- name: Install rust binaries
run: |
@@ -275,7 +268,7 @@ jobs:
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $test_exe_paths; do
@@ -335,10 +328,6 @@ jobs:
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
rerun_flaky: true
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
@@ -375,90 +364,42 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
create-test-report:
merge-allure-report:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: ${{ !cancelled() }}
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- uses: actions/checkout@v3
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: Create Allure report (debug)
if: ${{ !cancelled() }}
id: create-allure-report-debug
- name: Create Allure report
id: create-allure-report
uses: ./.github/actions/allure-report
with:
action: generate
build_type: debug
- name: Create Allure report (release)
if: ${{ !cancelled() }}
id: create-allure-report-release
uses: ./.github/actions/allure-report
with:
action: generate
build_type: release
- uses: actions/github-script@v6
if: >
!cancelled() &&
github.event_name == 'pull_request' && (
steps.create-allure-report-debug.outputs.report-url ||
steps.create-allure-report-release.outputs.report-url
)
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const reports = [{
buildType: "debug",
reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
}, {
buildType: "release",
reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}",
}]
const script = require("./scripts/pr-comment-test-report.js")
await script({
github,
context,
fetch,
reports,
})
build_type: ${{ matrix.build_type }}
- name: Store Allure test stat in the DB
if: >
!cancelled() && (
steps.create-allure-report-debug.outputs.report-url ||
steps.create-allure-report-release.outputs.report-url
)
if: ${{ steps.create-allure-report.outputs.report-url }}
env:
BUILD_TYPE: ${{ matrix.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
run: |
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
./scripts/pysync
for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
if [ -z "$report_url" ]; then
continue
fi
if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
BUILD_TYPE=debug
else
BUILD_TYPE=release
fi
curl --fail --output suites.json "${report_url}"
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
done
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
runs-on: [ self-hosted, gen3, small ]
@@ -950,16 +891,6 @@ jobs:
needs: [ push-docker-hub, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Checkout
uses: actions/checkout@v3
with:

View File

@@ -53,14 +53,14 @@ jobs:
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |

View File

@@ -31,3 +31,4 @@ jobs:
head: releases/${{ steps.date.outputs.date }}
base: release
title: Release ${{ steps.date.outputs.date }}
team_reviewers: release

24
Cargo.lock generated
View File

@@ -2474,7 +2474,6 @@ dependencies = [
"strum",
"strum_macros",
"svg_fmt",
"sync_wrapper",
"tempfile",
"tenant_size_model",
"thiserror",
@@ -3086,7 +3085,6 @@ dependencies = [
"serde",
"serde_json",
"tempfile",
"test-context",
"tokio",
"tokio-util",
"toml_edit",
@@ -3890,27 +3888,6 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "test-context"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
dependencies = [
"async-trait",
"futures",
"test-context-macros",
]
[[package]]
name = "test-context-macros"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "textwrap"
version = "0.16.0"
@@ -4557,7 +4534,6 @@ dependencies = [
"once_cell",
"pin-project-lite",
"rand",
"regex",
"routerify",
"sentry",
"serde",

View File

@@ -97,7 +97,6 @@ strum_macros = "0.24"
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }

View File

@@ -38,7 +38,6 @@ RUN cd postgres && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
@@ -301,27 +300,6 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
#########################################################################################
#
# Layer "timescaledb-pg-build"
# compile timescaledb extension
#
#########################################################################################
FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN apt-get update && \
apt-get install -y cmake && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control
#########################################################################################
#
# Layer "rust extensions"
@@ -426,7 +404,6 @@ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -40,8 +40,6 @@ pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf
```
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
2. [Install Rust](https://www.rust-lang.org/tools/install)
```
# recommended approach from https://www.rust-lang.org/tools/install

View File

@@ -34,14 +34,13 @@ use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{Arc, Condvar, Mutex};
use std::sync::{Arc, RwLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use tracing::{error, info};
use url::Url;
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
use compute_tools::http::api::launch_http_server;
@@ -50,6 +49,7 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::pg_helpers::*;
use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -62,7 +62,7 @@ fn main() -> Result<()> {
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
@@ -71,107 +71,40 @@ fn main() -> Result<()> {
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let mut spec = Default::default();
let mut spec_set = false;
let mut live_config_allowed = false;
match spec_json {
let spec: ComputeSpec = match spec {
// First, try to get cluster spec from the cli argument
Some(json) => {
spec = serde_json::from_str(json)?;
spec_set = true;
}
Some(json) => serde_json::from_str(json)?,
None => {
// Second, try to read it from the file if path is provided
if let Some(sp) = spec_path {
let path = Path::new(sp);
let file = File::open(path)?;
spec = serde_json::from_reader(file)?;
spec_set = true;
serde_json::from_reader(file)?
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
live_config_allowed = true;
if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
spec = s;
spec_set = true;
}
let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
Ok(v) => v,
Err(_) => "".to_string(),
};
reqwest::blocking::Client::new()
.get(cp_uri)
.header("Authorization", jwt)
.send()?
.json()?
} else {
panic!("must specify both --control-plane-uri and --compute-id or none");
panic!(
"must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
control_plane_uri, compute_id
);
}
} else {
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
panic!("compute spec should be provided via --spec or --spec-path argument");
}
}
};
let mut new_state = ComputeState::new();
if spec_set {
new_state.spec = spec;
}
let compute_node = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
live_config_allowed,
metrics: ComputeMetrics::default(),
state: Mutex::new(new_state),
state_changed: Condvar::new(),
};
let compute = Arc::new(compute_node);
// Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::ConfigurationPending {
info!("got spec, continue configuration");
// Spec is already set by the http server handler.
break;
}
}
}
// We got all we need, fill in the state.
let mut state = compute.state.lock().unwrap();
let pageserver_connstr = state
.spec
.cluster
.settings
.find("neon.pageserver_connstring")
.expect("pageserver connstr should be provided");
let storage_auth_token = state.spec.storage_auth_token.clone();
let tenant = state
.spec
.cluster
.settings
.find("neon.tenant_id")
.expect("tenant id should be provided");
let timeline = state
.spec
.cluster
.settings
.find("neon.timeline_id")
.expect("tenant id should be provided");
let startup_tracing_context = state.spec.startup_tracing_context.clone();
state.pageserver_connstr = pageserver_connstr;
state.storage_auth_token = storage_auth_token;
state.tenant = tenant;
state.timeline = timeline;
state.status = ComputeStatus::Init;
compute.state_changed.notify_all();
drop(state);
// Extract OpenTelemetry context for the startup actions from the spec, and
// attach it to the current tracing context.
//
@@ -187,7 +120,7 @@ fn main() -> Result<()> {
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
Some(TraceContextPropagator::new().extract(carrier).attach())
@@ -195,7 +128,41 @@ fn main() -> Result<()> {
None
};
// Launch remaining service threads
let pageserver_connstr = spec
.cluster
.settings
.find("neon.pageserver_connstring")
.expect("pageserver connstr should be provided");
let storage_auth_token = spec.storage_auth_token.clone();
let tenant = spec
.cluster
.settings
.find("neon.tenant_id")
.expect("tenant id should be provided");
let timeline = spec
.cluster
.settings
.find("neon.timeline_id")
.expect("tenant id should be provided");
let compute_state = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
spec,
tenant,
timeline,
pageserver_connstr,
storage_auth_token,
metrics: ComputeMetrics::default(),
state: RwLock::new(ComputeState::new()),
};
let compute = Arc::new(compute_state);
// Launch service threads first, so we were able to serve availability
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
// Start Postgres
@@ -205,7 +172,7 @@ fn main() -> Result<()> {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
let mut state = compute.state.lock().unwrap();
let mut state = compute.state.write().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
drop(state);
@@ -236,14 +203,13 @@ fn main() -> Result<()> {
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
info!("shutting down");
}
info!("shutting down tracing");
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
tracing_utils::shutdown_tracing();
info!("shutting down");
exit(exit_code.unwrap_or(1))
}
@@ -295,7 +261,7 @@ fn cli() -> clap::Command {
Arg::new("control-plane-uri")
.short('p')
.long("control-plane-uri")
.value_name("CONTROL_PLANE_API_BASE_URI"),
.value_name("CONTROL_PLANE"),
)
}

View File

@@ -20,12 +20,12 @@ use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Condvar, Mutex};
use std::sync::RwLock;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use serde::Serialize;
use serde::{Serialize, Serializer};
use tokio_postgres;
use tracing::{info, instrument, warn};
@@ -41,52 +41,41 @@ pub struct ComputeNode {
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
pub metrics: ComputeMetrics,
/// We should only allow live re- / configuration of the compute node if
/// it uses 'pull model', i.e. it can go to control-plane and fetch
/// the latest configuration. Otherwise, there could be a case:
/// - we start compute with some spec provided as argument
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
/// - but then it is restarted without any spec again
pub live_config_allowed: bool,
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
/// To allow HTTP API server to serving status requests, while configuration
/// is in progress, lock should be held only for short periods of time to do
/// read/write, not the whole configuration process.
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
}
#[derive(Clone, Debug)]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
pub last_active: DateTime<Utc>,
pub error: Option<String>,
pub spec: ComputeSpec,
pub tenant: String,
pub timeline: String,
pub pageserver_connstr: String,
pub storage_auth_token: Option<String>,
pub metrics: ComputeMetrics,
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
/// to allow HTTP API server to serve status requests, while configuration
/// is in progress.
pub state: RwLock<ComputeState>,
}
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
x.to_rfc3339().serialize(s)
}
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: DateTime<Utc>,
pub error: Option<String>,
}
impl ComputeState {
pub fn new() -> Self {
Self {
status: ComputeStatus::Empty,
status: ComputeStatus::Init,
last_active: Utc::now(),
error: None,
spec: ComputeSpec::default(),
tenant: String::new(),
timeline: String::new(),
pageserver_connstr: String::new(),
storage_auth_token: None,
}
}
}
@@ -97,22 +86,11 @@ impl Default for ComputeState {
}
}
#[derive(Serialize, Clone, Copy, PartialEq, Eq, Debug)]
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
// Spec wasn't provided at start, waiting for it to be
// provided by control-plane.
Empty,
// Compute configuration was requested.
ConfigurationPending,
// Compute node has spec and initial startup and
// configuration is in progress.
Init,
// Compute is configured and running.
Running,
// Either startup or configuration failed,
// compute will exit soon or is waiting for
// control-plane to terminate it.
Failed,
}
@@ -126,13 +104,11 @@ pub struct ComputeMetrics {
impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) {
let mut state = self.state.lock().unwrap();
state.status = status;
self.state_changed.notify_all();
self.state.write().unwrap().status = status;
}
pub fn get_status(&self) -> ComputeStatus {
self.state.lock().unwrap().status
self.state.read().unwrap().status
}
// Remove `pgdata` directory and create it again with right permissions.
@@ -148,15 +124,15 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip(self, compute_state))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> {
#[instrument(skip(self))]
fn get_basebackup(&self, lsn: &str) -> Result<()> {
let start_time = Utc::now();
let mut config = postgres::Config::from_str(&compute_state.pageserver_connstr)?;
let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
if let Some(storage_auth_token) = &compute_state.storage_auth_token {
if let Some(storage_auth_token) = &self.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token);
} else {
@@ -165,14 +141,8 @@ impl ComputeNode {
let mut client = config.connect(NoTls)?;
let basebackup_cmd = match lsn {
"0/0" => format!(
"basebackup {} {}",
&compute_state.tenant, &compute_state.timeline
), // First start of the compute
_ => format!(
"basebackup {} {} {}",
&compute_state.tenant, &compute_state.timeline, lsn
),
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
@@ -199,14 +169,14 @@ impl ComputeNode {
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip(self, storage_auth_token))]
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<String> {
#[instrument(skip(self))]
fn sync_safekeepers(&self) -> Result<String> {
let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin)
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.envs(if let Some(storage_auth_token) = &storage_auth_token {
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
} else {
vec![]
@@ -247,9 +217,9 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip(self, compute_state))]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let spec = &compute_state.spec;
#[instrument(skip(self))]
pub fn prepare_pgdata(&self) -> Result<()> {
let spec = &self.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
@@ -258,18 +228,18 @@ impl ComputeNode {
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers(compute_state.storage_auth_token.clone())
.sync_safekeepers()
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
info!(
"getting basebackup@{} from pageserver {}",
lsn, &compute_state.pageserver_connstr
lsn, &self.pageserver_connstr
);
self.get_basebackup(compute_state, &lsn).with_context(|| {
self.get_basebackup(&lsn).with_context(|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, &compute_state.pageserver_connstr
lsn, &self.pageserver_connstr
)
})?;
@@ -282,16 +252,13 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip(self))]
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
) -> Result<std::process::Child> {
pub fn start_postgres(&self) -> Result<std::process::Child> {
let pgdata_path = Path::new(&self.pgdata);
// Run postgres as a child process.
let mut pg = Command::new(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &storage_auth_token {
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
} else {
vec![]
@@ -304,9 +271,8 @@ impl ComputeNode {
Ok(pg)
}
/// Do initial configuration of the already started Postgres.
#[instrument(skip(self, compute_state))]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
#[instrument(skip(self))]
pub fn apply_config(&self) -> Result<()> {
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
//
@@ -337,19 +303,19 @@ impl ComputeNode {
};
// Proceed with post-startup configuration. Note, that order of operations is important.
handle_roles(&compute_state.spec, &mut client)?;
handle_databases(&compute_state.spec, &mut client)?;
handle_role_deletions(&compute_state.spec, self.connstr.as_str(), &mut client)?;
handle_grants(&compute_state.spec, self.connstr.as_str(), &mut client)?;
handle_roles(&self.spec, &mut client)?;
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(self, &mut client)?;
create_writability_check_data(&mut client)?;
handle_extensions(&compute_state.spec, &mut client)?;
handle_extensions(&self.spec, &mut client)?;
// 'Close' connection
drop(client);
info!(
"finished configuration of compute for project {}",
compute_state.spec.cluster.cluster_id
self.spec.cluster.cluster_id
);
Ok(())
@@ -357,22 +323,21 @@ impl ComputeNode {
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
compute_state.spec.cluster.cluster_id,
compute_state.spec.operation_uuid.as_ref().unwrap(),
compute_state.tenant,
compute_state.timeline,
self.spec.cluster.cluster_id,
self.spec.operation_uuid.as_ref().unwrap(),
self.tenant,
self.timeline,
);
self.prepare_pgdata(&compute_state)?;
self.prepare_pgdata()?;
let start_time = Utc::now();
let pg = self.start_postgres(compute_state.storage_auth_token.clone())?;
let pg = self.start_postgres()?;
self.apply_config(&compute_state)?;
self.apply_config()?;
let startup_end_time = Utc::now();
self.metrics.config_ms.store(

View File

@@ -3,16 +3,12 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::compute::{ComputeNode, ComputeStatus};
use crate::http::requests::ConfigurationRequest;
use crate::http::responses::{ComputeStatusResponse, GenericAPIError};
use crate::compute::ComputeNode;
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info};
use tracing_utils::http::OtelName;
@@ -27,10 +23,8 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// Serialized compute state.
(&Method::GET, "/status") => {
info!("serving /status GET request");
let state = compute.state.lock().unwrap();
let status_response = ComputeStatusResponse::from(state.clone());
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
let state = compute.state.read().unwrap();
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
@@ -43,29 +37,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for check_writability request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
@@ -84,23 +61,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
))
}
// Accept spec in JSON format and request compute configuration. If
// anything goes wrong after we set the compute status to `ConfigurationPending`
// and update compute state with new spec, we basically leave compute
// in the potentially wrong state. That said, it's control-plane's
// responsibility to watch compute state after reconfiguration request
// and to clean restart in case of errors.
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
@@ -110,88 +70,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
async fn handle_configure_request(
req: Request<Body>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
return Err((
"live configuration is not allowed for this compute node".to_string(),
StatusCode::PRECONDITION_FAILED,
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
// XXX: wrap state update under lock in code blocks. Otherwise,
// we will try to `Send` `mut state` into the spawned thread
// bellow, which will cause error:
// ```
// error: future cannot be sent between threads safely
// ```
{
let mut state = compute.state.lock().unwrap();
if state.status != ComputeStatus::Empty {
let msg = format!(
"invalid compute status for configuration request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.spec = spec;
state.status = ComputeStatus::ConfigurationPending;
compute.state_changed.notify_all();
drop(state);
info!("set new spec and notified waiters");
}
// Spawn a blocking thread to wait for compute to become Running.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Running, current status: {:?}",
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.clone().unwrap_or("unknown error".to_string());
let msg = format!("compute configuration failed: {:?}", err);
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
}
}
Ok(())
})
.await
.unwrap()?;
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let status_response = ComputeStatusResponse::from(state);
Ok(serde_json::to_string(&status_response).unwrap())
} else {
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(state: Arc<ComputeNode>) {

View File

@@ -1,3 +1 @@
pub mod api;
pub mod requests;
pub mod responses;

View File

@@ -11,7 +11,7 @@ paths:
get:
tags:
- Info
summary: Get compute node internal status.
summary: Get compute node internal status
description: ""
operationId: getComputeStatus
responses:
@@ -26,7 +26,7 @@ paths:
get:
tags:
- Info
summary: Get compute node startup metrics in JSON format.
summary: Get compute node startup metrics in JSON format
description: ""
operationId: getComputeMetricsJSON
responses:
@@ -41,9 +41,9 @@ paths:
get:
tags:
- Info
summary: Get current compute insights in JSON format.
summary: Get current compute insights in JSON format
description: |
Note, that this doesn't include any historical data.
Note, that this doesn't include any historical data
operationId: getComputeInsights
responses:
200:
@@ -56,12 +56,12 @@ paths:
/info:
get:
tags:
- Info
summary: Get info about the compute pod / VM.
- "info"
summary: Get info about the compute Pod/VM
description: ""
operationId: getInfo
responses:
200:
"200":
description: Info
content:
application/json:
@@ -72,7 +72,7 @@ paths:
post:
tags:
- Check
summary: Check that we can write new data on this compute.
summary: Check that we can write new data on this compute
description: ""
operationId: checkComputeWritability
responses:
@@ -82,64 +82,9 @@ paths:
text/plain:
schema:
type: string
description: Error text or 'true' if check passed.
description: Error text or 'true' if check passed
example: "true"
/configure:
post:
tags:
- Configure
summary: Perform compute node configuration.
description: |
This is a blocking API endpoint, i.e. it blocks waiting until
compute is finished configuration and is in `Running` state.
Optional non-blocking mode could be added later.
operationId: configureCompute
requestBody:
description: Configuration request.
required: true
content:
application/json:
schema:
type: object
required:
- spec
properties:
spec:
# XXX: I don't want to explain current spec in the OpenAPI format,
# as it could be changed really soon. Consider doing it later.
type: object
responses:
200:
description: Compute configuration finished.
content:
application/json:
schema:
$ref: "#/components/schemas/ComputeState"
400:
description: Provided spec is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
412:
description: |
It's not possible to do live-configuration of the compute.
It's either in the wrong state, or compute doesn't use pull
mode of configuration.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: |
Compute configuration request was processed, but error
occurred. Compute will likely shutdown soon.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:
@@ -150,7 +95,7 @@ components:
schemas:
ComputeMetrics:
type: object
description: Compute startup metrics.
description: Compute startup metrics
required:
- sync_safekeepers_ms
- basebackup_ms
@@ -168,7 +113,7 @@ components:
Info:
type: object
description: Information about VM/Pod.
description: Information about VM/Pod
required:
- num_cpus
properties:
@@ -185,26 +130,17 @@ components:
$ref: '#/components/schemas/ComputeStatus'
last_active:
type: string
description: The last detected compute activity timestamp in UTC and RFC3339 format.
description: The last detected compute activity timestamp in UTC and RFC3339 format
example: "2022-10-12T07:20:50.52Z"
error:
type: string
description: Text of the error during compute startup, if any.
example: ""
tenant:
type: string
description: Identifier of the current tenant served by compute node, if any.
example: c9269c359e9a199fad1ea0981246a78f
timeline:
type: string
description: Identifier of the current timeline served by compute node, if any.
example: ece7de74d4b8cbe5433a68ce4d1b97b4
description: Text of the error during compute startup, if any
ComputeInsights:
type: object
properties:
pg_stat_statements:
description: Contains raw output from pg_stat_statements in JSON format.
description: Contains raw output from pg_stat_statements in JSON format
type: array
items:
type: object
@@ -215,19 +151,6 @@ components:
- init
- failed
- running
example: running
#
# Errors
#
GenericError:
type: object
required:
- error
properties:
error:
type: string
security:
- JWT: []

View File

@@ -1,11 +0,0 @@
use serde::Deserialize;
use crate::spec::ComputeSpec;
/// We now pass only `spec` in the configuration request, but later we can
/// extend it and something like `restart: bool` or something else. So put
/// `spec` into a struct initially to be more flexible in the future.
#[derive(Deserialize, Debug)]
pub struct ConfigurationRequest {
pub spec: ComputeSpec,
}

View File

@@ -1,40 +0,0 @@
use serde::{Serialize, Serializer};
use chrono::{DateTime, Utc};
use crate::compute::{ComputeState, ComputeStatus};
#[derive(Serialize, Debug)]
pub struct GenericAPIError {
pub error: String,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
pub tenant: String,
pub timeline: String,
pub status: ComputeStatus,
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: DateTime<Utc>,
pub error: Option<String>,
}
impl From<ComputeState> for ComputeStatusResponse {
fn from(state: ComputeState) -> Self {
ComputeStatusResponse {
tenant: state.tenant,
timeline: state.timeline,
status: state.status,
last_active: state.last_active,
error: state.error,
}
}
}
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
x.to_rfc3339().serialize(s)
}

View File

@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);
let mut last_active = compute.state.lock().unwrap().last_active;
let mut last_active = compute.state.read().unwrap().last_active;
if let Ok(backs) = backends {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
@@ -87,7 +87,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
// Update the last activity in the shared state if we got a more recent one.
let mut state = compute.state.lock().unwrap();
let mut state = compute.state.write().unwrap();
if last_active > state.last_active {
state.last_active = last_active;
debug!("set the last compute activity time to: {}", last_active);

View File

@@ -17,7 +17,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Deserialize, Debug)]
#[derive(Clone, Deserialize)]
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
@@ -26,7 +26,7 @@ pub struct Role {
/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Deserialize, Debug)]
#[derive(Clone, Deserialize)]
pub struct Database {
pub name: PgIdent,
pub owner: PgIdent,
@@ -36,7 +36,7 @@ pub struct Database {
/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Deserialize, Debug)]
#[derive(Clone, Deserialize)]
pub struct GenericOption {
pub name: String,
pub value: Option<String>,
@@ -74,9 +74,18 @@ impl GenericOption {
/// Represent `GenericOption` as configuration option.
pub fn to_pg_setting(&self) -> String {
if let Some(val) = &self.value {
// TODO: check in the console DB that we don't have these settings
// set for any non-deleted project and drop this override.
let name = match self.name.as_str() {
"safekeepers" => "neon.safekeepers",
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
it => it,
};
match self.vartype.as_ref() {
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
_ => format!("{} = {}", self.name, val),
"string" => format!("{} = '{}'", name, escape_conf_value(val)),
_ => format!("{} = {}", name, val),
}
} else {
self.name.to_owned()

View File

@@ -8,13 +8,14 @@ use postgres::{Client, NoTls};
use serde::Deserialize;
use tracing::{info, info_span, instrument, span_enabled, warn, Level};
use crate::compute::ComputeNode;
use crate::config;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Deserialize, Debug, Default)]
#[derive(Clone, Deserialize)]
pub struct ComputeSpec {
pub format_version: f32,
pub timestamp: String,
@@ -30,7 +31,7 @@ pub struct ComputeSpec {
/// Cluster state seen from the perspective of the external tools
/// like Rails web console.
#[derive(Clone, Deserialize, Debug, Default)]
#[derive(Clone, Deserialize)]
pub struct Cluster {
pub cluster_id: String,
pub name: String,
@@ -46,36 +47,13 @@ pub struct Cluster {
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Deserialize, Debug)]
#[derive(Clone, Deserialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
pub new_name: Option<PgIdent>,
}
/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
/// env variable is set, it will be used for authorization.
pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
Ok(v) => v,
Err(_) => "".to_string(),
};
info!("getting spec from control plane: {}", cp_uri);
// TODO: check the response. We should distinguish cases when it's
// - network error, then retry
// - no spec for compute yet, then wait
// - compute id is unknown or any other error, then bail out
let spec = reqwest::blocking::Client::new()
.get(cp_uri)
.header("Authorization", jwt)
.send()?
.json()?;
Ok(spec)
}
/// It takes cluster specification and does the following:
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
/// - Update `pg_hba.conf` to allow external connections.
@@ -248,8 +226,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Reassign all dependent objects and delete requested roles.
#[instrument(skip_all)]
pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
if let Some(ops) = &spec.delta_operations {
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
if let Some(ops) = &node.spec.delta_operations {
// First, reassign all dependent objects to db owners.
info!("reassigning dependent objects of to-be-deleted roles");
@@ -266,7 +244,7 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
// Check that role is still present in Postgres, as this could be a
// restart with the same spec after role deletion.
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
reassign_owned_objects(spec, connstr, &op.name)?;
reassign_owned_objects(node, &op.name)?;
}
}
@@ -290,10 +268,10 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
}
// Reassign all owned objects in all databases to the owner of the database.
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
for db in &spec.cluster.databases {
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
if db.owner != *role_name {
let mut conf = Config::from_str(connstr)?;
let mut conf = Config::from_str(node.connstr.as_str())?;
conf.dbname(&db.name);
let mut client = conf.connect(NoTls)?;
@@ -438,7 +416,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
#[instrument(skip_all)]
pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
@@ -470,8 +450,8 @@ pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) ->
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.
for db in &spec.cluster.databases {
let mut conf = Config::from_str(connstr)?;
for db in &node.spec.cluster.databases {
let mut conf = Config::from_str(node.connstr.as_str())?;
conf.dbname(&db.name);
let mut db_client = conf.connect(NoTls)?;

View File

@@ -87,9 +87,11 @@ impl ComputeControlPlane {
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timeline_id,
lsn,
tenant_id,
uses_wal_proposer: false,
pg_version,
});
@@ -111,9 +113,11 @@ pub struct PostgresNode {
name: String,
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
is_test: bool,
pub timeline_id: TimelineId,
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
pub tenant_id: TenantId,
uses_wal_proposer: bool,
pg_version: u32,
}
@@ -147,6 +151,7 @@ impl PostgresNode {
let port: u16 = conf.parse_field("port", &context)?;
let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
// Read postgres version from PG_VERSION file to determine which postgres version binary to use.
// If it doesn't exist, assume broken data directory and use default pg version.
@@ -166,9 +171,11 @@ impl PostgresNode {
name,
env: env.clone(),
pageserver: Arc::clone(pageserver),
is_test: false,
timeline_id,
lsn: recovery_target_lsn,
tenant_id,
uses_wal_proposer,
pg_version,
})
}
@@ -360,7 +367,7 @@ impl PostgresNode {
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
let backup_lsn = if let Some(lsn) = self.lsn {
Some(lsn)
} else if !self.env.safekeepers.is_empty() {
} else if self.uses_wal_proposer {
// LSN 0 means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
@@ -399,7 +406,7 @@ impl PostgresNode {
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
let mut cmd = Command::new(&pg_ctl_path);
let mut cmd = Command::new(pg_ctl_path);
cmd.args(
[
&[
@@ -428,9 +435,7 @@ impl PostgresNode {
cmd.env("NEON_AUTH_TOKEN", token);
}
let pg_ctl = cmd
.output()
.context(format!("{} failed", pg_ctl_path.display()))?;
let pg_ctl = cmd.output().context("pg_ctl failed")?;
if !pg_ctl.status.success() {
anyhow::bail!(
"pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
@@ -475,6 +480,10 @@ impl PostgresNode {
self.pg_ctl(&["start"], auth_token)
}
pub fn restart(&self, auth_token: &Option<String>) -> Result<()> {
self.pg_ctl(&["restart"], auth_token)
}
pub fn stop(&self, destroy: bool) -> Result<()> {
// If we are going to destroy data directory,
// use immediate shutdown mode, otherwise,
@@ -505,4 +514,26 @@ impl PostgresNode {
"postgres"
)
}
// XXX: cache that in control plane
pub fn whoami(&self) -> String {
let output = Command::new("whoami")
.output()
.expect("failed to execute whoami");
assert!(output.status.success(), "whoami failed");
String::from_utf8(output.stdout).unwrap().trim().to_string()
}
}
impl Drop for PostgresNode {
// destructor to clean up state after test is done
// XXX: we may detect failed test by setting some flag in catch_unwind()
// and checking it here. But let just clean datadirs on start.
fn drop(&mut self) {
if self.is_test {
let _ = self.stop(true);
}
}
}

View File

@@ -363,11 +363,6 @@ impl PageServerNode {
.map(|x| serde_json::from_str(x))
.transpose()
.context("Failed to parse 'eviction_policy' json")?,
min_resident_size_override: settings
.remove("min_resident_size_override")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'min_resident_size_override' as integer")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -440,11 +435,6 @@ impl PageServerNode {
.map(|x| serde_json::from_str(x))
.transpose()
.context("Failed to parse 'eviction_policy' json")?,
min_resident_size_override: settings
.get("min_resident_size_override")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'min_resident_size_override' as an integer")?,
})
.send()?
.error_from_body()?;

View File

@@ -156,7 +156,7 @@ impl SafekeeperNode {
}
background_process::start_process(
&format!("safekeeper-{id}"),
&format!("safekeeper {id}"),
&datadir,
&self.env.safekeeper_bin(),
&args,

View File

@@ -120,7 +120,6 @@ pub struct TenantCreateRequest {
// We might do that once the eviction feature has stabilizied.
// For now, this field is not even documented in the openapi_spec.yml.
pub eviction_policy: Option<serde_json::Value>,
pub min_resident_size_override: Option<u64>,
}
#[serde_as]
@@ -166,7 +165,6 @@ pub struct TenantConfigRequest {
// We might do that once the eviction feature has stabilizied.
// For now, this field is not even documented in the openapi_spec.yml.
pub eviction_policy: Option<serde_json::Value>,
pub min_resident_size_override: Option<u64>,
}
impl TenantConfigRequest {
@@ -187,7 +185,6 @@ impl TenantConfigRequest {
max_lsn_wal_lag: None,
trace_read_requests: None,
eviction_policy: None,
min_resident_size_override: None,
}
}
}

View File

@@ -293,9 +293,6 @@ impl FeStartupPacket {
// We shouldn't advance `buf` as probably full message is not there yet,
// so can't directly use Bytes::get_u32 etc.
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
// which is less readable
#[allow(clippy::manual_range_contains)]
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
return Err(ProtocolError::Protocol(format!(
"invalid startup packet message length {}",
@@ -939,40 +936,35 @@ impl<'a> BeMessage<'a> {
}
}
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
/// Serialized in custom flexible key/value format. In replication protocol, it
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
/// Standby status update / Hot standby feedback messages.
// Neon extension of postgres replication protocol
// See NEON_STATUS_UPDATE_TAG_BYTE
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageserverFeedback {
/// Last known size of the timeline. Used to enforce timeline size limit.
pub struct ReplicationFeedback {
// Last known size of the timeline. Used to enforce timeline size limit.
pub current_timeline_size: u64,
/// LSN last received and ingested by the pageserver.
pub last_received_lsn: u64,
/// LSN up to which data is persisted by the pageserver to its local disc.
pub disk_consistent_lsn: u64,
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
/// consider WAL before it can be removed.
pub remote_consistent_lsn: u64,
pub replytime: SystemTime,
// Parts of StandbyStatusUpdate we resend to compute via safekeeper
pub ps_writelsn: u64,
pub ps_applylsn: u64,
pub ps_flushlsn: u64,
pub ps_replytime: SystemTime,
}
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
// Do not remove previously available fields because this might be backwards incompatible.
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
impl PageserverFeedback {
pub fn empty() -> PageserverFeedback {
PageserverFeedback {
impl ReplicationFeedback {
pub fn empty() -> ReplicationFeedback {
ReplicationFeedback {
current_timeline_size: 0,
last_received_lsn: 0,
remote_consistent_lsn: 0,
disk_consistent_lsn: 0,
replytime: SystemTime::now(),
ps_writelsn: 0,
ps_applylsn: 0,
ps_flushlsn: 0,
ps_replytime: SystemTime::now(),
}
}
// Serialize PageserverFeedback using custom format
// Serialize ReplicationFeedback using custom format
// to support protocol extensibility.
//
// Following layout is used:
@@ -982,26 +974,24 @@ impl PageserverFeedback {
// null-terminated string - key,
// uint32 - value length in bytes
// value itself
//
// TODO: change serialized fields names once all computes migrate to rename.
pub fn serialize(&self, buf: &mut BytesMut) {
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
buf.put_slice(b"current_timeline_size\0");
buf.put_i32(8);
buf.put_u64(self.current_timeline_size);
buf.put_slice(b"ps_writelsn\0");
buf.put_i32(8);
buf.put_u64(self.last_received_lsn);
buf.put_u64(self.ps_writelsn);
buf.put_slice(b"ps_flushlsn\0");
buf.put_i32(8);
buf.put_u64(self.disk_consistent_lsn);
buf.put_u64(self.ps_flushlsn);
buf.put_slice(b"ps_applylsn\0");
buf.put_i32(8);
buf.put_u64(self.remote_consistent_lsn);
buf.put_u64(self.ps_applylsn);
let timestamp = self
.replytime
.ps_replytime
.duration_since(*PG_EPOCH)
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
.as_micros() as i64;
@@ -1011,10 +1001,9 @@ impl PageserverFeedback {
buf.put_i64(timestamp);
}
// Deserialize PageserverFeedback message
// TODO: change serialized fields names once all computes migrate to rename.
pub fn parse(mut buf: Bytes) -> PageserverFeedback {
let mut rf = PageserverFeedback::empty();
// Deserialize ReplicationFeedback message
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
let mut rf = ReplicationFeedback::empty();
let nfields = buf.get_u8();
for _ in 0..nfields {
let key = read_cstr(&mut buf).unwrap();
@@ -1027,39 +1016,39 @@ impl PageserverFeedback {
b"ps_writelsn" => {
let len = buf.get_i32();
assert_eq!(len, 8);
rf.last_received_lsn = buf.get_u64();
rf.ps_writelsn = buf.get_u64();
}
b"ps_flushlsn" => {
let len = buf.get_i32();
assert_eq!(len, 8);
rf.disk_consistent_lsn = buf.get_u64();
rf.ps_flushlsn = buf.get_u64();
}
b"ps_applylsn" => {
let len = buf.get_i32();
assert_eq!(len, 8);
rf.remote_consistent_lsn = buf.get_u64();
rf.ps_applylsn = buf.get_u64();
}
b"ps_replytime" => {
let len = buf.get_i32();
assert_eq!(len, 8);
let raw_time = buf.get_i64();
if raw_time > 0 {
rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
} else {
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
}
}
_ => {
let len = buf.get_i32();
warn!(
"PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
"ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
String::from_utf8_lossy(key.as_ref())
);
buf.advance(len as usize);
}
}
}
trace!("PageserverFeedback parsed is {:?}", rf);
trace!("ReplicationFeedback parsed is {:?}", rf);
rf
}
}
@@ -1070,33 +1059,33 @@ mod tests {
#[test]
fn test_replication_feedback_serialization() {
let mut rf = PageserverFeedback::empty();
let mut rf = ReplicationFeedback::empty();
// Fill rf with some values
rf.current_timeline_size = 12345678;
// Set rounded time to be able to compare it with deserialized value,
// because it is rounded up to microseconds during serialization.
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
let mut data = BytesMut::new();
rf.serialize(&mut data);
let rf_parsed = PageserverFeedback::parse(data.freeze());
let rf_parsed = ReplicationFeedback::parse(data.freeze());
assert_eq!(rf, rf_parsed);
}
#[test]
fn test_replication_feedback_unknown_key() {
let mut rf = PageserverFeedback::empty();
let mut rf = ReplicationFeedback::empty();
// Fill rf with some values
rf.current_timeline_size = 12345678;
// Set rounded time to be able to compare it with deserialized value,
// because it is rounded up to microseconds during serialization.
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
let mut data = BytesMut::new();
rf.serialize(&mut data);
// Add an extra field to the buffer and adjust number of keys
if let Some(first) = data.first_mut() {
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
*first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
}
data.put_slice(b"new_field_one\0");
@@ -1104,7 +1093,7 @@ mod tests {
data.put_u64(42);
// Parse serialized data and check that new field is not parsed
let rf_parsed = PageserverFeedback::parse(data.freeze());
let rf_parsed = ReplicationFeedback::parse(data.freeze());
assert_eq!(rf, rf_parsed);
}

View File

@@ -26,4 +26,3 @@ workspace_hack.workspace = true
[dev-dependencies]
tempfile.workspace = true
test-context.workspace = true

View File

@@ -39,9 +39,6 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// No limits on the client side, which currenltly means 1000 for AWS S3.
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -67,10 +64,6 @@ impl RemotePath {
pub fn object_name(&self) -> Option<&str> {
self.0.file_name().and_then(|os_str| os_str.to_str())
}
pub fn join(&self, segment: &Path) -> Self {
Self(self.0.join(segment))
}
}
/// Storage (potentially remote) API to manage its state.
@@ -78,6 +71,9 @@ impl RemotePath {
/// providing basic CRUD operations for storage files.
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync + 'static {
/// Lists all items the storage has right now.
async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
@@ -270,7 +266,6 @@ pub struct S3Config {
/// AWS S3 has various limits on its API calls, we need not to exceed those.
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
pub concurrency_limit: NonZeroUsize,
pub max_keys_per_list_response: Option<i32>,
}
impl Debug for S3Config {
@@ -280,10 +275,6 @@ impl Debug for S3Config {
.field("bucket_region", &self.bucket_region)
.field("prefix_in_bucket", &self.prefix_in_bucket)
.field("concurrency_limit", &self.concurrency_limit)
.field(
"max_keys_per_list_response",
&self.max_keys_per_list_response,
)
.finish()
}
}
@@ -312,11 +303,6 @@ impl RemoteStorageConfig {
)
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
let max_keys_per_list_response =
parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
.context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
.or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
let storage = match (local_path, bucket_name, bucket_region) {
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
(None, None, None) => return Ok(None),
@@ -338,7 +324,6 @@ impl RemoteStorageConfig {
.map(|endpoint| parse_toml_string("endpoint", endpoint))
.transpose()?,
concurrency_limit,
max_keys_per_list_response,
}),
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
parse_toml_string("local_path", local_path)?,

View File

@@ -73,8 +73,10 @@ impl LocalFs {
Ok(None)
}
}
}
#[cfg(test)]
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
Ok(get_all_files(&self.storage_root, true)
.await?
@@ -89,10 +91,7 @@ impl LocalFs {
})
.collect())
}
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,

View File

@@ -102,7 +102,6 @@ pub struct S3Bucket {
client: Client,
bucket_name: String,
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
// The helps to ensure we don't exceed the thresholds.
@@ -165,7 +164,6 @@ impl S3Bucket {
Ok(Self {
client,
bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
})
@@ -275,6 +273,48 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
let mut document_keys = Vec::new();
let mut continuation_token = None;
loop {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list")?;
metrics::inc_list_objects();
let fetch_response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(self.prefix_in_bucket.clone())
.set_continuation_token(continuation_token)
.send()
.await
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})?;
document_keys.extend(
fetch_response
.contents
.unwrap_or_default()
.into_iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
);
match fetch_response.continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(
@@ -314,7 +354,6 @@ impl RemoteStorage for S3Bucket {
.set_prefix(list_prefix.clone())
.set_continuation_token(continuation_token)
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.map_err(|e| {
@@ -332,7 +371,7 @@ impl RemoteStorage for S3Bucket {
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);
match fetch_response.next_continuation_token {
match fetch_response.continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}

View File

@@ -20,6 +20,7 @@ pub struct UnreliableWrapper {
/// Used to identify retries of different unique operation.
#[derive(Debug, Hash, Eq, PartialEq)]
enum RemoteOp {
List,
ListPrefixes(Option<RemotePath>),
Upload(RemotePath),
Download(RemotePath),
@@ -74,6 +75,12 @@ impl UnreliableWrapper {
#[async_trait::async_trait]
impl RemoteStorage for UnreliableWrapper {
/// Lists all items the storage has right now.
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
self.attempt(RemoteOp::List)?;
self.inner.list().await
}
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,

View File

@@ -1,275 +0,0 @@
use std::collections::HashSet;
use std::env;
use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::{test_context, AsyncTestContext};
use tokio::task::JoinSet;
use tracing::{debug, error, info};
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// since current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
};
let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix =
RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
enum MaybeEnabledS3 {
Enabled(S3WithTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithTestBlobs),
}
struct S3WithTestBlobs {
client_with_excessive_pagination: Arc<GenericRemoteStorage>,
base_prefix_str: &'static str,
remote_prefixes: HashSet<RemotePath>,
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
async fn setup() -> Self {
utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
info!(
"`{}` env variable is not set, skipping the test",
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
);
return Self::Disabled;
}
let max_keys_in_list_response = 10;
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
.context("S3 client creation")
.expect("S3 client creation failed");
let base_prefix_str = "test/";
match upload_s3_data(
&client_with_excessive_pagination,
base_prefix_str,
upload_tasks_count,
)
.await
{
ControlFlow::Continue(uploads) => {
info!("Remote objects created successfully");
Self::Enabled(S3WithTestBlobs {
client_with_excessive_pagination,
base_prefix_str,
remote_prefixes: uploads.prefixes,
remote_blobs: uploads.blobs,
})
}
ControlFlow::Break(uploads) => Self::UploadsFailed(
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
S3WithTestBlobs {
client_with_excessive_pagination,
base_prefix_str,
remote_prefixes: uploads.prefixes,
remote_blobs: uploads.blobs,
},
),
}
}
async fn teardown(self) {
match self {
Self::Disabled => {}
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
}
}
}
}
fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
let random_prefix_part = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
.as_millis();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(max_keys_per_list_response),
}),
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
))
}
struct Uploads {
prefixes: HashSet<RemotePath>,
blobs: HashSet<RemotePath>,
}
async fn upload_s3_data(
client: &Arc<GenericRemoteStorage>,
base_prefix_str: &'static str,
upload_tasks_count: usize,
) -> ControlFlow<Uploads, Uploads> {
info!("Creating {upload_tasks_count} S3 files");
let mut upload_tasks = JoinSet::new();
for i in 1..upload_tasks_count + 1 {
let task_client = Arc::clone(client);
upload_tasks.spawn(async move {
let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
let blob_prefix = RemotePath::new(&prefix)
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
debug!("Creating remote item {i} at path {blob_path:?}");
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(
Box::new(std::io::Cursor::new(data)),
data_len,
&blob_path,
None,
)
.await?;
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
});
}
let mut upload_tasks_failed = false;
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
while let Some(task_run_result) = upload_tasks.join_next().await {
match task_run_result
.context("task join failed")
.and_then(|task_result| task_result.context("upload task failed"))
{
Ok((upload_prefix, upload_path)) => {
uploaded_prefixes.insert(upload_prefix);
uploaded_blobs.insert(upload_path);
}
Err(e) => {
error!("Upload task failed: {e:?}");
upload_tasks_failed = true;
}
}
}
let uploads = Uploads {
prefixes: uploaded_prefixes,
blobs: uploaded_blobs,
};
if upload_tasks_failed {
ControlFlow::Break(uploads)
} else {
ControlFlow::Continue(uploads)
}
}
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
info!(
"Removing {} objects from the remote storage during cleanup",
objects_to_delete.len()
);
let mut delete_tasks = JoinSet::new();
for object_to_delete in objects_to_delete {
let task_client = Arc::clone(client);
delete_tasks.spawn(async move {
debug!("Deleting remote item at path {object_to_delete:?}");
task_client
.delete(&object_to_delete)
.await
.with_context(|| format!("{object_to_delete:?} removal"))
});
}
while let Some(task_run_result) = delete_tasks.join_next().await {
match task_run_result {
Ok(task_result) => match task_result {
Ok(()) => {}
Err(e) => error!("Delete task failed: {e:?}"),
},
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
}
}
}

View File

@@ -19,7 +19,6 @@ jsonwebtoken.workspace = true
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
regex.workspace = true
routerify.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -20,9 +20,6 @@ pub enum ApiError {
#[error("Conflict: {0}")]
Conflict(String),
#[error("Precondition failed: {0}")]
PreconditionFailed(&'static str),
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -47,10 +44,6 @@ impl ApiError {
ApiError::Conflict(_) => {
HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT)
}
ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status(
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -23,7 +23,7 @@ pub enum IdError {
struct Id([u8; 16]);
impl Id {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id {
let mut arr = [0u8; 16];
buf.copy_to_slice(&mut arr);
Id::from(arr)
@@ -112,7 +112,7 @@ impl fmt::Debug for Id {
macro_rules! id_newtype {
($t:ident) => {
impl $t {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t {
$t(Id::get_from_buf(buf))
}

View File

@@ -51,9 +51,6 @@ pub mod history_buffer;
pub mod measured_stream;
pub mod serde_percent;
pub mod serde_regex;
/// use with fail::cfg("$name", "return(2000)")
#[macro_export]
macro_rules! failpoint_sleep_millis_async {

View File

@@ -1,91 +0,0 @@
//! A serde::Deserialize type for percentages.
//!
//! See [`Percent`] for details.
use serde::{Deserialize, Serialize};
/// If the value is not an integer between 0 and 100,
/// deserialization fails with a descriptive error.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
impl Percent {
pub const fn new(pct: u8) -> Option<Self> {
if pct <= 100 {
Some(Percent(pct))
} else {
None
}
}
pub fn get(&self) -> u8 {
self.0
}
}
fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u8, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let v: u8 = serde::de::Deserialize::deserialize(deserializer)?;
if v > 100 {
return Err(serde::de::Error::custom(
"must be an integer between 0 and 100",
));
}
Ok(v)
}
#[cfg(test)]
mod tests {
use super::Percent;
#[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)]
struct Foo {
bar: Percent,
}
#[test]
fn basics() {
let input = r#"{ "bar": 50 }"#;
let foo: Foo = serde_json::from_str(input).unwrap();
assert_eq!(foo.bar.get(), 50);
}
#[test]
fn null_handling() {
let input = r#"{ "bar": null }"#;
let res: Result<Foo, _> = serde_json::from_str(input);
assert!(res.is_err());
}
#[test]
fn zero() {
let input = r#"{ "bar": 0 }"#;
let foo: Foo = serde_json::from_str(input).unwrap();
assert_eq!(foo.bar.get(), 0);
}
#[test]
fn out_of_range_above() {
let input = r#"{ "bar": 101 }"#;
let res: Result<Foo, _> = serde_json::from_str(input);
assert!(res.is_err());
}
#[test]
fn out_of_range_below() {
let input = r#"{ "bar": -1 }"#;
let res: Result<Foo, _> = serde_json::from_str(input);
assert!(res.is_err());
}
#[test]
fn float() {
let input = r#"{ "bar": 50.5 }"#;
let res: Result<Foo, _> = serde_json::from_str(input);
assert!(res.is_err());
}
#[test]
fn string() {
let input = r#"{ "bar": "50 %" }"#;
let res: Result<Foo, _> = serde_json::from_str(input);
assert!(res.is_err());
}
}

View File

@@ -1,60 +0,0 @@
//! A `serde::{Deserialize,Serialize}` type for regexes.
use std::ops::Deref;
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct Regex(
#[serde(
deserialize_with = "deserialize_regex",
serialize_with = "serialize_regex"
)]
regex::Regex,
);
fn deserialize_regex<'de, D>(deserializer: D) -> Result<regex::Regex, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?;
Ok(re)
}
fn serialize_regex<S>(re: &regex::Regex, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
serializer.collect_str(re.as_str())
}
impl Deref for Regex {
type Target = regex::Regex;
fn deref(&self) -> &regex::Regex {
&self.0
}
}
impl PartialEq for Regex {
fn eq(&self, other: &Regex) -> bool {
// comparing the automatons would be quite complicated
self.as_str() == other.as_str()
}
}
impl Eq for Regex {}
#[cfg(test)]
mod tests {
#[test]
fn roundtrip() {
let input = r#""foo.*bar""#;
let re: super::Regex = serde_json::from_str(input).unwrap();
assert!(re.is_match("foo123bar"));
assert!(!re.is_match("foo"));
let output = serde_json::to_string(&re).unwrap();
assert_eq!(output, input);
}
}

View File

@@ -1,7 +1,25 @@
use signal_hook::flag;
use signal_hook::iterator::Signals;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
pub use signal_hook::consts::{signal::*, TERM_SIGNALS};
pub fn install_shutdown_handlers() -> anyhow::Result<ShutdownSignals> {
let term_now = Arc::new(AtomicBool::new(false));
for sig in TERM_SIGNALS {
// When terminated by a second term signal, exit with exit code 1.
// This will do nothing the first time (because term_now is false).
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
// But this will "arm" the above for the second time, by setting it to true.
// The order of registering these is important, if you put this one first, it will
// first arm and then terminate all in the first round.
flag::register(*sig, Arc::clone(&term_now))?;
}
Ok(ShutdownSignals)
}
pub enum Signal {
Quit,
Interrupt,
@@ -21,7 +39,10 @@ impl Signal {
pub struct ShutdownSignals;
impl ShutdownSignals {
pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> {
pub fn handle(
self,
mut handler: impl FnMut(Signal) -> anyhow::Result<()>,
) -> anyhow::Result<()> {
for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() {
let signal = match raw_signal {
SIGINT => Signal::Interrupt,

View File

@@ -48,7 +48,6 @@ serde_json = { workspace = true, features = ["raw_value"] }
serde_with.workspace = true
signal-hook.workspace = true
svg_fmt.workspace = true
sync_wrapper.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }

View File

@@ -0,0 +1,541 @@
use anyhow::Result;
use pageserver::repository::Key;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::{
collections::{BTreeMap, BTreeSet, HashMap},
fmt::Write,
ops::Range,
};
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
struct CoordinateMap<T: Ord + Copy> {
map: BTreeMap<T, usize>,
stretch: f32
}
impl<T: Ord + Copy> CoordinateMap<T> {
fn new(coords: Vec<T>, stretch: f32) -> Self {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
Self { map, stretch }
}
fn map(&self, val: T) -> f32 {
*self.map.get(&val).unwrap() as f32 * self.stretch
}
fn max(&self) -> f32 {
self.map.len() as f32 * self.stretch
}
}
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
if lsns.len() == 1 {
lsns.push(lsns[0]);
}
let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
(keys, lsns)
}
#[derive(Serialize, Deserialize, PartialEq)]
enum LayerTraceOp {
#[serde(rename = "evict")]
Evict,
#[serde(rename = "flush")]
Flush,
#[serde(rename = "compact_create")]
CompactCreate,
#[serde(rename = "compact_delete")]
CompactDelete,
#[serde(rename = "image_create")]
ImageCreate,
#[serde(rename = "gc_delete")]
GcDelete,
#[serde(rename = "gc_start")]
GcStart,
}
impl std::fmt::Display for LayerTraceOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
let op_str = match self {
LayerTraceOp::Evict => "evict",
LayerTraceOp::Flush => "flush",
LayerTraceOp::CompactCreate => "compact_create",
LayerTraceOp::CompactDelete => "compact_delete",
LayerTraceOp::ImageCreate => "image_create",
LayerTraceOp::GcDelete => "gc_delete",
LayerTraceOp::GcStart => "gc_start",
};
f.write_str(op_str)
}
}
#[serde_with::serde_as]
#[derive(Serialize, Deserialize)]
struct LayerTraceLine {
time: u64,
op: LayerTraceOp,
#[serde(default)]
filename: String,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
cutoff: Option<Lsn>,
}
struct LayerTraceFile {
filename: String,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
}
impl LayerTraceFile {
fn is_image(&self) -> bool {
self.lsn_range.start == self.lsn_range.end
}
}
struct LayerTraceEvent {
time_rel: u64,
op: LayerTraceOp,
filename: String,
}
struct GcEvent {
time_rel: u64,
cutoff: Lsn,
}
fn main() -> Result<()> {
// Parse trace lines from stdin
let stdin = io::stdin();
let mut files: HashMap<String, LayerTraceFile> = HashMap::new();
let mut layer_events: Vec<LayerTraceEvent> = Vec::new();
let mut gc_events: Vec<GcEvent> = Vec::new();
let mut first_time: Option<u64> = None;
for line in stdin.lock().lines() {
let line = line.unwrap();
let parsed_line: LayerTraceLine = serde_json::from_str(&line)?;
let time_rel = if let Some(first_time) = first_time {
parsed_line.time - first_time
} else {
first_time = Some(parsed_line.time);
0
};
if parsed_line.op == LayerTraceOp::GcStart {
gc_events.push(GcEvent {
time_rel,
cutoff: parsed_line.cutoff.unwrap(),
});
} else {
layer_events.push(LayerTraceEvent {
time_rel,
filename: parsed_line.filename.clone(),
op: parsed_line.op,
});
if !files.contains_key(&parsed_line.filename) {
let (key_range, lsn_range) = parse_filename(&parsed_line.filename);
files.insert(parsed_line.filename.clone(), LayerTraceFile {
filename: parsed_line.filename.clone(),
key_range,
lsn_range,
});
};
}
}
let mut last_time_rel = layer_events.last().unwrap().time_rel;
if let Some(last_gc) = gc_events.last() {
last_time_rel = std::cmp::min(last_gc.time_rel, last_time_rel);
}
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for f in files.values() {
keys.push(f.key_range.start);
keys.push(f.key_range.end);
lsns.push(f.lsn_range.start);
lsns.push(f.lsn_range.end);
}
for gc_event in &gc_events {
lsns.push(gc_event.cutoff);
}
// Analyze
let key_map = CoordinateMap::new(keys, 2.0);
// Stretch out vertically for better visibility
let lsn_map = CoordinateMap::new(lsns, 3.0);
// Initialize stats
let mut num_deltas = 0;
let mut num_images = 0;
let mut svg = String::new();
// Draw
writeln!(svg,
"{}",
BeginSvg {
w: key_map.max(),
h: lsn_map.max(),
}
)?;
let lsn_max = lsn_map.max();
// Sort the files by LSN, but so that image layers go after all delta layers
// The SVG is painted in the order the elements appear, and we want to draw
// image layers on top of the delta layers if they overlap
let mut files_sorted: Vec<LayerTraceFile> = files.into_values().collect();
files_sorted.sort_by(|a, b| {
if a.is_image() && !b.is_image() {
Ordering::Greater
} else if !a.is_image() && b.is_image() {
Ordering::Less
} else {
a.lsn_range.end.cmp(&b.lsn_range.end)
}
});
for f in files_sorted {
let key_start = key_map.map(f.key_range.start);
let key_end = key_map.map(f.key_range.end);
let key_diff = key_end - key_start;
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = lsn_map.map(f.lsn_range.start);
let lsn_end = lsn_map.map(f.lsn_range.end);
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
let mut style = Style::default();
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
let y_start = (lsn_max - lsn_start) as f32;
let y_end = (lsn_max - lsn_end) as f32;
let x_margin = 0.25;
let y_margin = 0.5;
match f.lsn_range.start.cmp(&f.lsn_range.end) {
Ordering::Less => {
num_deltas += 1;
write!(svg,
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
f.filename,
key_start as f32 + x_margin,
y_end + y_margin,
key_diff as f32 - x_margin * 2.0,
y_start - y_end - y_margin * 2.0,
1.0, // border_radius,
style.to_string(),
)?;
write!(svg, "<title>{}<br>{} - {}</title>", f.filename, lsn_end, y_end)?;
writeln!(svg, "</rect>")?;
}
Ordering::Equal => {
num_images += 1;
//lsn_diff = 0.3;
//lsn_offset = -lsn_diff / 2.0;
//margin = 0.05;
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
write!(svg,
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
f.filename,
key_start as f32 + x_margin,
y_end,
key_end as f32 - x_margin,
y_end,
style.to_string(),
)?;
write!(svg, "<title>{}<br>{} - {}</title>", f.filename, lsn_end, y_end)?;
writeln!(svg, "</line>")?;
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
}
for (idx, gc) in gc_events.iter().enumerate() {
let cutoff_lsn = lsn_map.map(gc.cutoff);
let mut style = Style::default();
style.fill = Fill::None;
style.stroke = Stroke::Color(rgb(0xff, 0, 0), 0.5);
let y = lsn_max - cutoff_lsn;
writeln!(svg,
r#" <line id="gc_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}" />"#,
idx,
0,
y,
key_map.max(),
y,
style.to_string(),
)?;
}
writeln!(svg, "{}", EndSvg)?;
let mut layer_events_str = String::new();
let mut first = true;
for e in layer_events {
if !first {
writeln!(layer_events_str, ",")?;
}
write!(layer_events_str,
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
e.time_rel, e.filename, e.op)?;
first = false;
}
writeln!(layer_events_str)?;
let mut gc_events_str = String::new();
let mut first = true;
for e in gc_events {
if !first {
writeln!(gc_events_str, ",")?;
}
write!(gc_events_str,
r#" {{"time_rel": {}, "cutoff_lsn": "{}"}}"#,
e.time_rel, e.cutoff)?;
first = false;
}
writeln!(gc_events_str)?;
println!(r#"<!DOCTYPE html>
<html>
<head>
<style>
/* Keep the slider pinned at top */
.topbar {{
display: block;
overflow: hidden;
background-color: lightgrey;
position: fixed;
top: 0;
width: 100%;
/* width: 500px; */
}}
.slidercontainer {{
float: left;
width: 50%;
margin-right: 200px;
}}
.slider {{
float: left;
width: 100%;
}}
.legend {{
width: 200px;
float: right;
}}
/* Main content */
.main {{
margin-top: 50px; /* Add a top margin to avoid content overlay */
}}
</style>
</head>
<body onload="init()">
<script type="text/javascript">
var layer_events = [{layer_events_str}]
var gc_events = [{gc_events_str}]
let ticker;
function init() {{
moveSlider({last_time_rel})
moveSlider(0)
moveSlider(last_slider_pos)
}}
function startAnimation() {{
ticker = setInterval(animateStep, 100);
}}
function stopAnimation() {{
clearInterval(ticker);
}}
function animateStep() {{
if (last_layer_event < layer_events.length - 1) {{
var slider = document.getElementById("time-slider");
let prevPos = slider.value
let nextEvent = last_layer_event
while (nextEvent < layer_events.length - 1) {{
if (layer_events[nextEvent].time_rel > prevPos) {{
break;
}}
nextEvent += 1;
}}
let nextPos = layer_events[nextEvent].time_rel
slider.value = nextPos
moveSlider(nextPos)
}}
}}
function redoLayerEvent(n, dir) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "evict":
break;
case "flush":
layer.style.visibility = "visible";
break;
case "compact_create":
layer.style.visibility = "visible";
break;
case "image_create":
layer.style.visibility = "visible";
break;
case "compact_delete":
layer.style.visibility = "hidden";
break;
case "gc_delete":
layer.style.visibility = "hidden";
break;
case "gc_start":
layer.style.visibility = "hidden";
break;
}}
}}
function undoLayerEvent(n) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "evict":
break;
case "flush":
layer.style.visibility = "hidden";
break;
case "compact_create":
layer.style.visibility = "hidden";
break;
case "image_create":
layer.style.visibility = "hidden";
break;
case "compact_delete":
layer.style.visibility = "visible";
break;
case "gc_delete":
layer.style.visibility = "visible";
break;
}}
}}
function redoGcEvent(n) {{
var prev_gc_bar = document.getElementById("gc_" + (n - 1));
var new_gc_bar = document.getElementById("gc_" + n);
prev_gc_bar.style.visibility = "hidden"
new_gc_bar.style.visibility = "visible"
}}
function undoGcEvent(n) {{
var prev_gc_bar = document.getElementById("gc_" + n);
var new_gc_bar = document.getElementById("gc_" + (n - 1));
prev_gc_bar.style.visibility = "hidden"
new_gc_bar.style.visibility = "visible"
}}
var last_slider_pos = 0
var last_layer_event = 0
var last_gc_event = 0
var moveSlider = function(new_pos) {{
if (new_pos > last_slider_pos) {{
while (last_layer_event < layer_events.length - 1) {{
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
break;
}}
last_layer_event += 1;
redoLayerEvent(last_layer_event)
}}
while (last_gc_event < gc_events.length - 1) {{
if (gc_events[last_gc_event + 1].time_rel > new_pos) {{
break;
}}
last_gc_event += 1;
redoGcEvent(last_gc_event)
}}
}}
if (new_pos < last_slider_pos) {{
while (last_layer_event > 0) {{
if (layer_events[last_layer_event - 1].time_rel < new_pos) {{
break;
}}
undoLayerEvent(last_layer_event)
last_layer_event -= 1;
}}
while (last_gc_event > 0) {{
if (gc_events[last_gc_event - 1].time_rel < new_pos) {{
break;
}}
undoGcEvent(last_gc_event)
last_gc_event -= 1;
}}
}}
last_slider_pos = new_pos;
document.getElementById("debug_pos").textContent=new_pos;
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
document.getElementById("debug_gc_event").textContent=last_gc_event + " " + gc_events[last_gc_event].time_rel;
}}
</script>
<div class="topbar">
<div class="slidercontainer">
<label for="time-slider">TIME</label>:
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
pos: <span id="debug_pos"></span><br>
event: <span id="debug_layer_event"></span><br>
gc: <span id="debug_gc_event"></span><br>
</div>
<button onclick="startAnimation()">Play</button>
<button onclick="stopAnimation()">Stop</button>
<svg class="legend">
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
</svg>
</div>
<div class="main">
{svg}
</div>
</body>
</html>
"#);
eprintln!("num_images: {}", num_images);
eprintln!("num_deltas: {}", num_deltas);
Ok(())
}

View File

@@ -8,7 +8,6 @@ use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use remote_storage::GenericRemoteStorage;
use tracing::*;
@@ -25,9 +24,11 @@ use pageserver::{
virtual_file,
};
use postgres_backend::AuthType;
use utils::signals::ShutdownSignals;
use utils::{
auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
auth::JwtAuth,
logging, project_git_version,
sentry_init::init_sentry,
signals::{self, Signal},
tcp_listener,
};
@@ -262,6 +263,9 @@ fn start_pageserver(
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
// Install signal handlers
let signals = signals::install_shutdown_handlers()?;
// Launch broker client
WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
@@ -315,34 +319,14 @@ fn start_pageserver(
// Scan the local 'tenants/' directory and start loading the tenants
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
// is still accessible even if background task is not configured as long as remote storage has
// been configured.
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
if let Some(remote_storage) = &remote_storage {
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
)?;
}
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let router = http::make_router(
conf,
launch_ts,
http_auth,
remote_storage,
disk_usage_eviction_state,
)?
.build()
.map_err(|err| anyhow!(err))?;
let router = http::make_router(conf, launch_ts, http_auth, remote_storage)?
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
.serve(service)
@@ -425,7 +409,7 @@ fn start_pageserver(
}
// All started up! Now just sit and wait for shutdown signal.
ShutdownSignals::handle(|signal| match signal {
signals.handle(|signal| match signal {
Signal::Quit => {
info!(
"Got {}. Terminating in immediate shutdown mode",

View File

@@ -27,7 +27,6 @@ use utils::{
logging::LogFormat,
};
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
@@ -93,8 +92,6 @@ pub mod defaults {
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -107,8 +104,6 @@ pub mod defaults {
#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
#min_resident_size_override = .. # in bytes
# [remote_storage]
"###
@@ -170,10 +165,6 @@ pub struct PageServerConf {
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
/// See the comment in `eviction_task` for details.
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint.
pub metric_collection_interval: Duration,
@@ -185,8 +176,6 @@ pub struct PageServerConf {
// See the corresponding metric's help string.
pub evictions_low_residence_duration_metric_threshold: Duration,
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
pub test_remote_failures: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -250,7 +239,7 @@ struct PageServerConfigBuilder {
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
metric_collection_interval: BuilderValue<Duration>,
cached_metric_collection_interval: BuilderValue<Duration>,
@@ -259,8 +248,6 @@ struct PageServerConfigBuilder {
evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
test_remote_failures: BuilderValue<u64>,
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
@@ -299,9 +286,7 @@ impl Default for PageServerConfigBuilder {
.expect("cannot parse default keepalive interval")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(
ConfigurableSemaphore::DEFAULT_INITIAL,
),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
metric_collection_interval: Set(humantime::parse_duration(
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
@@ -321,8 +306,6 @@ impl Default for PageServerConfigBuilder {
)
.expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),
disk_usage_based_eviction: Set(None),
test_remote_failures: Set(0),
ondemand_download_behavior_treat_error_as_warn: Set(false),
@@ -406,7 +389,7 @@ impl PageServerConfigBuilder {
self.log_format = BuilderValue::Set(log_format)
}
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
@@ -442,10 +425,6 @@ impl PageServerConfigBuilder {
self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
}
pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
self.disk_usage_based_eviction = BuilderValue::Set(value);
}
pub fn ondemand_download_behavior_treat_error_as_warn(
&mut self,
ondemand_download_behavior_treat_error_as_warn: bool,
@@ -455,11 +434,6 @@ impl PageServerConfigBuilder {
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?;
Ok(PageServerConf {
listen_pg_addr: self
.listen_pg_addr
@@ -507,12 +481,11 @@ impl PageServerConfigBuilder {
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
concurrent_tenant_size_logical_size_queries: self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
metric_collection_interval: self
.metric_collection_interval
.ok_or(anyhow!("missing metric_collection_interval"))?,
@@ -530,9 +503,6 @@ impl PageServerConfigBuilder {
.ok_or(anyhow!(
"missing evictions_low_residence_duration_metric_threshold"
))?,
disk_usage_based_eviction: self
.disk_usage_based_eviction
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
test_remote_failures: self
.test_remote_failures
.ok_or(anyhow!("missing test_remote_failuers"))?,
@@ -710,7 +680,8 @@ impl PageServerConf {
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
@@ -722,12 +693,6 @@ impl PageServerConf {
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
"evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
"disk_usage_based_eviction" => {
tracing::info!("disk_usage_based_eviction: {:#?}", &item);
builder.disk_usage_based_eviction(
toml_edit::de::from_item(item.clone())
.context("parse disk_usage_based_eviction")?)
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
@@ -832,13 +797,6 @@ impl PageServerConf {
);
}
if let Some(item) = item.get("min_resident_size_override") {
t_conf.min_resident_size_override = Some(
toml_edit::de::from_item(item.clone())
.context("parse min_resident_size_override")?,
);
}
Ok(t_conf)
}
@@ -871,8 +829,6 @@ impl PageServerConf {
broker_keepalive_interval: Duration::from_secs(5000),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
@@ -881,7 +837,6 @@ impl PageServerConf {
defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
)
.unwrap(),
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
}
@@ -966,11 +921,6 @@ impl ConfigurableSemaphore {
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
}
}
/// Returns the configured amount of permits.
pub fn initial_permits(&self) -> NonZeroUsize {
self.initial_permits
}
}
impl Default for ConfigurableSemaphore {
@@ -1075,8 +1025,6 @@ log_format = 'json'
)?,
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
)?,
@@ -1090,7 +1038,6 @@ log_format = 'json'
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
)?,
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
},
@@ -1138,14 +1085,11 @@ log_format = 'json'
broker_keepalive_interval: Duration::from_secs(5),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
synthetic_size_calculation_interval: Duration::from_secs(333),
evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
},
@@ -1272,7 +1216,6 @@ broker_endpoint = '{broker_endpoint}'
prefix_in_bucket: Some(prefix_in_bucket.clone()),
endpoint: Some(endpoint.clone()),
concurrency_limit: s3_concurrency_limit,
max_keys_per_list_response: None,
}),
},
"Remote storage config should correctly parse the S3 config"

View File

@@ -1,728 +0,0 @@
//! This module implements the pageserver-global disk-usage-based layer eviction task.
//!
//! # Mechanics
//!
//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
//! loop that evicts layers in response to a shortage of available bytes
//! in the $repo/tenants directory's filesystem.
//!
//! The loop runs periodically at a configurable `period`.
//!
//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
//! It compares the returned usage data against two different types of thresholds.
//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
//!
//! # Eviction Policy
//!
//! There are two thresholds:
//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space.
//! If the actual usage is higher, the threshold is exceeded.
//! `min_avail_bytes` is the absolute available space in bytes.
//! If the actual usage is lower, the threshold is exceeded.
//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
//! The reservation is to keep the most recently accessed X bytes per tenant resident.
//! If we cannot relieve pressure by evicting layers outside of the reservation, we
//! start evicting layers that are part of the reservation, LRU first.
//!
//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
//! throughout the code, but, no actual variable carries that name.
//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
//! during page reconstruction.
//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
// Implementation notes:
// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
// reading these fields. We use the Debug impl for semi-structured logging, though.
use std::{
collections::HashMap,
path::Path,
sync::Arc,
time::{Duration, SystemTime},
};
use anyhow::Context;
use remote_storage::GenericRemoteStorage;
use serde::{Deserialize, Serialize};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, instrument, warn, Instrument};
use utils::serde_percent::Percent;
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{self, storage_layer::PersistentLayer, Timeline},
};
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
pub max_usage_pct: Percent,
pub min_avail_bytes: u64,
#[serde(with = "humantime_serde")]
pub period: Duration,
#[cfg(feature = "testing")]
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
}
#[derive(Default)]
pub struct State {
/// Exclude http requests and background task from running at the same time.
mutex: tokio::sync::Mutex<()>,
}
pub fn launch_disk_usage_global_eviction_task(
conf: &'static PageServerConf,
storage: GenericRemoteStorage,
state: Arc<State>,
) -> anyhow::Result<()> {
let Some(task_config) = &conf.disk_usage_based_eviction else {
info!("disk usage based eviction task not configured");
return Ok(());
};
info!("launching disk usage based eviction task");
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,
"disk usage based eviction",
false,
async move {
disk_usage_eviction_task(
&state,
task_config,
storage,
&conf.tenants_path(),
task_mgr::shutdown_token(),
)
.await;
info!("disk usage based eviction task finishing");
Ok(())
},
);
Ok(())
}
#[instrument(skip_all)]
async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: GenericRemoteStorage,
tenants_dir: &Path,
cancel: CancellationToken,
) {
use crate::tenant::tasks::random_init_delay;
{
if random_init_delay(task_config.period, &cancel)
.await
.is_err()
{
info!("shutting down");
return;
}
}
let mut iteration_no = 0;
loop {
iteration_no += 1;
let start = Instant::now();
async {
let res = disk_usage_eviction_task_iteration(
state,
task_config,
&storage,
tenants_dir,
&cancel,
)
.await;
match res {
Ok(()) => {}
Err(e) => {
// these stat failures are expected to be very rare
warn!("iteration failed, unexpected error: {e:#}");
}
}
}
.instrument(tracing::info_span!("iteration", iteration_no))
.await;
let sleep_until = start + task_config.period;
tokio::select! {
_ = tokio::time::sleep_until(sleep_until) => {},
_ = cancel.cancelled() => {
info!("shutting down");
break
}
}
}
}
pub trait Usage: Clone + Copy + std::fmt::Debug {
fn has_pressure(&self) -> bool;
fn add_available_bytes(&mut self, bytes: u64);
}
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
match outcome {
IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
// nothing to do, select statement below will handle things
}
IterationOutcome::Finished(outcome) => {
// Verify with statvfs whether we made any real progress
let after = filesystem_level_usage::get(tenants_dir, task_config)
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
.context("get filesystem-level disk usage after evictions")?;
debug!(?after, "disk usage");
if after.has_pressure() {
// Don't bother doing an out-of-order iteration here now.
// In practice, the task period is set to a value in the tens-of-seconds range,
// which will cause another iteration to happen soon enough.
// TODO: deltas between the three different usages would be helpful,
// consider MiB, GiB, TiB
warn!(?outcome, ?after, "disk usage still high");
} else {
info!(?outcome, ?after, "disk usage pressure relieved");
}
}
}
}
Err(e) => {
error!("disk_usage_eviction_iteration failed: {:#}", e);
}
}
Ok(())
}
#[derive(Debug, Serialize)]
#[allow(clippy::large_enum_variant)]
pub enum IterationOutcome<U> {
NoPressure,
Cancelled,
Finished(IterationOutcomeFinished<U>),
}
#[allow(dead_code)]
#[derive(Debug, Serialize)]
pub struct IterationOutcomeFinished<U> {
/// The actual usage observed before we started the iteration.
before: U,
/// The expected value for `after`, according to internal accounting, after phase 1.
planned: PlannedUsage<U>,
/// The outcome of phase 2, where we actually do the evictions.
///
/// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
/// be the same as `planned`.
assumed: AssumedUsage<U>,
}
#[derive(Debug, Serialize)]
#[allow(dead_code)]
struct AssumedUsage<U> {
/// The expected value for `after`, after phase 2.
projected_after: U,
/// The layers we failed to evict during phase 2.
failed: LayerCount,
}
#[allow(dead_code)]
#[derive(Debug, Serialize)]
struct PlannedUsage<U> {
respecting_tenant_min_resident_size: U,
fallback_to_global_lru: Option<U>,
}
#[allow(dead_code)]
#[derive(Debug, Default, Serialize)]
struct LayerCount {
file_sizes: u64,
count: usize,
}
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
storage: &GenericRemoteStorage,
usage_pre: U,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
let _g = state
.mutex
.try_lock()
.map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
debug!(?usage_pre, "disk usage");
if !usage_pre.has_pressure() {
return Ok(IterationOutcome::NoPressure);
}
warn!(
?usage_pre,
"running disk usage based eviction due to pressure"
);
let candidates = match collect_eviction_candidates(cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => partitioned,
};
// Debug-log the list of candidates
let now = SystemTime::now();
for (i, (partition, candidate)) in candidates.iter().enumerate() {
debug!(
"cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
i + 1,
candidates.len(),
candidate.layer.file_size(),
now.duration_since(candidate.last_activity_ts)
.unwrap()
.as_micros(),
partition,
candidate.layer.get_tenant_id(),
candidate.layer.get_timeline_id(),
candidate.layer.filename().file_name(),
);
}
// phase1: select victims to relieve pressure
//
// Walk through the list of candidates, until we have accumulated enough layers to get
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
// how much disk space would be used after evicting all the layers up to the current
// point in the list. The layers are collected in 'batched', grouped per timeline.
//
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
"took enough candidates for pressure to be relieved"
);
break;
}
if partition == MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.file_size());
batched
.entry(TimelineKey(candidate.timeline))
.or_default()
.push(candidate.layer);
}
let usage_planned = match warned {
Some(respecting_tenant_min_resident_size) => PlannedUsage {
respecting_tenant_min_resident_size,
fallback_to_global_lru: Some(usage_planned),
},
None => PlannedUsage {
respecting_tenant_min_resident_size: usage_planned,
fallback_to_global_lru: None,
},
};
debug!(?usage_planned, "usage planned");
// phase2: evict victims batched by timeline
// After the loop, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
let batch_size = batch.len();
debug!(%timeline_id, "evicting batch for timeline");
async {
let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
match results {
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
match result {
Some(Ok(true)) => {
usage_assumed.add_available_bytes(layer.file_size());
}
Some(Ok(false)) => {
// this is:
// - Replacement::{NotFound, Unexpected}
// - it cannot be is_remote_layer, filtered already
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
return;
}
Some(Err(e)) => {
// we really shouldn't be getting this, precondition failure
error!("failed to evict layer: {:#}", e);
}
}
}
}
}
}
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
.await;
if cancel.is_cancelled() {
return Ok(IterationOutcome::Cancelled);
}
}
Ok(IterationOutcome::Finished(IterationOutcomeFinished {
before: usage_pre,
planned: usage_planned,
assumed: AssumedUsage {
projected_after: usage_assumed,
failed: evictions_failed,
},
}))
}
#[derive(Clone)]
struct EvictionCandidate {
timeline: Arc<Timeline>,
layer: Arc<dyn PersistentLayer>,
last_activity_ts: SystemTime,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum MinResidentSizePartition {
Above,
Below,
}
enum EvictionCandidates {
Cancelled,
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
}
/// Gather the eviction candidates.
///
/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
/// order. A caller that evicts in that order, until pressure is relieved, implements
/// the eviction policy outlined in the module comment.
///
/// # Example
///
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
/// Each layer has size 100, and both tenant's min_resident_size is 150.
/// The eviction order would be
///
/// ```text
/// partition last_activity_ts tenant/layer
/// Above 18:30 A/c
/// Above 19:00 A/b
/// Above 18:29 B/c
/// Above 19:05 B/b
/// Above 20:00 B/a
/// Above 20:03 A/a
/// Below 20:30 A/d
/// Below 20:40 B/d
/// Below 20:45 B/e
/// Below 20:58 A/e
/// ```
///
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
///
/// But, if we need to evict 900 bytes to relieve pressure, we'd evict
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
/// after exhauting the `Above` partition.
/// So, we did not respect each tenant's min_resident_size.
async fn collect_eviction_candidates(
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
.context("get list of tenants")?;
let mut candidates = Vec::new();
for (tenant_id, _state) in &tenants {
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
Ok(tenant) => tenant,
Err(e) => {
// this can happen if tenant has lifecycle transition after we fetched it
debug!("failed to get tenant: {e:#}");
continue;
}
};
// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,
// for example because we're shutting down, then `max_layer_size` can be too small.
// That's OK. This code only runs under a disk pressure situation, and being
// a little unfair to tenants during shutdown in such a situation is tolerable.
let mut tenant_candidates = Vec::new();
let mut max_layer_size = 0;
for tl in tenant.list_timelines() {
if !tl.is_active() {
continue;
}
let info = tl.get_local_layers_for_disk_usage_eviction();
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers
.into_iter()
.map(|layer_infos| (tl.clone(), layer_infos)),
);
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
}
// `min_resident_size` defaults to maximum layer file size of the tenant.
// This ensures that each tenant can have at least one layer resident at a given time,
// ensuring forward progress for a single Timeline::get in that tenant.
// It's a questionable heuristic since, usually, there are many Timeline::get
// requests going on for a tenant, and, at least in Neon prod, the median
// layer file size is much smaller than the compaction target size.
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
// That's what's typically used by the various background loops.
//
// The default can be overriden with a fixed value in the tenant conf.
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
overriden_size=s,
"using overridden min resident size for tenant"
);
s
} else {
debug!(
tenant_id=%tenant.tenant_id(),
max_layer_size,
"using max layer size as min_resident_size for tenant",
);
max_layer_size
};
// Sort layers most-recently-used first, then partition by
// cumsum above/below min_resident_size.
tenant_candidates
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
for (timeline, layer_info) in tenant_candidates.into_iter() {
let file_size = layer_info.file_size();
let candidate = EvictionCandidate {
timeline,
last_activity_ts: layer_info.last_activity_ts,
layer: layer_info.layer,
};
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
} else {
MinResidentSizePartition::Below
};
candidates.push((partition, candidate));
cumsum += i128::from(file_size);
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
Ok(EvictionCandidates::Finished(candidates))
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
fn eq(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for TimelineKey {}
impl std::hash::Hash for TimelineKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
Arc::as_ptr(&self.0).hash(state);
}
}
impl std::ops::Deref for TimelineKey {
type Target = Timeline;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
mod filesystem_level_usage {
use std::path::Path;
use anyhow::Context;
use crate::statvfs::Statvfs;
use super::DiskUsageEvictionTaskConfig;
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
pub struct Usage<'a> {
config: &'a DiskUsageEvictionTaskConfig,
/// Filesystem capacity
total_bytes: u64,
/// Free filesystem space
avail_bytes: u64,
}
impl super::Usage for Usage<'_> {
fn has_pressure(&self) -> bool {
let usage_pct =
(100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
let pressures = [
(
"min_avail_bytes",
self.avail_bytes < self.config.min_avail_bytes,
),
(
"max_usage_pct",
usage_pct >= self.config.max_usage_pct.get() as u64,
),
];
pressures.into_iter().any(|(_, has_pressure)| has_pressure)
}
fn add_available_bytes(&mut self, bytes: u64) {
self.avail_bytes += bytes;
}
}
pub fn get<'a>(
tenants_dir: &Path,
config: &'a DiskUsageEvictionTaskConfig,
) -> anyhow::Result<Usage<'a>> {
let mock_config = {
#[cfg(feature = "testing")]
{
config.mock_statvfs.as_ref()
}
#[cfg(not(feature = "testing"))]
{
None
}
};
let stat = Statvfs::get(tenants_dir, mock_config)
.context("statvfs failed, presumably directory got unlinked")?;
// https://unix.stackexchange.com/a/703650
let blocksize = if stat.fragment_size() > 0 {
stat.fragment_size()
} else {
stat.block_size()
};
// use blocks_available (b_avail) since, pageserver runs as unprivileged user
let avail_bytes = stat.blocks_available() * blocksize;
let total_bytes = stat.blocks() * blocksize;
Ok(Usage {
config,
total_bytes,
avail_bytes,
})
}
#[test]
fn max_usage_pct_pressure() {
use super::Usage as _;
use std::time::Duration;
use utils::serde_percent::Percent;
let mut usage = Usage {
config: &DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(85).unwrap(),
min_avail_bytes: 0,
period: Duration::MAX,
#[cfg(feature = "testing")]
mock_statvfs: None,
},
total_bytes: 100_000,
avail_bytes: 0,
};
assert!(usage.has_pressure(), "expected pressure at 100%");
usage.add_available_bytes(14_000);
assert!(usage.has_pressure(), "expected pressure at 86%");
usage.add_available_bytes(999);
assert!(usage.has_pressure(), "expected pressure at 85.001%");
usage.add_available_bytes(1);
assert!(usage.has_pressure(), "expected pressure at precisely 85%");
usage.add_available_bytes(1);
assert!(!usage.has_pressure(), "no pressure at 84.999%");
usage.add_available_bytes(999);
assert!(!usage.has_pressure(), "no pressure at 84%");
usage.add_available_bytes(16_000);
assert!(!usage.has_pressure());
}
}

View File

@@ -27,31 +27,6 @@ paths:
id:
type: integer
/v1/disk_usage_eviction/run:
put:
description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space.
security: []
requestBody:
content:
application/json:
schema:
type: object
required:
- evict_bytes
properties:
evict_bytes:
type: integer
responses:
"200":
description: |
The run completed.
This does not necessarily mean that we actually evicted `evict_bytes`.
Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`.
content:
application/json:
schema:
type: object
/v1/tenant/{tenant_id}:
parameters:
- name: tenant_id
@@ -208,19 +183,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"412":
description: Tenant is missing
content:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
@@ -421,12 +383,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenant not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
@@ -898,9 +854,13 @@ components:
type: object
properties:
tenant_specific_overrides:
$ref: "#/components/schemas/TenantConfigInfo"
type: object
schema:
$ref: "#/components/schemas/TenantConfigInfo"
effective_config:
$ref: "#/components/schemas/TenantConfigInfo"
type: object
schema:
$ref: "#/components/schemas/TenantConfigInfo"
TimelineInfo:
type: object
required:
@@ -986,13 +946,6 @@ components:
properties:
msg:
type: string
PreconditionFailedError:
type: object
required:
- msg
properties:
msg:
type: string
security:
- JWT: []

View File

@@ -18,7 +18,6 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::disk_usage_eviction_task;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
@@ -49,7 +48,6 @@ struct State {
auth: Option<Arc<JwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
}
impl State {
@@ -57,7 +55,6 @@ impl State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
@@ -68,7 +65,6 @@ impl State {
auth,
allowlist_routes,
remote_storage,
disk_usage_eviction_state,
})
}
}
@@ -135,34 +131,6 @@ impl From<TenantStateError> for ApiError {
}
}
impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
HasChildren => ApiError::BadRequest(anyhow::anyhow!(
"Cannot delete timeline which has child timelines"
)),
Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
use crate::tenant::mgr::DeleteTimelineError::*;
match value {
// Report Precondition failed so client can distinguish between
// "tenant is missing" case from "timeline is missing"
Tenant(TenantStateError::NotFound(..)) => {
ApiError::PreconditionFailed("Requested tenant is missing")
}
Tenant(t) => ApiError::from(t),
Timeline(t) => ApiError::from(t),
}
}
}
// Helper function to construct a TimelineInfo struct for a timeline
async fn build_timeline_info(
timeline: &Arc<Timeline>,
@@ -779,8 +747,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
);
}
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
let target_tenant_id = request_data
.new_tenant_id
.map(TenantId::from)
@@ -912,8 +878,6 @@ async fn update_tenant_config_handler(
);
}
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
let state = get_state(&request);
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
@@ -922,20 +886,6 @@ async fn update_tenant_config_handler(
json_response(StatusCode::OK, ())
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
#[cfg(feature = "testing")]
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.await
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test");
json_response(StatusCode::OK, ())
}
#[cfg(feature = "testing")]
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
@@ -1085,89 +1035,6 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
json_response(StatusCode::NO_CONTENT, ())
}
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
struct Config {
/// How many bytes to evict before reporting that pressure is relieved.
evict_bytes: u64,
}
#[derive(Debug, Clone, Copy, serde::Serialize)]
struct Usage {
// remains unchanged after instantiation of the struct
config: Config,
// updated by `add_available_bytes`
freed_bytes: u64,
}
impl crate::disk_usage_eviction_task::Usage for Usage {
fn has_pressure(&self) -> bool {
self.config.evict_bytes > self.freed_bytes
}
fn add_available_bytes(&mut self, bytes: u64) {
self.freed_bytes += bytes;
}
}
let config = json_request::<Config>(&mut r)
.await
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
let usage = Usage {
config,
freed_bytes: 0,
};
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
let (tx, rx) = tokio::sync::oneshot::channel();
let state = get_state(&r);
let Some(storage) = state.remote_storage.clone() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)))
};
let state = state.disk_usage_eviction_state.clone();
let cancel = CancellationToken::new();
let child_cancel = cancel.clone();
let _g = cancel.drop_guard();
crate::task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,
"ondemand disk usage eviction",
false,
async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
&storage,
usage,
&child_cancel,
)
.await;
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
let _ = tx.send(res);
Ok(())
}
.in_current_span(),
);
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, response)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -1180,7 +1047,6 @@ pub fn make_router(
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<JwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1225,8 +1091,7 @@ pub fn make_router(
Ok(router
.data(Arc::new(
State::new(conf, auth, remote_storage, disk_usage_eviction_state)
.context("Failed to initialize router state")?,
State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
))
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
.put(
@@ -1307,13 +1172,6 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| RequestSpan(evict_timeline_layer_handler).handle(r),
)
.put("/v1/disk_usage_eviction/run", |r| {
RequestSpan(disk_usage_eviction_run).handle(r)
})
.put(
"/v1/tenant/:tenant_id/break",
testing_api!("set tenant state to broken", handle_tenant_break),
)
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
.any(handler_404))
}

View File

@@ -1,11 +1,12 @@
use crate::repository::{key_range_size, singleton_range, Key};
use postgres_ffi::BLCKSZ;
use std::ops::Range;
use tracing::debug;
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug)]
#[derive(Clone, Debug, Default)]
pub struct KeySpace {
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
@@ -61,6 +62,60 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Add range to keyspace.
///
/// Unlike KeySpaceAccum, it accepts key ranges in any order and overlapping ranges.
pub fn add_range(&mut self, range: Range<Key>) {
let start = range.start;
let mut end = range.end;
let mut prev_index = match self.ranges.binary_search_by_key(&end, |r| r.start) {
Ok(index) => index,
Err(0) => {
self.ranges.insert(0, range);
return;
}
Err(index) => index - 1,
};
loop {
let mut prev = &mut self.ranges[prev_index];
if prev.end >= start {
// two ranges overlap
if prev.start <= start {
// combine with prev range
if prev.end < end {
prev.end = end;
debug!("Extend wanted image {}..{}", prev.start, end);
}
return;
} else {
if prev.end > end {
end = prev.end;
}
self.ranges.remove(prev_index);
}
} else {
break;
}
if prev_index == 0 {
break;
}
prev_index -= 1;
}
debug!("Wanted image {}..{}", start, end);
self.ranges.insert(prev_index, start..end);
}
///
/// Check if key space contains overlapping range
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(_) => false,
Err(0) => false,
Err(index) => self.ranges[index - 1].end > range.start,
}
}
}
///

View File

@@ -4,7 +4,6 @@ pub mod broker_client;
pub mod config;
pub mod consumption_metrics;
pub mod context;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
@@ -13,7 +12,6 @@ pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod repository;
pub(crate) mod statvfs;
pub mod task_mgr;
pub mod tenant;
pub mod trace;

View File

@@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration {
}
pub fn observe(&self, observed_value: Duration) {
if observed_value < self.threshold {
if self.threshold < observed_value {
self.counter
.as_ref()
.expect("nobody calls this function after `remove_from_vec`")
@@ -586,6 +586,7 @@ pub struct TimelineMetrics {
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
pub init_logical_size_histo: StorageTimeMetrics,
pub logical_size_histo: StorageTimeMetrics,
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
@@ -618,6 +619,8 @@ impl TimelineMetrics {
let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id);
let create_images_time_histo =
StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
let init_logical_size_histo =
StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id);
let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
let load_layer_map_histo =
StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
@@ -654,6 +657,7 @@ impl TimelineMetrics {
flush_time_histo,
compact_time_histo,
create_images_time_histo,
init_logical_size_histo,
logical_size_histo,
garbage_collect_histo,
load_layer_map_histo,

View File

@@ -27,7 +27,6 @@ use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::io;
use std::net::TcpListener;
use std::pin::pin;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
@@ -467,7 +466,8 @@ impl PageServerHandler {
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
let copyin_reader = StreamReader::new(copyin_stream(pgb));
tokio::pin!(copyin_reader);
timeline
.import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
.await?;
@@ -512,7 +512,8 @@ impl PageServerHandler {
info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
let copyin_reader = StreamReader::new(copyin_stream(pgb));
tokio::pin!(copyin_reader);
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");

View File

@@ -1,150 +0,0 @@
//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.
use std::path::Path;
pub enum Statvfs {
Real(nix::sys::statvfs::Statvfs),
Mock(mock::Statvfs),
}
// NB: on macOS, the block count type of struct statvfs is u32.
// The workaround seems to be to use the non-standard statfs64 call.
// Sincce it should only be a problem on > 2TiB disks, let's ignore
// the problem for now and upcast to u64.
impl Statvfs {
pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
if let Some(mocked) = mocked {
Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
} else {
Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
}
}
// NB: allow() because the block count type is u32 on macOS.
#[allow(clippy::useless_conversion)]
pub fn blocks(&self) -> u64 {
match self {
Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
Statvfs::Mock(stat) => stat.blocks,
}
}
// NB: allow() because the block count type is u32 on macOS.
#[allow(clippy::useless_conversion)]
pub fn blocks_available(&self) -> u64 {
match self {
Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
Statvfs::Mock(stat) => stat.blocks_available,
}
}
pub fn fragment_size(&self) -> u64 {
match self {
Statvfs::Real(stat) => stat.fragment_size(),
Statvfs::Mock(stat) => stat.fragment_size,
}
}
pub fn block_size(&self) -> u64 {
match self {
Statvfs::Real(stat) => stat.block_size(),
Statvfs::Mock(stat) => stat.block_size,
}
}
}
pub mod mock {
use anyhow::Context;
use regex::Regex;
use std::path::Path;
use tracing::log::info;
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type")]
pub enum Behavior {
Success {
blocksize: u64,
total_blocks: u64,
name_filter: Option<utils::serde_regex::Regex>,
},
Failure {
mocked_error: MockedError,
},
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[allow(clippy::upper_case_acronyms)]
pub enum MockedError {
EIO,
}
impl From<MockedError> for nix::Error {
fn from(e: MockedError) -> Self {
match e {
MockedError::EIO => nix::Error::EIO,
}
}
}
pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
info!("running mocked statvfs");
match behavior {
Behavior::Success {
blocksize,
total_blocks,
ref name_filter,
} => {
let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
// round it up to the nearest block multiple
let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
if used_blocks > *total_blocks {
panic!(
"mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}"
);
}
let avail_blocks = total_blocks - used_blocks;
Ok(Statvfs {
blocks: *total_blocks,
blocks_available: avail_blocks,
fragment_size: *blocksize,
block_size: *blocksize,
})
}
Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
}
}
fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
let mut total = 0;
for entry in walkdir::WalkDir::new(path) {
let entry = entry?;
if !entry.file_type().is_file() {
continue;
}
if !name_filter
.as_ref()
.map(|filter| filter.is_match(entry.file_name().to_str().unwrap()))
.unwrap_or(true)
{
continue;
}
total += entry
.metadata()
.with_context(|| format!("get metadata of {:?}", entry.path()))?
.len();
}
Ok(total)
}
pub struct Statvfs {
pub blocks: u64,
pub blocks_available: u64,
pub fragment_size: u64,
pub block_size: u64,
}
}

View File

@@ -234,9 +234,6 @@ pub enum TaskKind {
// Eviction. One per timeline.
Eviction,
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
// Initial logical size calculation
InitialLogicalSizeCalculation,
@@ -484,25 +481,13 @@ pub async fn shutdown_tasks(
for task in victim_tasks {
let join_handle = {
let mut task_mut = task.mutable.lock().unwrap();
task_mut.join_handle.take()
info!("waiting for {} to shut down", task.name);
let join_handle = task_mut.join_handle.take();
drop(task_mut);
join_handle
};
if let Some(mut join_handle) = join_handle {
let completed = tokio::select! {
_ = &mut join_handle => { true },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for {} to shut down", task.name);
false
}
};
if !completed {
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)
// - task errors are already logged in the wrapper
let _ = join_handle.await;
}
if let Some(join_handle) = join_handle {
let _ = join_handle.await;
} else {
// Possibly one of:
// * The task had not even fully started yet.

View File

@@ -46,7 +46,6 @@ use std::time::{Duration, Instant};
use self::config::TenantConf;
use self::metadata::TimelineMetadata;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::EvictionTaskTenantState;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
@@ -95,7 +94,7 @@ mod timeline;
pub mod size;
pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};
pub use timeline::{PageReconstructError, Timeline};
// re-export this function so that page_cache.rs can use it.
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -143,8 +142,6 @@ pub struct Tenant {
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
cached_synthetic_tenant_size: Arc<AtomicU64>,
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
}
/// A timeline with some of its files on disk, being initialized.
@@ -434,16 +431,6 @@ remote:
}
}
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("NotFound")]
NotFound,
#[error("HasChildren")]
HasChildren,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
struct RemoteStartupData {
index_part: IndexPart,
remote_metadata: TimelineMetadata,
@@ -1320,7 +1307,7 @@ impl Tenant {
&self,
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
) -> anyhow::Result<()> {
// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
let timeline = {
@@ -1332,13 +1319,13 @@ impl Tenant {
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
if children_exist {
return Err(DeleteTimelineError::HasChildren);
}
anyhow::ensure!(
!children_exist,
"Cannot delete timeline which has child timelines"
);
let timeline_entry = match timelines.entry(timeline_id) {
Entry::Occupied(e) => e,
Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
Entry::Vacant(_) => bail!("timeline not found"),
};
let timeline = Arc::clone(timeline_entry.get());
@@ -1706,13 +1693,6 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn get_min_resident_size_override(&self) -> Option<u64> {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.min_resident_size_override
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
*self.tenant_conf.write().unwrap() = new_tenant_conf;
}
@@ -1791,7 +1771,6 @@ impl Tenant {
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
}
}
@@ -2790,7 +2769,6 @@ pub mod harness {
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
trace_read_requests: Some(tenant_conf.trace_read_requests),
eviction_policy: Some(tenant_conf.eviction_policy),
min_resident_size_override: tenant_conf.min_resident_size_override,
}
}
}

View File

@@ -92,7 +92,6 @@ pub struct TenantConf {
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
pub eviction_policy: EvictionPolicy,
pub min_resident_size_override: Option<u64>,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -160,10 +159,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub eviction_policy: Option<EvictionPolicy>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub min_resident_size_override: Option<u64>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -225,9 +220,6 @@ impl TenantConfOpt {
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
min_resident_size_override: self
.min_resident_size_override
.or(global_conf.min_resident_size_override),
}
}
}
@@ -259,7 +251,6 @@ impl Default for TenantConf {
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
eviction_policy: EvictionPolicy::NoEviction,
min_resident_size_override: None,
}
}
}

View File

@@ -321,20 +321,11 @@ pub async fn get_tenant(
}
}
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("Tenant {0}")]
Tenant(#[from] TenantStateError),
#[error("Timeline {0}")]
Timeline(#[from] crate::tenant::DeleteTimelineError),
}
pub async fn delete_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
) -> Result<(), TenantStateError> {
let tenant = get_tenant(tenant_id, true).await?;
tenant.delete_timeline(timeline_id, ctx).await?;
Ok(())

View File

@@ -6,7 +6,6 @@ use std::sync::Arc;
use anyhow::{bail, Context};
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -353,10 +352,6 @@ async fn fill_logical_sizes(
// our advantage with `?` error handling.
let mut joinset = tokio::task::JoinSet::new();
let cancel = tokio_util::sync::CancellationToken::new();
// be sure to cancel all spawned tasks if we are dropped
let _dg = cancel.clone().drop_guard();
// For each point that would benefit from having a logical size available,
// spawn a Task to fetch it, unless we have it cached already.
for seg in segments.iter() {
@@ -378,7 +373,6 @@ async fn fill_logical_sizes(
timeline,
lsn,
ctx,
cancel.child_token(),
));
}
e.insert(cached_size);
@@ -483,14 +477,13 @@ async fn calculate_logical_size(
timeline: Arc<crate::tenant::Timeline>,
lsn: utils::lsn::Lsn,
ctx: RequestContext,
cancel: CancellationToken,
) -> Result<TimelineAtLsnSizeResult, RecvError> {
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
.await
.expect("global semaphore should not had been closed");
let size_res = timeline
.spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
.spawn_ondemand_logical_size_calculation(lsn, ctx)
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
.await?;
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))

View File

@@ -121,10 +121,10 @@ struct LayerAccessStatsInner {
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct LayerAccessStatFullDetails {
pub(crate) when: SystemTime,
pub(crate) task_kind: TaskKind,
pub(crate) access_kind: LayerAccessKind,
pub(super) struct LayerAccessStatFullDetails {
pub(super) when: SystemTime,
pub(super) task_kind: TaskKind,
pub(super) access_kind: LayerAccessKind,
}
#[derive(Clone, Copy, strum_macros::EnumString)]
@@ -255,7 +255,7 @@ impl LayerAccessStats {
ret
}
fn most_recent_access_or_residence_event(
pub(super) fn most_recent_access_or_residence_event(
&self,
) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
let locked = self.0.lock().unwrap();
@@ -268,13 +268,6 @@ impl LayerAccessStats {
}
}
}
pub(crate) fn latest_activity(&self) -> SystemTime {
match self.most_recent_access_or_residence_event() {
Either::Left(mra) => mra.when,
Either::Right(re) => re.timestamp,
}
}
}
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface

View File

@@ -244,12 +244,14 @@ pub(crate) async fn random_init_delay(
) -> Result<(), Cancelled> {
use rand::Rng;
if period == Duration::ZERO {
return Ok(());
}
let d = {
let mut rng = rand::thread_rng();
// gen_range asserts that the range cannot be empty, which it could be because period can
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
let period = std::cmp::max(period, Duration::from_secs(10));
// semi-ok default as the source of jitter
rng.gen_range(Duration::ZERO..=period)
};

View File

@@ -2,6 +2,7 @@
mod eviction_task;
mod walreceiver;
mod layer_trace;
use anyhow::{anyhow, bail, ensure, Context};
use bytes::Bytes;
@@ -13,19 +14,16 @@ use pageserver_api::models::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState,
};
use remote_storage::GenericRemoteStorage;
use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::collections::{BinaryHeap, HashMap};
use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
use std::pin::pin;
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
@@ -73,9 +71,6 @@ use crate::ZERO_PAGE;
use crate::{is_temporary, task_mgr};
use walreceiver::spawn_connection_manager_task;
pub(super) use self::eviction_task::EvictionTaskTenantState;
use self::eviction_task::EvictionTaskTimelineState;
use super::layer_map::BatchedUpdates;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
@@ -120,6 +115,17 @@ pub struct Timeline {
pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
/// It is used by compaction task when it checks if new image layer should be created.
/// Newly created image layer doesn't help to remove the delta layer, until the
/// newly created image layer falls off the PITR horizon. So on next GC cycle,
/// gc_timeline may still want the new image layer to be created. To avoid redundant
/// image layers creation we should check if image layer exists but beyond PITR horizon.
/// This is why we need remember GC cutoff LSN.
///
wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
last_freeze_ts: RwLock<Instant>,
@@ -222,7 +228,7 @@ pub struct Timeline {
state: watch::Sender<TimelineState>,
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
layer_trace_file: Mutex<Option<std::fs::File>>,
}
/// Internal structure to hold all data needed for logical size calculation.
@@ -319,7 +325,7 @@ impl LogicalSize {
// we change the type.
match self.initial_logical_size.get() {
Some(initial_size) => {
initial_size.checked_add_signed(size_increment)
initial_size.checked_add(size_increment.try_into().unwrap())
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
.map(CurrentLogicalSize::Exact)
}
@@ -335,13 +341,9 @@ impl LogicalSize {
.fetch_add(delta, AtomicOrdering::SeqCst);
}
/// Make the value computed by initial logical size computation
/// available for re-use. This doesn't contain the incremental part.
fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
match self.initial_part_end {
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
_ => None,
}
/// Returns the initialized (already calculated) value, if any.
fn initialized_size(&self) -> Option<u64> {
self.initial_logical_size.get().copied()
}
}
@@ -679,7 +681,8 @@ impl Timeline {
let mut failed = 0;
let mut cancelled = pin!(task_mgr::shutdown_watcher());
let cancelled = task_mgr::shutdown_watcher();
tokio::pin!(cancelled);
loop {
tokio::select! {
@@ -816,11 +819,11 @@ impl Timeline {
let mut is_exact = true;
let size = current_size.size();
if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
(current_size, self.current_logical_size.initial_part_end)
{
is_exact = false;
self.try_spawn_size_init_task(initial_part_end, ctx);
self.try_spawn_size_init_task(init_lsn, ctx);
}
Ok((size, is_exact))
@@ -867,6 +870,7 @@ impl Timeline {
}
pub fn activate(self: &Arc<Self>) {
self.start_layer_tracing();
self.set_state(TimelineState::Active);
self.launch_wal_receiver();
self.launch_eviction_task();
@@ -958,25 +962,6 @@ impl Timeline {
}
}
/// Evict a batch of layers.
///
/// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
///
/// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
pub async fn evict_layers(
&self,
_: &GenericRemoteStorage,
layers_to_evict: &[Arc<dyn PersistentLayer>],
cancel: CancellationToken,
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
let remote_client = self.remote_client.clone().expect(
"GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
);
self.evict_layer_batch(&remote_client, layers_to_evict, cancel)
.await
}
/// Evict multiple layers at once, continuing through errors.
///
/// Try to evict the given `layers_to_evict` by
@@ -1014,15 +999,6 @@ impl Timeline {
// now lock out layer removal (compaction, gc, timeline deletion)
let layer_removal_guard = self.layer_removal_cs.lock().await;
{
// to avoid racing with detach and delete_timeline
let state = self.current_state();
anyhow::ensure!(
state == TimelineState::Active,
"timeline is not active but {state:?}"
);
}
// start the batch update
let mut layer_map = self.layers.write().unwrap();
let mut batch_updates = layer_map.batch_update();
@@ -1056,8 +1032,6 @@ impl Timeline {
use super::layer_map::Replacement;
if local_layer.is_remote_layer() {
// TODO(issue #3851): consider returning an err here instead of false,
// which is the same out the match later
return Ok(false);
}
@@ -1120,6 +1094,7 @@ impl Timeline {
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
self.trace_layer_evict(&local_layer.filename());
self.metrics.evictions.inc();
@@ -1127,9 +1102,6 @@ impl Timeline {
self.metrics
.evictions_with_low_residence_duration
.observe(delta);
info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
} else {
info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
}
true
@@ -1229,6 +1201,7 @@ impl Timeline {
tenant_id,
pg_version,
layers: RwLock::new(LayerMap::default()),
wanted_image_layers: Mutex::new(None),
walredo_mgr,
@@ -1292,9 +1265,7 @@ impl Timeline {
state,
eviction_task_timeline_state: tokio::sync::Mutex::new(
EvictionTaskTimelineState::default(),
),
layer_trace_file: Mutex::new(None),
};
result.repartition_threshold = result.get_checkpoint_distance() / 10;
result
@@ -1735,7 +1706,7 @@ impl Timeline {
Ok(())
}
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn, ctx: &RequestContext) {
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
.try_acquire_owned()
{
@@ -1772,11 +1743,8 @@ impl Timeline {
false,
// NB: don't log errors here, task_mgr will do that.
async move {
// no cancellation here, because nothing really waits for this to complete compared
// to spawn_ondemand_logical_size_calculation.
let cancel = CancellationToken::new();
let calculated_size = match self_clone
.logical_size_calculation_task(lsn, &background_ctx, cancel)
.logical_size_calculation_task(init_lsn, &background_ctx)
.await
{
Ok(s) => s,
@@ -1797,7 +1765,7 @@ impl Timeline {
.size_added_after_initial
.load(AtomicOrdering::Relaxed);
let sum = calculated_size.saturating_add_signed(added);
let sum = calculated_size.saturating_add(added.try_into().unwrap());
// set the gauge value before it can be set in `update_current_logical_size`.
self_clone.metrics.current_logical_size_gauge.set(sum);
@@ -1831,7 +1799,6 @@ impl Timeline {
self: &Arc<Self>,
lsn: Lsn,
ctx: RequestContext,
cancel: CancellationToken,
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
let (sender, receiver) = oneshot::channel();
let self_clone = Arc::clone(self);
@@ -1851,9 +1818,7 @@ impl Timeline {
"ondemand logical size calculation",
false,
async move {
let res = self_clone
.logical_size_calculation_task(lsn, &ctx, cancel)
.await;
let res = self_clone.logical_size_calculation_task(lsn, &ctx).await;
let _ = sender.send(res).ok();
Ok(()) // Receiver is responsible for handling errors
},
@@ -1864,20 +1829,20 @@ impl Timeline {
#[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
async fn logical_size_calculation_task(
self: &Arc<Self>,
lsn: Lsn,
init_lsn: Lsn,
ctx: &RequestContext,
cancel: CancellationToken,
) -> Result<u64, CalculateLogicalSizeError> {
let mut timeline_state_updates = self.subscribe_for_state_updates();
let self_calculation = Arc::clone(self);
let cancel = CancellationToken::new();
let mut calculation = pin!(async {
let calculation = async {
let cancel = cancel.child_token();
let ctx = ctx.attached_child();
self_calculation
.calculate_logical_size(lsn, cancel, &ctx)
.calculate_logical_size(init_lsn, cancel, &ctx)
.await
});
};
let timeline_state_cancellation = async {
loop {
match timeline_state_updates.changed().await {
@@ -1906,6 +1871,7 @@ impl Timeline {
"aborted because task_mgr shutdown requested".to_string()
};
tokio::pin!(calculation);
loop {
tokio::select! {
res = &mut calculation => { return res }
@@ -1958,12 +1924,21 @@ impl Timeline {
// need to return something
Ok(0)
});
// See if we've already done the work for initial size calculation.
// This is a short-cut for timelines that are mostly unused.
if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
return Ok(size);
}
let timer = self.metrics.logical_size_histo.start_timer();
let timer = if up_to_lsn == self.initdb_lsn {
if let Some(size) = self.current_logical_size.initialized_size() {
if size != 0 {
// non-zero size means that the size has already been calculated by this method
// after startup. if the logical size is for a new timeline without layers the
// size will be zero, and we cannot use that, or this caching strategy until
// pageserver restart.
return Ok(size);
}
}
self.metrics.init_logical_size_histo.start_timer()
} else {
self.metrics.logical_size_histo.start_timer()
};
let logical_size = self
.get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
.await?;
@@ -2671,6 +2646,8 @@ impl Timeline {
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
])?;
self.trace_layer_flush(&new_delta.filename());
// Add it to the layer map
self.layers
.write()
@@ -2726,6 +2703,30 @@ impl Timeline {
let layers = self.layers.read().unwrap();
let mut max_deltas = 0;
let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
let img_range =
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
if wanted.overlaps(&img_range) {
//
// gc_timeline only pays attention to image layers that are older than the GC cutoff,
// but create_image_layers creates image layers at last-record-lsn.
// So it's possible that gc_timeline decides that it wants new image layer to be created for a key range,
// and on next compcation create_image_layers creates the image layer.
// But on next GC cycle, gc_timeline still wantes the new image layer to be created,
// because the newly created image layer doesn't help to remove the delta layer,
// until the newly created image layer falls off the PITR horizon.
//
// So we should check if image layer beyond cutoff LSN already exists.
if !layers.image_layer_exists(&img_range, &(*cutoff_lsn..lsn))? {
debug!(
"Force generation of layer {}-{} wanted by GC)",
img_range.start, img_range.end
);
return Ok(true);
}
}
}
for part_range in &partition.ranges {
let image_coverage = layers.image_coverage(part_range, lsn)?;
@@ -2845,6 +2846,11 @@ impl Timeline {
image_layers.push(image_layer);
}
}
// All wanted layers are taken in account by time_for_new_image_layer.
// The wanted_image_layers could get updated out of turn and we could
// clear something which hasn't been looked at all. This is fine, because
// next gc round any wanted would get added back in.
*self.wanted_image_layers.lock().unwrap() = None;
// Sync the new layer to disk before adding it to the layer map, to make sure
// we don't garbage collect something based on the new layer, before it has
@@ -2881,6 +2887,7 @@ impl Timeline {
self.metrics
.resident_physical_size_gauge
.add(metadata.len());
self.trace_layer_image_create(&l.filename());
updates.insert_historic(Arc::new(l));
}
updates.flush();
@@ -3311,6 +3318,7 @@ impl Timeline {
self.metrics
.resident_physical_size_gauge
.add(metadata.len());
self.trace_layer_compact_create(&l.filename());
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
@@ -3321,6 +3329,7 @@ impl Timeline {
// delete the old ones
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
for l in deltas_to_compact {
self.trace_layer_compact_delete(&l.filename());
layer_names_to_delete.push(l.filename());
self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
}
@@ -3517,6 +3526,8 @@ impl Timeline {
info!("GC starting");
self.trace_gc_start(new_gc_cutoff);
debug!("retain_lsns: {:?}", retain_lsns);
// Before deleting any layers, we need to wait for their upload ops to finish.
@@ -3531,6 +3542,7 @@ impl Timeline {
}
let mut layers_to_remove = Vec::new();
let mut wanted_image_layers = KeySpace::default();
// Scan all layers in the timeline (remote or on-disk).
//
@@ -3614,6 +3626,15 @@ impl Timeline {
"keeping {} because it is the latest layer",
l.filename().file_name()
);
// Collect delta key ranges that need image layers to allow garbage
// collecting the layers.
// It is not so obvious whether we need to propagate information only about
// delta layers. Image layers can form "stairs" preventing old image from been deleted.
// But image layers are in any case less sparse than delta layers. Also we need some
// protection from replacing recent image layers with new one after each GC iteration.
if l.is_incremental() && !LayerMap::is_l0(&*l) {
wanted_image_layers.add_range(l.get_key_range());
}
result.layers_not_updated += 1;
continue 'outer;
}
@@ -3626,6 +3647,10 @@ impl Timeline {
);
layers_to_remove.push(Arc::clone(&l));
}
self.wanted_image_layers
.lock()
.unwrap()
.replace((new_gc_cutoff, wanted_image_layers));
let mut updates = layers.batch_update();
if !layers_to_remove.is_empty() {
@@ -3640,6 +3665,7 @@ impl Timeline {
{
for doomed_layer in layers_to_remove {
layer_names_to_delete.push(doomed_layer.filename());
self.trace_layer_gc_delete(&doomed_layer.filename());
self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
result.layers_removed += 1;
}
@@ -4046,67 +4072,6 @@ impl Timeline {
}
}
pub struct DiskUsageEvictionInfo {
/// Timeline's largest layer (remote or resident)
pub max_layer_size: Option<u64>,
/// Timeline's resident layers
pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
}
pub struct LocalLayerInfoForDiskUsageEviction {
pub layer: Arc<dyn PersistentLayer>,
pub last_activity_ts: SystemTime,
}
impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
// having to allocate a string to this is bad, but it will rarely be formatted
let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
f.debug_struct("LocalLayerInfoForDiskUsageEviction")
.field("layer", &self.layer)
.field("last_activity", &ts)
.finish()
}
}
impl LocalLayerInfoForDiskUsageEviction {
pub fn file_size(&self) -> u64 {
self.layer.file_size()
}
}
impl Timeline {
pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let layers = self.layers.read().unwrap();
let mut max_layer_size: Option<u64> = None;
let mut resident_layers = Vec::new();
for l in layers.iter_historic_layers() {
let file_size = l.file_size();
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
if l.is_remote_layer() {
continue;
}
let last_activity_ts = l.access_stats().latest_activity();
resident_layers.push(LocalLayerInfoForDiskUsageEviction {
layer: l,
last_activity_ts,
});
}
DiskUsageEvictionInfo {
max_layer_size,
resident_layers,
}
}
}
type TraversalPathItem = (
ValueReconstructResult,
Lsn,

View File

@@ -14,12 +14,12 @@
//!
//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
use std::{
collections::HashMap,
ops::ControlFlow,
sync::Arc,
time::{Duration, SystemTime},
};
use either::Either;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, instrument, warn};
@@ -30,22 +30,11 @@ use crate::{
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
storage_layer::PersistentLayer,
Tenant,
},
};
use super::Timeline;
#[derive(Default)]
pub struct EvictionTaskTimelineState {
last_layer_access_imitation: Option<tokio::time::Instant>,
}
#[derive(Default)]
pub struct EvictionTaskTenantState {
last_layer_access_imitation: Option<Instant>,
}
impl Timeline {
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
let self_clone = Arc::clone(self);
@@ -133,35 +122,6 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
// If we evict layers but keep cached values derived from those layers, then
// we face a storm of on-demand downloads after pageserver restart.
// The reason is that the restart empties the caches, and so, the values
// need to be re-computed by accessing layers, which we evicted while the
// caches were filled.
//
// Solutions here would be one of the following:
// 1. Have a persistent cache.
// 2. Count every access to a cached value to the access stats of all layers
// that were accessed to compute the value in the first place.
// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
// get re-computed from layers, thereby counting towards layer access stats.
// 4. Make the eviction task imitate the layer accesses that typically hit caches.
//
// We follow approach (4) here because in Neon prod deployment:
// - page cache is quite small => high churn => low hit rate
// => eviction gets correct access stats
// - value-level caches such as logical size & repatition have a high hit rate,
// especially for inactive tenants
// => eviction sees zero accesses for these
// => they cause the on-demand download storm on pageserver restart
//
// We should probably move to persistent caches in the future, or avoid
// having inactive tenants attached to pageserver in the first place.
match self.imitate_layer_accesses(p, cancel, ctx).await {
ControlFlow::Break(()) => return ControlFlow::Break(()),
ControlFlow::Continue(()) => (),
}
#[allow(dead_code)]
#[derive(Debug, Default)]
struct EvictionStats {
@@ -172,6 +132,19 @@ impl Timeline {
skipped_for_shutdown: usize,
}
// what we want is to invalidate any caches which haven't been accessed for `p.threshold`,
// but we cannot actually do it for current limitations except by restarting pageserver. we
// just recompute the values which would be recomputed on startup.
//
// for active tenants this will likely materialized page cache or in-memory layers. for
// inactive tenants it will refresh the last_access timestamps so that we will not evict
// and re-download on restart these layers.
self.refresh_layers_required_in_restart(cancel, ctx).await;
if cancel.is_cancelled() {
return ControlFlow::Break(());
}
let mut stats = EvictionStats::default();
// Gather layers for eviction.
// NB: all the checks can be invalidated as soon as we release the layer map lock.
@@ -184,7 +157,13 @@ impl Timeline {
if hist_layer.is_remote_layer() {
continue;
}
let last_activity_ts = hist_layer.access_stats().latest_activity();
let last_activity_ts = match hist_layer
.access_stats()
.most_recent_access_or_residence_event()
{
Either::Left(mra) => mra.when,
Either::Right(re) => re.timestamp,
};
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,
Err(_e) => {
@@ -269,55 +248,8 @@ impl Timeline {
ControlFlow::Continue(())
}
async fn imitate_layer_accesses(
&self,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<()> {
let mut state = self.eviction_task_timeline_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
_ => {
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
.await;
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
}
}
drop(state);
if cancel.is_cancelled() {
return ControlFlow::Break(());
}
// This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
// likely, we're shutting down
return ControlFlow::Break(());
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
_ => {
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
.await;
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
}
}
drop(state);
if cancel.is_cancelled() {
return ControlFlow::Break(());
}
ControlFlow::Continue(())
}
/// Recompute the values which would cause on-demand downloads during restart.
async fn imitate_timeline_cached_layer_accesses(
async fn refresh_layers_required_in_restart(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -351,61 +283,4 @@ impl Timeline {
}
}
}
// Imitate the synthetic size calculation done by the consumption_metrics module.
async fn imitate_synthetic_size_calculation_worker(
&self,
tenant: &Arc<Tenant>,
ctx: &RequestContext,
cancel: &CancellationToken,
) {
if self.conf.metric_collection_endpoint.is_none() {
// We don't start the consumption metrics task if this is not set in the config.
// So, no need to imitate the accesses in that case.
return;
}
// The consumption metrics are collected on a per-tenant basis, by a single
// global background loop.
// It limits the number of synthetic size calculations using the global
// `concurrent_tenant_size_logical_size_queries` semaphore to not overload
// the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs).
//
// If we used that same semaphore here, then we'd compete for the
// same permits, which may impact timeliness of consumption metrics.
// That is a no-go, as consumption metrics are much more important
// than what we do here.
//
// So, we have a separate semaphore, initialized to the same
// number of permits as the `concurrent_tenant_size_logical_size_queries`.
// In the worst, we would have twice the amount of concurrenct size calculations.
// But in practice, the `p.threshold` >> `consumption metric interval`, and
// we spread out the eviction task using `random_init_delay`.
// So, the chance of the worst case is quite low in practice.
// It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
// So, we must coordinate with other with other eviction tasks of this tenant.
let limit = self
.conf
.eviction_task_immitated_concurrent_logical_size_queries
.inner();
let mut throwaway_cache = HashMap::new();
let gather =
crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx);
tokio::select! {
_ = cancel.cancelled() => {}
gather_result = gather => {
match gather_result {
Ok(_) => {},
Err(e) => {
// We don't care about the result, but, if it failed, we should log it,
// since consumption metric might be hitting the cached value and
// thus not encountering this error.
warn!("failed to imitate synthetic size calculation accesses: {e:#}")
}
}
}
}
}
}

View File

@@ -0,0 +1,81 @@
use crate::tenant::timeline::LayerFileName;
use crate::tenant::Timeline;
use std::io::Write;
use std::time::UNIX_EPOCH;
use tracing::*;
use std::fs::File;
use utils::lsn::Lsn;
impl Timeline {
pub(super) fn start_layer_tracing(&self) {
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
let path = timeline_path.join("layer_trace");
match File::options()
.create(true)
.append(true)
.open(&path)
{
Ok(file) => {
info!("enabled layer tracing");
self.layer_trace_file.lock().unwrap().replace(file);
},
Err(e) => {
warn!("could not open layer tracing file \"{}\": {}", path.display(), e);
}
}
}
fn trace_op(&self, op: &str, filename: &str) {
let opt_out = &self.layer_trace_file.lock().unwrap();
if let Some(mut out) = opt_out.as_ref() {
if let Ok(elapsed) = UNIX_EPOCH.elapsed() {
let time = elapsed.as_millis();
let _ = writeln!(out, "{{ \"time\": {time}, \"op\": \"{op}\", \"filename\": \"{filename}\"}}");
}
else {
warn!("could not get current timestamp");
}
}
}
pub(super) fn trace_layer_evict(&self, filename: &LayerFileName) {
self.trace_op("evict", &filename.file_name())
}
pub(super) fn trace_layer_flush(&self, filename: &LayerFileName) {
self.trace_op("flush", &filename.file_name())
}
pub(super) fn trace_layer_compact_create(&self, filename: &LayerFileName) {
self.trace_op("compact_create", &filename.file_name())
}
pub(super) fn trace_layer_compact_delete(&self, filename: &LayerFileName) {
self.trace_op("compact_delete", &filename.file_name())
}
pub(super) fn trace_layer_image_create(&self, filename: &LayerFileName) {
self.trace_op("image_create", &filename.file_name())
}
pub(super) fn trace_layer_gc_delete(&self, filename: &LayerFileName) {
self.trace_op("gc_delete", &filename.file_name())
}
// TODO: also report 'retain_lsns'
pub(super) fn trace_gc_start(&self, cutoff_lsn: Lsn) {
let opt_out = &self.layer_trace_file.lock().unwrap();
if let Some(mut out) = opt_out.as_ref() {
if let Ok(elapsed) = UNIX_EPOCH.elapsed() {
let time = elapsed.as_millis();
let _ = writeln!(out, "{{ \"time\": {time}, \"op\": \"gc_start\", \"cutoff\": \"{cutoff_lsn}\"}}");
}
else {
warn!("could not get current timestamp");
}
}
}
}

View File

@@ -237,7 +237,11 @@ async fn connection_manager_loop_step(
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
info!("Switching to new connection candidate: {new_candidate:?}");
walreceiver_state
.change_connection(new_candidate, ctx)
.change_connection(
new_candidate.safekeeper_id,
new_candidate.wal_source_connconf,
ctx,
)
.await
}
}
@@ -342,8 +346,6 @@ struct WalConnection {
started_at: NaiveDateTime,
/// Current safekeeper pageserver is connected to for WAL streaming.
sk_id: NodeId,
/// Availability zone of the safekeeper.
availability_zone: Option<String>,
/// Status of the connection.
status: WalConnectionStatus,
/// WAL streaming task handle.
@@ -403,7 +405,12 @@ impl WalreceiverState {
}
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
async fn change_connection(
&mut self,
new_sk_id: NodeId,
new_wal_source_connconf: PgConnectionConfig,
ctx: &RequestContext,
) {
self.drop_old_connection(true).await;
let id = self.id;
@@ -417,7 +424,7 @@ impl WalreceiverState {
async move {
super::walreceiver_connection::handle_walreceiver_connection(
timeline,
new_sk.wal_source_connconf,
new_wal_source_connconf,
events_sender,
cancellation,
connect_timeout,
@@ -426,16 +433,13 @@ impl WalreceiverState {
.await
.context("walreceiver connection handling failure")
}
.instrument(
info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
)
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
});
let now = Utc::now().naive_utc();
self.wal_connection = Some(WalConnection {
started_at: now,
sk_id: new_sk.safekeeper_id,
availability_zone: new_sk.availability_zone,
sk_id: new_sk_id,
status: WalConnectionStatus {
is_connected: false,
has_processed_wal: false,
@@ -542,7 +546,6 @@ impl WalreceiverState {
/// * if connected safekeeper is not present, pick the candidate
/// * if we haven't received any updates for some time, pick the candidate
/// * if the candidate commit_lsn is much higher than the current one, pick the candidate
/// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate
/// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
///
/// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
@@ -556,7 +559,6 @@ impl WalreceiverState {
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
self.select_connection_candidate(Some(connected_sk_node))?;
let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone();
let now = Utc::now().naive_utc();
if let Ok(latest_interaciton) =
@@ -567,7 +569,6 @@ impl WalreceiverState {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connconf: new_wal_source_connconf,
availability_zone: new_availability_zone,
reason: ReconnectReason::NoKeepAlives {
last_keep_alive: Some(
existing_wal_connection.status.latest_connection_update,
@@ -593,7 +594,6 @@ impl WalreceiverState {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connconf: new_wal_source_connconf,
availability_zone: new_availability_zone,
reason: ReconnectReason::LaggingWal {
current_commit_lsn,
new_commit_lsn,
@@ -601,20 +601,6 @@ impl WalreceiverState {
},
});
}
// If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver,
// and the current one is not, switch to the new one.
if self.availability_zone.is_some()
&& existing_wal_connection.availability_zone
!= self.availability_zone
&& self.availability_zone == new_availability_zone
{
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
availability_zone: new_availability_zone,
wal_source_connconf: new_wal_source_connconf,
reason: ReconnectReason::SwitchAvailabilityZone,
});
}
}
None => debug!(
"Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
@@ -682,7 +668,6 @@ impl WalreceiverState {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connconf: new_wal_source_connconf,
availability_zone: new_availability_zone,
reason: ReconnectReason::NoWalTimeout {
current_lsn,
current_commit_lsn,
@@ -701,11 +686,10 @@ impl WalreceiverState {
self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
}
None => {
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
let (new_sk_id, _, new_wal_source_connconf) =
self.select_connection_candidate(None)?;
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
availability_zone: new_safekeeper_broker_data.availability_zone.clone(),
wal_source_connconf: new_wal_source_connconf,
reason: ReconnectReason::NoExistingConnection,
});
@@ -810,7 +794,6 @@ impl WalreceiverState {
struct NewWalConnectionCandidate {
safekeeper_id: NodeId,
wal_source_connconf: PgConnectionConfig,
availability_zone: Option<String>,
// This field is used in `derive(Debug)` only.
#[allow(dead_code)]
reason: ReconnectReason,
@@ -825,7 +808,6 @@ enum ReconnectReason {
new_commit_lsn: Lsn,
threshold: NonZeroU64,
},
SwitchAvailabilityZone,
NoWalTimeout {
current_lsn: Lsn,
current_commit_lsn: Lsn,
@@ -891,7 +873,6 @@ mod tests {
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
},
latest_update,
}
@@ -952,7 +933,6 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
@@ -1115,7 +1095,6 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
@@ -1181,7 +1160,6 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: NodeId(1),
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
@@ -1244,7 +1222,6 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: NodeId(1),
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
discovered_new_wal: Some(NewCommittedWAL {
@@ -1312,74 +1289,4 @@ mod tests {
availability_zone: None,
}
}
#[tokio::test]
async fn switch_to_same_availability_zone() -> anyhow::Result<()> {
// Pageserver and one of safekeepers will be in the same availability zone
// and pageserver should prefer to connect to it.
let test_az = Some("test_az".to_owned());
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
let mut state = dummy_state(&harness).await;
state.availability_zone = test_az.clone();
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
let connected_sk_id = NodeId(0);
let connection_status = WalConnectionStatus {
is_connected: true,
has_processed_wal: true,
latest_connection_update: now,
latest_wal_update: now,
commit_lsn: Some(current_lsn),
streaming_lsn: Some(current_lsn),
};
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
Ok(())
}),
discovered_new_wal: None,
});
// We have another safekeeper with the same commit_lsn, and it have the same availability zone as
// the current pageserver.
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
same_az_sk.timeline.availability_zone = test_az.clone();
state.wal_stream_candidates = HashMap::from([
(
connected_sk_id,
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
),
(NodeId(1), same_az_sk),
]);
// We expect that pageserver will switch to the safekeeper in the same availability zone,
// even if it has the same commit_lsn.
let next_candidate = state.next_connection_candidate().expect(
"Expected one candidate selected out of multiple valid data options, but got none",
);
assert_eq!(next_candidate.safekeeper_id, NodeId(1));
assert_eq!(
next_candidate.reason,
ReconnectReason::SwitchAvailabilityZone,
"Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn"
);
assert_eq!(
next_candidate.wal_source_connconf.host(),
&Host::Domain("same_az".to_owned())
);
Ok(())
}
}

View File

@@ -2,7 +2,6 @@
use std::{
error::Error,
pin::pin,
str::FromStr,
sync::Arc,
time::{Duration, SystemTime},
@@ -18,7 +17,7 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::{select, sync::watch, time};
use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn};
@@ -37,7 +36,7 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use pq_proto::PageserverFeedback;
use pq_proto::ReplicationFeedback;
use utils::lsn::Lsn;
/// Status of the connection.
@@ -188,7 +187,8 @@ pub async fn handle_walreceiver_connection(
let query = format!("START_REPLICATION PHYSICAL {startpoint}");
let copy_stream = replication_client.copy_both_simple(&query).await?;
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
let physical_stream = ReplicationStream::new(copy_stream);
pin!(physical_stream);
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
@@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection(
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let last_received_lsn = u64::from(last_lsn);
let write_lsn = u64::from(last_lsn);
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn());
let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn);
let apply_lsn = u64::from(timeline_remote_consistent_lsn);
let ts = SystemTime::now();
// Update the status about what we just received. This is shown in the mgmt API.
@@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection(
let (timeline_logical_size, _) = timeline
.get_current_logical_size(&ctx)
.context("Status update creation failed to get current logical size")?;
let status_update = PageserverFeedback {
let status_update = ReplicationFeedback {
current_timeline_size: timeline_logical_size,
last_received_lsn,
disk_consistent_lsn,
remote_consistent_lsn,
replytime: ts,
ps_writelsn: write_lsn,
ps_flushlsn: flush_lsn,
ps_applylsn: apply_lsn,
ps_replytime: ts,
};
debug!("neon_status_update {status_update:?}");

View File

@@ -14,7 +14,6 @@
*/
#include <sys/file.h>
#include <sys/statvfs.h>
#include <unistd.h>
#include <fcntl.h>
@@ -35,9 +34,6 @@
#include "storage/fd.h"
#include "storage/pg_shmem.h"
#include "storage/buf_internals.h"
#include "storage/procsignal.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
/*
* Local file cache is used to temporary store relations pages in local file system.
@@ -63,9 +59,6 @@
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
typedef struct FileCacheEntry
{
BufferTag key;
@@ -78,7 +71,6 @@ typedef struct FileCacheEntry
typedef struct FileCacheControl
{
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
dlist_head lru; /* double linked list for LRU replacement algorithm */
} FileCacheControl;
@@ -87,14 +79,12 @@ static int lfc_desc;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_free_space_watermark;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
static void
lfc_shmem_startup(void)
@@ -122,7 +112,6 @@ lfc_shmem_startup(void)
&info,
HASH_ELEM | HASH_BLOBS);
lfc_ctl->size = 0;
lfc_ctl->used = 0;
dlist_init(&lfc_ctl->lru);
/* Remove file cache on restart */
@@ -176,7 +165,7 @@ lfc_change_limit_hook(int newval, void *extra)
}
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
{
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
@@ -186,86 +175,12 @@ lfc_change_limit_hook(int newval, void *extra)
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
lfc_ctl->size -= 1;
}
elog(LOG, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}
/*
* Local file system state monitor check available free space.
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
* but throwing away least recently accessed chunks.
* First time low space watermark is reached cache size is divided by two,
* second time by four,... Finally we remove all chunks from local cache.
*
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
* We only throw away cached chunks but do not prevent from filling cache by new chunks.
*
* Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
* disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
* Calling statvfs each second should not add any noticeable overhead.
*/
void
FileCacheMonitorMain(Datum main_arg)
{
/*
* Choose file system state monitor interval so that space can not be exosted
* during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
*/
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
BackgroundWorkerUnblockSignals();
/* Periodically dump buffers until terminated. */
while (!ShutdownRequestPending)
{
if (lfc_size_limit != 0)
{
struct statvfs sfs;
if (statvfs(lfc_path, &sfs) < 0)
{
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
}
else
{
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
{
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
}
else
lfc_shrinking_factor = 0; /* reset to initial value */
}
}
pg_usleep(monitor_interval);
}
}
static void
lfc_register_free_space_monitor(void)
{
BackgroundWorker bgw;
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
bgw.bgw_restart_time = 5;
bgw.bgw_notify_pid = 0;
bgw.bgw_main_arg = (Datum) 0;
RegisterBackgroundWorker(&bgw);
}
void
lfc_init(void)
{
@@ -302,19 +217,6 @@ lfc_init(void)
lfc_change_limit_hook,
NULL);
DefineCustomIntVariable("neon.free_space_watermark",
"Minimal free space in local file system after reaching which local file cache will be truncated",
NULL,
&lfc_free_space_watermark,
1024, /* 1GB */
0,
INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MB,
NULL,
NULL,
NULL);
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,
@@ -329,9 +231,6 @@ lfc_init(void)
if (lfc_max_size == 0)
return;
if (lfc_free_space_watermark != 0)
lfc_register_free_space_monitor();
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = lfc_shmem_startup;
#if PG_VERSION_NUM>=150000
@@ -481,7 +380,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
* there are should be very large number of concurrent IO operations and them are limited by max_connections,
* we prefer not to complicate code and use second approach.
*/
if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
@@ -491,10 +390,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
elog(LOG, "Swap file cache page");
}
else
{
lfc_ctl->used += 1;
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
}
entry->access_count = 1;
memset(entry->bitmap, 0, sizeof entry->bitmap);
}

View File

@@ -1872,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk)
return sk->state == SS_ACTIVE;
}
/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */
void
ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf)
ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf)
{
uint8 nkeys;
int i;
@@ -1892,45 +1892,45 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->currentClusterSize = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
rf->currentClusterSize);
}
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
else if (strcmp(key, "ps_writelsn") == 0)
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->last_received_lsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
LSN_FORMAT_ARGS(rf->last_received_lsn));
rf->ps_writelsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
LSN_FORMAT_ARGS(rf->ps_writelsn));
}
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
else if (strcmp(key, "ps_flushlsn") == 0)
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
rf->ps_flushlsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
LSN_FORMAT_ARGS(rf->ps_flushlsn));
}
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
else if (strcmp(key, "ps_applylsn") == 0)
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
rf->ps_applylsn = pq_getmsgint64(reply_message);
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
LSN_FORMAT_ARGS(rf->ps_applylsn));
}
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
else if (strcmp(key, "ps_replytime") == 0)
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->replytime = pq_getmsgint64(reply_message);
rf->ps_replytime = pq_getmsgint64(reply_message);
{
char *replyTimeStr;
/* Copy because timestamptz_to_str returns a static buffer */
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
rf->replytime, replyTimeStr);
replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
rf->ps_replytime, replyTimeStr);
pfree(replyTimeStr);
}
@@ -1944,7 +1944,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf
* Skip unknown keys to support backward compatibile protocol
* changes
*/
elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
pq_getmsgbytes(reply_message, len);
};
}
@@ -2024,7 +2024,7 @@ GetAcknowledgedByQuorumWALPosition(void)
}
/*
* WalproposerShmemSize --- report amount of shared memory space needed
* ReplicationFeedbackShmemSize --- report amount of shared memory space needed
*/
Size
WalproposerShmemSize(void)
@@ -2054,10 +2054,10 @@ WalproposerShmemInit(void)
}
void
replication_feedback_set(PageserverFeedback * rf)
replication_feedback_set(ReplicationFeedback * rf)
{
SpinLockAcquire(&walprop_shared->mutex);
memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
SpinLockRelease(&walprop_shared->mutex);
}
@@ -2065,43 +2065,43 @@ void
replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
{
SpinLockAcquire(&walprop_shared->mutex);
*writeLsn = walprop_shared->feedback.last_received_lsn;
*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
*writeLsn = walprop_shared->feedback.ps_writelsn;
*flushLsn = walprop_shared->feedback.ps_flushlsn;
*applyLsn = walprop_shared->feedback.ps_applylsn;
SpinLockRelease(&walprop_shared->mutex);
}
/*
* Get PageserverFeedback fields from the most advanced safekeeper
* Get ReplicationFeedback fields from the most advanced safekeeper
*/
static void
GetLatestNeonFeedback(PageserverFeedback * rf)
GetLatestNeonFeedback(ReplicationFeedback * rf)
{
int latest_safekeeper = 0;
XLogRecPtr last_received_lsn = InvalidXLogRecPtr;
XLogRecPtr ps_writelsn = InvalidXLogRecPtr;
for (int i = 0; i < n_safekeepers; i++)
{
if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn)
{
latest_safekeeper = i;
last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn;
ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn;
}
}
rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime;
rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn;
rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn;
rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
" ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->last_received_lsn),
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
rf->replytime);
LSN_FORMAT_ARGS(rf->ps_writelsn),
LSN_FORMAT_ARGS(rf->ps_flushlsn),
LSN_FORMAT_ARGS(rf->ps_applylsn),
rf->ps_replytime);
replication_feedback_set(rf);
}
@@ -2115,16 +2115,16 @@ HandleSafekeeperResponse(void)
XLogRecPtr minFlushLsn;
minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
diskConsistentLsn = quorumFeedback.rf.ps_flushlsn;
if (!syncSafekeepers)
{
/* Get PageserverFeedback fields from the most advanced safekeeper */
/* Get ReplicationFeedback fields from the most advanced safekeeper */
GetLatestNeonFeedback(&quorumFeedback.rf);
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
}
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
{
if (minQuorumLsn > quorumFeedback.flushLsn)
@@ -2142,7 +2142,7 @@ HandleSafekeeperResponse(void)
* apply_lsn - This is what processed and durably saved at*
* pageserver.
*/
quorumFeedback.rf.disk_consistent_lsn,
quorumFeedback.rf.ps_flushlsn,
GetCurrentTimestamp(), false);
}
@@ -2326,7 +2326,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
ParsePageserverFeedbackMessage(&s, &msg->rf);
ParseReplicationFeedbackMessage(&s, &msg->rf);
pq_getmsgend(&s);
return true;
}
@@ -2462,7 +2462,7 @@ backpressure_lag_impl(void)
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
#define MB ((XLogRecPtr)1024 * 1024)
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
LSN_FORMAT_ARGS(myFlushLsn),
LSN_FORMAT_ARGS(writePtr),
LSN_FORMAT_ARGS(flushPtr),

View File

@@ -280,21 +280,21 @@ typedef struct HotStandbyFeedback
FullTransactionId catalog_xmin;
} HotStandbyFeedback;
typedef struct PageserverFeedback
typedef struct ReplicationFeedback
{
/* current size of the timeline on pageserver */
uint64 currentClusterSize;
/* standby_status_update fields that safekeeper received from pageserver */
XLogRecPtr last_received_lsn;
XLogRecPtr disk_consistent_lsn;
XLogRecPtr remote_consistent_lsn;
TimestampTz replytime;
} PageserverFeedback;
XLogRecPtr ps_writelsn;
XLogRecPtr ps_flushlsn;
XLogRecPtr ps_applylsn;
TimestampTz ps_replytime;
} ReplicationFeedback;
typedef struct WalproposerShmemState
{
slock_t mutex;
PageserverFeedback feedback;
ReplicationFeedback feedback;
term_t mineLastElectedTerm;
pg_atomic_uint64 backpressureThrottlingTime;
} WalproposerShmemState;
@@ -320,10 +320,10 @@ typedef struct AppendResponse
/* Feedback recieved from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;
ReplicationFeedback rf;
} AppendResponse;
/* PageserverFeedback is extensible part of the message that is parsed separately */
/* ReplicationFeedback is extensible part of the message that is parsed separately */
/* Other fields are fixed part */
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
@@ -383,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]);
extern void WalProposerMain(Datum main_arg);
extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
extern void WalProposerPoll(void);
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
PageserverFeedback *rf);
extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
ReplicationFeedback *rf);
extern void StartProposerReplication(StartReplicationCmd *cmd);
extern Size WalproposerShmemSize(void);
extern bool WalproposerShmemInit(void);
extern void replication_feedback_set(PageserverFeedback *rf);
extern void replication_feedback_set(ReplicationFeedback *rf);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
/* libpqwalproposer hooks & helper type */

38
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -79,35 +79,37 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
[[package]]
name = "allure-pytest"
version = "2.13.1"
version = "2.10.0"
description = "Allure pytest integration"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
{file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
{file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
{file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
]
[package.dependencies]
allure-python-commons = "2.13.1"
allure-python-commons = "2.10.0"
pytest = ">=4.5.0"
six = ">=1.9.0"
[[package]]
name = "allure-python-commons"
version = "2.13.1"
version = "2.10.0"
description = "Common module for integrate allure with python-based frameworks"
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.5"
files = [
{file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
{file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
{file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
{file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
]
[package.dependencies]
attrs = ">=16.0.0"
pluggy = ">=0.4.0"
six = ">=1.9.0"
[[package]]
name = "async-timeout"
@@ -1930,22 +1932,6 @@ pytest = [
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
]
[[package]]
name = "pytest-rerunfailures"
version = "11.1.2"
description = "pytest plugin to re-run tests to eliminate flaky failures"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
{file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
]
[package.dependencies]
packaging = ">=17.1"
pytest = ">=5.3"
[[package]]
name = "pytest-timeout"
version = "2.1.0"
@@ -2611,4 +2597,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"

View File

@@ -140,7 +140,7 @@ async fn auth_quirks(
impl BackendType<'_, ClientCredentials<'_>> {
/// Authenticate the client via the requested backend, possibly using credentials.
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
#[tracing::instrument(fields(allow_cleartext), skip_all)]
pub async fn authenticate(
&mut self,
extra: &ConsoleReqExtra<'_>,

View File

@@ -53,7 +53,7 @@ pub async fn password_hack(
.await?;
info!(project = &payload.project, "received missing parameter");
creds.project = Some(payload.project);
creds.project = Some(payload.project.into());
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(payload.password);

View File

@@ -2,7 +2,7 @@
use crate::error::UserFacingError;
use pq_proto::StartupMessageParams;
use std::collections::HashSet;
use std::borrow::Cow;
use thiserror::Error;
use tracing::info;
@@ -19,10 +19,11 @@ pub enum ClientCredsParseError {
InconsistentProjectNames { domain: String, option: String },
#[error(
"Common name inferred from SNI ('{}') is not known",
.cn,
"SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
SNI should be formatted as '<project-name>.{}'.",
.sni, .cn, .cn,
)]
UnknownCommonName { cn: String },
InconsistentSni { sni: String, cn: String },
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
MalformedProjectName(String),
@@ -36,7 +37,7 @@ impl UserFacingError for ClientCredsParseError {}
pub struct ClientCredentials<'a> {
pub user: &'a str,
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<String>,
pub project: Option<Cow<'a, str>>,
}
impl ClientCredentials<'_> {
@@ -50,7 +51,7 @@ impl<'a> ClientCredentials<'a> {
pub fn parse(
params: &'a StartupMessageParams,
sni: Option<&str>,
common_names: Option<HashSet<String>>,
common_name: Option<&str>,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
@@ -59,43 +60,37 @@ impl<'a> ClientCredentials<'a> {
let user = get_param("user")?;
// Project name might be passed via PG's command-line options.
let project_option = params
.options_raw()
.and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
.map(|name| name.to_string());
let project_option = params.options_raw().and_then(|mut options| {
options
.find_map(|opt| opt.strip_prefix("project="))
.map(Cow::Borrowed)
});
let project_from_domain = if let Some(sni_str) = sni {
if let Some(cn) = common_names {
let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
let project = common_name_from_sni
.and_then(|domain| {
if cn.contains(domain) {
subdomain_from_sni(sni_str, domain)
} else {
None
}
// Alternative project name is in fact a subdomain from SNI.
// NOTE: we do not consider SNI if `common_name` is missing.
let project_domain = sni
.zip(common_name)
.map(|(sni, cn)| {
subdomain_from_sni(sni, cn)
.ok_or_else(|| InconsistentSni {
sni: sni.into(),
cn: cn.into(),
})
.ok_or_else(|| UnknownCommonName {
cn: common_name_from_sni.unwrap_or("").into(),
})?;
.map(Cow::<'static, str>::Owned)
})
.transpose()?;
Some(project)
} else {
None
}
} else {
None
};
let project = match (project_option, project_from_domain) {
let project = match (project_option, project_domain) {
// Invariant: if we have both project name variants, they should match.
(Some(option), Some(domain)) if option != domain => {
Some(Err(InconsistentProjectNames { domain, option }))
Some(Err(InconsistentProjectNames {
domain: domain.into(),
option: option.into(),
}))
}
// Invariant: project name may not contain certain characters.
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
false => Err(MalformedProjectName(name)),
false => Err(MalformedProjectName(name.into())),
true => Ok(name),
}),
}
@@ -154,9 +149,9 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let common_name = Some("localhost");
let creds = ClientCredentials::parse(&options, sni, common_names)?;
let creds = ClientCredentials::parse(&options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -182,41 +177,24 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let common_name = Some("localhost");
let creds = ClientCredentials::parse(&options, sni, common_names)?;
let creds = ClientCredentials::parse(&options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
Ok(())
}
#[test]
fn parse_multi_common_names() -> anyhow::Result<()> {
let options = StartupMessageParams::new([("user", "john_doe")]);
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
Ok(())
}
#[test]
fn parse_projects_different() {
let options =
StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let common_name = Some("localhost");
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -231,12 +209,13 @@ mod tests {
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let common_name = Some("example.com");
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
match err {
UnknownCommonName { cn } => {
assert_eq!(cn, "localhost");
InconsistentSni { sni, cn } => {
assert_eq!(sni, "project.localhost");
assert_eq!(cn, "example.com");
}
_ => panic!("bad error: {err:?}"),
}

View File

@@ -1,12 +1,6 @@
use crate::auth;
use anyhow::{bail, ensure, Context, Ok};
use rustls::sign;
use std::{
collections::{HashMap, HashSet},
str::FromStr,
sync::Arc,
time::Duration,
};
use anyhow::{bail, ensure, Context};
use std::{str::FromStr, sync::Arc, time::Duration};
pub struct ProxyConfig {
pub tls_config: Option<TlsConfig>,
@@ -22,7 +16,7 @@ pub struct MetricCollectionConfig {
pub struct TlsConfig {
pub config: Arc<rustls::ServerConfig>,
pub common_names: Option<HashSet<String>>,
pub common_name: Option<String>,
}
impl TlsConfig {
@@ -32,34 +26,28 @@ impl TlsConfig {
}
/// Configure TLS for the main endpoint.
pub fn configure_tls(
key_path: &str,
cert_path: &str,
certs_dir: Option<&String>,
) -> anyhow::Result<TlsConfig> {
let mut cert_resolver = CertResolver::new();
pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
let key = {
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
// add default certificate
cert_resolver.add_cert(key_path, cert_path)?;
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
keys.pop().map(rustls::PrivateKey).unwrap()
};
// add extra certificates
if let Some(certs_dir) = certs_dir {
for entry in std::fs::read_dir(certs_dir)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
// file names aligned with default cert-manager names
let key_path = path.join("tls.key");
let cert_path = path.join("tls.crt");
if key_path.exists() && cert_path.exists() {
cert_resolver
.add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
}
}
}
}
let cert_chain_bytes = std::fs::read(cert_path)
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
let common_names = cert_resolver.get_common_names();
let cert_chain = {
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
.context(format!(
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
))?
.into_iter()
.map(rustls::Certificate)
.collect()
};
let config = rustls::ServerConfig::builder()
.with_safe_default_cipher_suites()
@@ -67,116 +55,27 @@ pub fn configure_tls(
// allow TLS 1.2 to be compatible with older client libraries
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
.with_no_client_auth()
.with_cert_resolver(Arc::new(cert_resolver))
.with_single_cert(cert_chain, key)?
.into();
// determine common name from tls-cert (-c server.crt param).
// used in asserting project name formatting invariant.
let common_name = {
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
.context(format!(
"Failed to parse PEM object from bytes from file at '{cert_path}'."
))?
.1;
let common_name = pem.parse_x509()?.subject().to_string();
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
};
Ok(TlsConfig {
config,
common_names: Some(common_names),
common_name,
})
}
struct CertResolver {
certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
}
impl CertResolver {
fn new() -> Self {
Self {
certs: HashMap::new(),
}
}
fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
let priv_key = {
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
keys.pop().map(rustls::PrivateKey).unwrap()
};
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
let cert_chain_bytes = std::fs::read(cert_path)
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
let cert_chain = {
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
.context(format!(
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
))?
.into_iter()
.map(rustls::Certificate)
.collect()
};
let common_name = {
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
.context(format!(
"Failed to parse PEM object from bytes from file at '{cert_path}'."
))?
.1;
let common_name = pem.parse_x509()?.subject().to_string();
// We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
// wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
// verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
// and passed None instead, which blows up number of cases downstream code should handle. Proper coding
// here should better avoid Option for common_names, and do wildcard-based certificate selection instead
// of cutting off '*.' parts.
if common_name.starts_with("CN=*.") {
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
} else {
common_name.strip_prefix("CN=").map(|s| s.to_string())
}
}
.context(format!(
"Failed to parse common name from certificate at '{cert_path}'."
))?;
self.certs.insert(
common_name,
Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)),
);
Ok(())
}
fn get_common_names(&self) -> HashSet<String> {
self.certs.keys().map(|s| s.to_string()).collect()
}
}
impl rustls::server::ResolvesServerCert for CertResolver {
fn resolve(
&self,
_client_hello: rustls::server::ClientHello,
) -> Option<Arc<rustls::sign::CertifiedKey>> {
// loop here and cut off more and more subdomains until we find
// a match to get a proper wildcard support. OTOH, we now do not
// use nested domains, so keep this simple for now.
//
// With the current coding foo.com will match *.foo.com and that
// repeats behavior of the old code.
if let Some(mut sni_name) = _client_hello.server_name() {
loop {
if let Some(cert) = self.certs.get(sni_name) {
return Some(cert.clone());
}
if let Some((_, rest)) = sni_name.split_once('.') {
sni_name = rest;
} else {
return None;
}
}
} else {
None
}
}
}
/// Helper for cmdline cache options parsing.
pub struct CacheOptions {
/// Max number of entries.

View File

@@ -132,11 +132,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
args.get_one::<String>("tls-key"),
args.get_one::<String>("tls-cert"),
) {
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
key_path,
cert_path,
args.get_one::<String>("certs-dir"),
)?),
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
(None, None) => None,
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
};
@@ -258,12 +254,6 @@ fn cli() -> clap::Command {
.alias("ssl-cert") // backwards compatibility
.help("path to TLS cert for client postgres connections"),
)
// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
.arg(
Arg::new("certs-dir")
.long("certs-dir")
.help("path to directory with TLS certificates for client postgres connections"),
)
.arg(
Arg::new("metric-collection-endpoint")
.long("metric-collection-endpoint")

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use serde::Serialize;
use std::collections::HashMap;
use tracing::{error, info, instrument, trace, warn};
use tracing::{debug, error, info, instrument, trace};
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
@@ -84,14 +84,10 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
let value = ms.get_counter().get_value() as u64;
// Report if the metric value is suspiciously large
if value > (1u64 << 40) {
warn!(
"potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
branch_id, endpoint_id, value
);
}
debug!(
"branch_id {} endpoint_id {} val: {}",
branch_id, endpoint_id, value
);
current_metrics.push((
Ids {
endpoint_id: endpoint_id.to_string(),
@@ -128,15 +124,11 @@ async fn collect_metrics_iteration(
let mut value = *curr_val;
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
// Only send metrics updates if the metric has increased
if curr_val > prev_val {
// Only send metrics updates if the metric has changed
if curr_val - prev_val > 0 {
value = curr_val - prev_val;
start_time = *prev_time;
} else {
if curr_val < prev_val {
error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
prev_val, curr_val, curr_key);
}
return None;
}
};
@@ -197,7 +189,7 @@ async fn collect_metrics_iteration(
})
// update cached value (add delta) and time
.and_modify(|e| {
e.0 = e.0.saturating_add(send_metric.value);
e.0 += send_metric.value;
e.1 = stop_time
})
// cache new metric

View File

@@ -98,7 +98,7 @@ pub async fn task_main(
}
// TODO(tech debt): unite this with its twin below.
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
#[tracing::instrument(fields(session_id), skip_all)]
pub async fn handle_ws_client(
config: &'static ProxyConfig,
cancel_map: &CancelMap,
@@ -124,11 +124,11 @@ pub async fn handle_ws_client(
// Extract credentials which we're going to use for auth.
let creds = {
let common_names = tls.and_then(|tls| tls.common_names.clone());
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_names))
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_name))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -140,7 +140,7 @@ pub async fn handle_ws_client(
.await
}
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
#[tracing::instrument(fields(session_id), skip_all)]
async fn handle_client(
config: &'static ProxyConfig,
cancel_map: &CancelMap,
@@ -163,11 +163,11 @@ async fn handle_client(
// Extract credentials which we're going to use for auth.
let creds = {
let sni = stream.get_ref().sni_hostname();
let common_names = tls.and_then(|tls| tls.common_names.clone());
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, sni, common_names))
.map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?

View File

@@ -54,11 +54,9 @@ fn generate_tls_config<'a>(
.with_single_cert(vec![cert], key)?
.into();
let common_names = Some([common_name.to_owned()].iter().cloned().collect());
TlsConfig {
config,
common_names,
common_name: Some(common_name.to_string()),
}
};

View File

@@ -26,7 +26,7 @@ prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0"
Werkzeug = "^2.2.3"
pytest-order = "^1.0.1"
allure-pytest = "^2.13.1"
allure-pytest = "^2.10.0"
pytest-asyncio = "^0.19.0"
toml = "^0.10.2"
psutil = "^5.9.4"
@@ -34,7 +34,6 @@ types-psutil = "^5.9.5.4"
types-toml = "^0.10.8"
pytest-httpserver = "^1.0.6"
aiohttp = "3.7.4"
pytest-rerunfailures = "^11.1.2"
[tool.poetry.group.dev.dependencies]
black = "^23.1.0"
@@ -70,9 +69,6 @@ strict = true
module = [
"asyncpg.*",
"pg8000.*",
"allure.*",
"allure_commons.*",
"allure_pytest.*",
]
ignore_missing_imports = true

View File

@@ -8,7 +8,13 @@
# warnings and errors right in the editor.
# In vscode, this setting is Rust-analyzer>Check On Save:Command
# manual-range-contains wants
# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)
# instead of
# len < 4 || len > MAX_STARTUP_PACKET_LENGTH
# , let's disagree.
# * `-A unknown_lints` do not warn about unknown lint suppressions
# that people with newer toolchains might use
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.68.2"
channel = "1.66.1"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -5,7 +5,6 @@ use anyhow::{bail, Context, Result};
use clap::Parser;
use remote_storage::RemoteStorageConfig;
use toml_edit::Document;
use utils::signals::ShutdownSignals;
use std::fs::{self, File};
use std::io::{ErrorKind, Write};
@@ -40,7 +39,7 @@ use utils::{
logging::{self, LogFormat},
project_git_version,
sentry_init::init_sentry,
tcp_listener,
signals, tcp_listener,
};
const PID_FILE_NAME: &str = "safekeeper.pid";
@@ -217,6 +216,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
metrics::register_internal(Box::new(timeline_collector))?;
let signals = signals::install_shutdown_handlers()?;
let mut threads = vec![];
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
@@ -274,12 +274,15 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
set_build_info_metric(GIT_VERSION);
// TODO: put more thoughts into handling of failed threads
// We should catch & die if they are in trouble.
// We probably should restart them.
// On any shutdown signal, log receival and exit. Additionally, handling
// SIGQUIT prevents coredump.
ShutdownSignals::handle(|signal| {
info!("received {}, terminating", signal.name());
// NOTE: we still have to handle signals like SIGQUIT to prevent coredumps
signals.handle(|signal| {
// TODO: implement graceful shutdown with joining threads etc
info!(
"received {}, terminating in immediate shutdown mode",
signal.name()
);
std::process::exit(0);
})
}

View File

@@ -242,7 +242,6 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
backup_lsn: sk_info.backup_lsn.0,
local_start_lsn: sk_info.local_start_lsn.0,
availability_zone: None,
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;

View File

@@ -255,7 +255,7 @@ pub struct TimelineCollector {
epoch_start_lsn: GenericGaugeVec<AtomicU64>,
peer_horizon_lsn: GenericGaugeVec<AtomicU64>,
remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
feedback_ps_write_lsn: GenericGaugeVec<AtomicU64>,
feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
timeline_active: GenericGaugeVec<AtomicU64>,
wal_backup_active: GenericGaugeVec<AtomicU64>,
@@ -339,15 +339,15 @@ impl TimelineCollector {
.unwrap();
descs.extend(remote_consistent_lsn.desc().into_iter().cloned());
let ps_last_received_lsn = GenericGaugeVec::new(
let feedback_ps_write_lsn = GenericGaugeVec::new(
Opts::new(
"safekeeper_ps_last_received_lsn",
"safekeeper_feedback_ps_write_lsn",
"Last LSN received by the pageserver, acknowledged in the feedback",
),
&["tenant_id", "timeline_id"],
)
.unwrap();
descs.extend(ps_last_received_lsn.desc().into_iter().cloned());
descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned());
let feedback_last_time_seconds = GenericGaugeVec::new(
Opts::new(
@@ -458,7 +458,7 @@ impl TimelineCollector {
epoch_start_lsn,
peer_horizon_lsn,
remote_consistent_lsn,
ps_last_received_lsn,
feedback_ps_write_lsn,
feedback_last_time_seconds,
timeline_active,
wal_backup_active,
@@ -489,7 +489,7 @@ impl Collector for TimelineCollector {
self.epoch_start_lsn.reset();
self.peer_horizon_lsn.reset();
self.remote_consistent_lsn.reset();
self.ps_last_received_lsn.reset();
self.feedback_ps_write_lsn.reset();
self.feedback_last_time_seconds.reset();
self.timeline_active.reset();
self.wal_backup_active.reset();
@@ -514,11 +514,11 @@ impl Collector for TimelineCollector {
let timeline_id = tli.ttid.timeline_id.to_string();
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
let mut most_advanced: Option<pq_proto::PageserverFeedback> = None;
let mut most_advanced: Option<pq_proto::ReplicationFeedback> = None;
for replica in tli.replicas.iter() {
if let Some(replica_feedback) = replica.pageserver_feedback {
if let Some(current) = most_advanced {
if current.last_received_lsn < replica_feedback.last_received_lsn {
if current.ps_writelsn < replica_feedback.ps_writelsn {
most_advanced = Some(replica_feedback);
}
} else {
@@ -568,10 +568,11 @@ impl Collector for TimelineCollector {
.set(tli.wal_storage.flush_wal_seconds);
if let Some(feedback) = most_advanced {
self.ps_last_received_lsn
self.feedback_ps_write_lsn
.with_label_values(labels)
.set(feedback.last_received_lsn);
if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) {
.set(feedback.ps_writelsn);
if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH)
{
self.feedback_last_time_seconds
.with_label_values(labels)
.set(unix_time.as_secs());
@@ -598,7 +599,7 @@ impl Collector for TimelineCollector {
mfs.extend(self.epoch_start_lsn.collect());
mfs.extend(self.peer_horizon_lsn.collect());
mfs.extend(self.remote_consistent_lsn.collect());
mfs.extend(self.ps_last_received_lsn.collect());
mfs.extend(self.feedback_ps_write_lsn.collect());
mfs.extend(self.feedback_last_time_seconds.collect());
mfs.extend(self.timeline_active.collect());
mfs.extend(self.wal_backup_active.collect());

View File

@@ -18,7 +18,7 @@ use crate::control_file;
use crate::send_wal::HotStandbyFeedback;
use crate::wal_storage;
use pq_proto::{PageserverFeedback, SystemId};
use pq_proto::{ReplicationFeedback, SystemId};
use utils::{
bin_ser::LeSer,
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -360,7 +360,7 @@ pub struct AppendResponse {
// a criterion for walproposer --sync mode exit
pub commit_lsn: Lsn,
pub hs_feedback: HotStandbyFeedback,
pub pageserver_feedback: PageserverFeedback,
pub pageserver_feedback: ReplicationFeedback,
}
impl AppendResponse {
@@ -370,7 +370,7 @@ impl AppendResponse {
flush_lsn: Lsn(0),
commit_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback::empty(),
pageserver_feedback: PageserverFeedback::empty(),
pageserver_feedback: ReplicationFeedback::empty(),
}
}
}
@@ -708,7 +708,7 @@ where
commit_lsn: self.state.commit_lsn,
// will be filled by the upper code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
pageserver_feedback: PageserverFeedback::empty(),
pageserver_feedback: ReplicationFeedback::empty(),
};
trace!("formed AppendResponse {:?}", ar);
ar

Some files were not shown because too many files have changed in this diff Show More