mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-12 15:10:38 +00:00
Compare commits
46 Commits
joonas/may
...
log_subscr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2e448a22ba | ||
|
|
d1231431d6 | ||
|
|
8b445a7222 | ||
|
|
2f2aa9b79b | ||
|
|
c8b9116a97 | ||
|
|
beefc7a810 | ||
|
|
fa0750a37e | ||
|
|
0170611a97 | ||
|
|
1c96957e85 | ||
|
|
02a28c01ca | ||
|
|
c96593b473 | ||
|
|
ef57e73fbf | ||
|
|
4c5a0fdc75 | ||
|
|
4b26783c94 | ||
|
|
6949b45e17 | ||
|
|
3b8ca477ab | ||
|
|
eb7241c798 | ||
|
|
f246aa3ca7 | ||
|
|
188bde7f07 | ||
|
|
7131ac4730 | ||
|
|
2be69af6c3 | ||
|
|
c6b6b7700a | ||
|
|
e2d89f7991 | ||
|
|
25e7d321f4 | ||
|
|
3f91ea28d9 | ||
|
|
7fdc3ea162 | ||
|
|
4763a960d1 | ||
|
|
df086cd139 | ||
|
|
69cb1ee479 | ||
|
|
4e58fd9321 | ||
|
|
f087423a01 | ||
|
|
24d347f50b | ||
|
|
52641eb853 | ||
|
|
d9a57aeed9 | ||
|
|
a9c28be7d0 | ||
|
|
fef77b0cc9 | ||
|
|
168913bdf0 | ||
|
|
aa2e16f307 | ||
|
|
70b18ff481 | ||
|
|
60fc1e8cc8 | ||
|
|
36c1719a07 | ||
|
|
abb53ba36d | ||
|
|
a7028d92b7 | ||
|
|
6c9e3c9551 | ||
|
|
fc3d372f3a | ||
|
|
19d69d515c |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -1,7 +1,6 @@
|
||||
self-hosted-runner:
|
||||
labels:
|
||||
- arm64
|
||||
- gen3
|
||||
- large
|
||||
- large-arm64
|
||||
- small
|
||||
|
||||
@@ -43,7 +43,7 @@ inputs:
|
||||
pg_version:
|
||||
description: 'Postgres version to use for tests'
|
||||
required: false
|
||||
default: 'v14'
|
||||
default: 'v16'
|
||||
benchmark_durations:
|
||||
description: 'benchmark durations JSON'
|
||||
required: false
|
||||
@@ -83,7 +83,6 @@ runs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
|
||||
@@ -70,7 +70,6 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
@@ -208,7 +207,7 @@ jobs:
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
#nextest does not yet support running doctests
|
||||
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
@@ -263,7 +262,6 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
|
||||
2
.github/workflows/actionlint.yml
vendored
2
.github/workflows/actionlint.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
grep -ERl $PAT .github/workflows |\
|
||||
while read -r f
|
||||
do
|
||||
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
|
||||
l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1)
|
||||
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
|
||||
done
|
||||
exit 1
|
||||
|
||||
61
.github/workflows/benchmarking.yml
vendored
61
.github/workflows/benchmarking.yml
vendored
@@ -96,7 +96,7 @@ jobs:
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
@@ -146,6 +146,7 @@ jobs:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -154,7 +155,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: |
|
||||
Periodic perf testing: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -176,7 +180,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -215,15 +219,23 @@ jobs:
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
|
||||
slack-message: |
|
||||
Periodic replication testing: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -325,7 +337,7 @@ jobs:
|
||||
prepare_AWS_RDS_databases:
|
||||
uses: ./.github/workflows/_benchmarking_preparation.yml
|
||||
secrets: inherit
|
||||
|
||||
|
||||
pgbench-compare:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
needs: [ generate-matrices, prepare_AWS_RDS_databases ]
|
||||
@@ -365,7 +377,7 @@ jobs:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -460,6 +472,7 @@ jobs:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -468,7 +481,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: |
|
||||
Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -542,7 +558,7 @@ jobs:
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
|
||||
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
@@ -577,8 +593,9 @@ jobs:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -587,7 +604,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: |
|
||||
Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -670,6 +690,7 @@ jobs:
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -678,7 +699,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: |
|
||||
Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -764,6 +788,7 @@ jobs:
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -772,7 +797,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: |
|
||||
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -843,6 +871,7 @@ jobs:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -851,6 +880,10 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: |
|
||||
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
54
.github/workflows/build_and_test.yml
vendored
54
.github/workflows/build_and_test.yml
vendored
@@ -48,7 +48,7 @@ jobs:
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
@@ -101,9 +101,6 @@ jobs:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
@@ -142,7 +139,6 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Disabled for now
|
||||
# - name: Restore cargo deps cache
|
||||
@@ -204,7 +200,7 @@ jobs:
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
|
||||
include:
|
||||
- build-type: release
|
||||
arch: arm64
|
||||
@@ -224,7 +220,7 @@ jobs:
|
||||
outputs:
|
||||
json: ${{ steps.get-benchmark-durations.outputs.json }}
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
@@ -257,7 +253,7 @@ jobs:
|
||||
benchmarks:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
@@ -284,6 +280,7 @@ jobs:
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
||||
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
||||
pg_version: v16
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -302,9 +299,8 @@ jobs:
|
||||
with:
|
||||
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
||||
slack-message: |
|
||||
Benchmarks failed on main: ${{ github.event.head_commit.url }}
|
||||
|
||||
Allure report: ${{ needs.create-test-report.outputs.report-url }}
|
||||
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
||||
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -314,7 +310,7 @@ jobs:
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
@@ -361,7 +357,7 @@ jobs:
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
@@ -475,7 +471,7 @@ jobs:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -503,7 +499,10 @@ jobs:
|
||||
- uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
# ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure)
|
||||
# https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md
|
||||
build-args: |
|
||||
ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
@@ -551,7 +550,7 @@ jobs:
|
||||
version: [ v14, v15, v16 ]
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -696,7 +695,7 @@ jobs:
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
runs-on: [ self-hosted, large ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -745,7 +744,7 @@ jobs:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -960,7 +959,7 @@ jobs:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
@@ -980,7 +979,6 @@ jobs:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Trigger deploy workflow
|
||||
@@ -988,10 +986,10 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
@@ -1001,14 +999,14 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
@@ -1018,7 +1016,7 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f branch=main \
|
||||
@@ -1061,7 +1059,7 @@ jobs:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
|
||||
if: github.ref_name == 'release'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
@@ -1117,10 +1115,12 @@ jobs:
|
||||
# Format `needs` differently to make the list more readable.
|
||||
# Usually we do `needs: [...]`
|
||||
needs:
|
||||
- build-and-test-locally
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
- build-and-test-locally
|
||||
- promote-images
|
||||
- test-images
|
||||
- trigger-custom-extensions-build-and-wait
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
# The list of possible results:
|
||||
|
||||
37
.github/workflows/label-for-external-users.yml
vendored
37
.github/workflows/label-for-external-users.yml
vendored
@@ -4,7 +4,7 @@ on:
|
||||
issues:
|
||||
types:
|
||||
- opened
|
||||
pull_request:
|
||||
pull_request_target:
|
||||
types:
|
||||
- opened
|
||||
|
||||
@@ -15,21 +15,40 @@ env:
|
||||
LABEL: external
|
||||
|
||||
jobs:
|
||||
check-user:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
outputs:
|
||||
is-member: ${{ steps.check-user.outputs.is-member }}
|
||||
|
||||
steps:
|
||||
- name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
|
||||
id: check-user
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
|
||||
is_member=true
|
||||
else
|
||||
is_member=false
|
||||
fi
|
||||
|
||||
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
add-label:
|
||||
# This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
|
||||
# Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
|
||||
if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
|
||||
if: needs.check-user.outputs.is-member == 'false'
|
||||
needs: [ check-user ]
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
pull-requests: write
|
||||
issues: write
|
||||
pull-requests: write # for `gh pr edit`
|
||||
issues: write # for `gh issue edit`
|
||||
|
||||
steps:
|
||||
- name: Label new ${{ github.event_name }}
|
||||
- name: Add `${{ env.LABEL }}` label
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
|
||||
GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
|
||||
ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
|
||||
GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
|
||||
run: |
|
||||
gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}
|
||||
|
||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -56,7 +56,6 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install macOS postgres dependencies
|
||||
run: brew install flex bison openssl protobuf icu4c pkg-config
|
||||
@@ -158,7 +157,6 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
|
||||
2
.github/workflows/periodic_pagebench.yml
vendored
2
.github/workflows/periodic_pagebench.yml
vendored
@@ -27,7 +27,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
trigger_bench_on_ec2_machine_in_eu_central_1:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
|
||||
273
Cargo.lock
generated
273
Cargo.lock
generated
@@ -484,7 +484,7 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http 1.1.0",
|
||||
"once_cell",
|
||||
"p256",
|
||||
"p256 0.11.1",
|
||||
"percent-encoding",
|
||||
"ring 0.17.6",
|
||||
"sha2",
|
||||
@@ -848,6 +848,12 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
|
||||
|
||||
[[package]]
|
||||
name = "base16ct"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.13.1"
|
||||
@@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.16.0"
|
||||
version = "1.16.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
|
||||
checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
@@ -1526,8 +1532,10 @@ version = "0.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
"rand_core 0.6.4",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
|
||||
dependencies = [
|
||||
"const-oid",
|
||||
"pem-rfc7468",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
@@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
||||
dependencies = [
|
||||
"block-buffer",
|
||||
"const-oid",
|
||||
"crypto-common",
|
||||
"subtle",
|
||||
]
|
||||
@@ -1771,11 +1781,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
|
||||
dependencies = [
|
||||
"der 0.6.1",
|
||||
"elliptic-curve",
|
||||
"rfc6979",
|
||||
"elliptic-curve 0.12.3",
|
||||
"rfc6979 0.3.1",
|
||||
"signature 1.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ecdsa"
|
||||
version = "0.16.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
|
||||
dependencies = [
|
||||
"der 0.7.8",
|
||||
"digest",
|
||||
"elliptic-curve 0.13.8",
|
||||
"rfc6979 0.4.0",
|
||||
"signature 2.2.0",
|
||||
"spki 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.8.1"
|
||||
@@ -1788,16 +1812,36 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
|
||||
dependencies = [
|
||||
"base16ct",
|
||||
"base16ct 0.1.1",
|
||||
"crypto-bigint 0.4.9",
|
||||
"der 0.6.1",
|
||||
"digest",
|
||||
"ff",
|
||||
"ff 0.12.1",
|
||||
"generic-array",
|
||||
"group",
|
||||
"pkcs8",
|
||||
"group 0.12.1",
|
||||
"pkcs8 0.9.0",
|
||||
"rand_core 0.6.4",
|
||||
"sec1",
|
||||
"sec1 0.3.0",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "elliptic-curve"
|
||||
version = "0.13.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
|
||||
dependencies = [
|
||||
"base16ct 0.2.0",
|
||||
"crypto-bigint 0.5.5",
|
||||
"digest",
|
||||
"ff 0.13.0",
|
||||
"generic-array",
|
||||
"group 0.13.0",
|
||||
"pem-rfc7468",
|
||||
"pkcs8 0.10.2",
|
||||
"rand_core 0.6.4",
|
||||
"sec1 0.7.3",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
@@ -1951,6 +1995,16 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ff"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449"
|
||||
dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.22"
|
||||
@@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
|
||||
dependencies = [
|
||||
"typenum",
|
||||
"version_check",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2214,7 +2269,18 @@ version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
|
||||
dependencies = [
|
||||
"ff",
|
||||
"ff 0.12.1",
|
||||
"rand_core 0.6.4",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "group"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
|
||||
dependencies = [
|
||||
"ff 0.13.0",
|
||||
"rand_core 0.6.4",
|
||||
"subtle",
|
||||
]
|
||||
@@ -2776,6 +2842,42 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jose-b64"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
"serde",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jose-jwa"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jose-jwk"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7"
|
||||
dependencies = [
|
||||
"jose-b64",
|
||||
"jose-jwa",
|
||||
"p256 0.13.2",
|
||||
"p384",
|
||||
"rsa",
|
||||
"serde",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.69"
|
||||
@@ -2835,6 +2937,9 @@ name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
dependencies = [
|
||||
"spin 0.5.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazycell"
|
||||
@@ -3204,6 +3309,23 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint-dig"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"lazy_static",
|
||||
"libm",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
"num-traits",
|
||||
"rand 0.8.5",
|
||||
"smallvec",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.4.4"
|
||||
@@ -3481,11 +3603,33 @@ version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
|
||||
dependencies = [
|
||||
"ecdsa",
|
||||
"elliptic-curve",
|
||||
"ecdsa 0.14.8",
|
||||
"elliptic-curve 0.12.3",
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "p256"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
|
||||
dependencies = [
|
||||
"ecdsa 0.16.9",
|
||||
"elliptic-curve 0.13.8",
|
||||
"primeorder",
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "p384"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209"
|
||||
dependencies = [
|
||||
"elliptic-curve 0.13.8",
|
||||
"primeorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
@@ -3847,6 +3991,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem-rfc7468"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.2.0"
|
||||
@@ -3913,6 +4066,17 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
|
||||
|
||||
[[package]]
|
||||
name = "pkcs1"
|
||||
version = "0.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
|
||||
dependencies = [
|
||||
"der 0.7.8",
|
||||
"pkcs8 0.10.2",
|
||||
"spki 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkcs8"
|
||||
version = "0.9.0"
|
||||
@@ -3923,6 +4087,16 @@ dependencies = [
|
||||
"spki 0.6.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkcs8"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
|
||||
dependencies = [
|
||||
"der 0.7.8",
|
||||
"spki 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.27"
|
||||
@@ -4116,6 +4290,15 @@ dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "primeorder"
|
||||
version = "0.13.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
|
||||
dependencies = [
|
||||
"elliptic-curve 0.13.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.20+deprecated"
|
||||
@@ -4233,6 +4416,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"atomic-take",
|
||||
@@ -4250,6 +4434,7 @@ dependencies = [
|
||||
"consumption_metrics",
|
||||
"crossbeam-deque",
|
||||
"dashmap",
|
||||
"ecdsa 0.16.9",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"framed-websockets",
|
||||
@@ -4270,12 +4455,15 @@ dependencies = [
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
"itertools 0.10.5",
|
||||
"jose-jwa",
|
||||
"jose-jwk",
|
||||
"lasso",
|
||||
"md5",
|
||||
"measured",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"p256 0.13.2",
|
||||
"parking_lot 0.12.1",
|
||||
"parquet",
|
||||
"parquet_derive",
|
||||
@@ -4296,6 +4484,7 @@ dependencies = [
|
||||
"reqwest-retry",
|
||||
"reqwest-tracing",
|
||||
"routerify",
|
||||
"rsa",
|
||||
"rstest",
|
||||
"rustc-hash",
|
||||
"rustls 0.22.4",
|
||||
@@ -4305,6 +4494,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"signature 2.2.0",
|
||||
"smallvec",
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
@@ -4807,6 +4997,16 @@ dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rfc6979"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
|
||||
dependencies = [
|
||||
"hmac",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.16.20"
|
||||
@@ -4867,6 +5067,26 @@ dependencies = [
|
||||
"archery",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rsa"
|
||||
version = "0.9.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
|
||||
dependencies = [
|
||||
"const-oid",
|
||||
"digest",
|
||||
"num-bigint-dig",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"pkcs1",
|
||||
"pkcs8 0.10.2",
|
||||
"rand_core 0.6.4",
|
||||
"signature 2.2.0",
|
||||
"spki 0.7.3",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rstest"
|
||||
version = "0.18.2"
|
||||
@@ -5195,10 +5415,24 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
|
||||
dependencies = [
|
||||
"base16ct",
|
||||
"base16ct 0.1.1",
|
||||
"der 0.6.1",
|
||||
"generic-array",
|
||||
"pkcs8",
|
||||
"pkcs8 0.9.0",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sec1"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
|
||||
dependencies = [
|
||||
"base16ct 0.2.0",
|
||||
"der 0.7.8",
|
||||
"generic-array",
|
||||
"pkcs8 0.10.2",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
@@ -5545,6 +5779,7 @@ version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
@@ -7379,13 +7614,17 @@ dependencies = [
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-utils",
|
||||
"crypto-bigint 0.5.5",
|
||||
"der 0.7.8",
|
||||
"deranged",
|
||||
"digest",
|
||||
"either",
|
||||
"fail",
|
||||
"futures-channel",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-util",
|
||||
"generic-array",
|
||||
"getrandom 0.2.11",
|
||||
"hashbrown 0.14.5",
|
||||
"hex",
|
||||
@@ -7393,6 +7632,7 @@ dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"indexmap 1.9.3",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"memchr",
|
||||
@@ -7416,7 +7656,9 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"signature 2.2.0",
|
||||
"smallvec",
|
||||
"spki 0.7.3",
|
||||
"subtle",
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.52",
|
||||
@@ -7527,6 +7769,7 @@ version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"zeroize_derive",
|
||||
]
|
||||
|
||||
|
||||
@@ -35,8 +35,9 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||
COPY --chown=nonroot . .
|
||||
|
||||
ARG ADDITIONAL_RUSTFLAGS
|
||||
RUN set -e \
|
||||
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
|
||||
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
@@ -379,7 +379,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
Ok(_) => Ok(false),
|
||||
|
||||
@@ -15,7 +15,9 @@ use control_plane::local_env::{
|
||||
};
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::storage_controller::{
|
||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
||||
};
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
@@ -52,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
const DEFAULT_PG_VERSION: &str = "16";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
@@ -1052,6 +1054,36 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
|
||||
humantime_duration.as_ref()
|
||||
}
|
||||
|
||||
fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
|
||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
||||
|
||||
let base_port = args.get_one::<u16>("base-port");
|
||||
|
||||
if maybe_instance_id.is_some() && base_port.is_none() {
|
||||
panic!("storage-controller start specificied instance-id but did not provide base-port");
|
||||
}
|
||||
|
||||
let start_timeout = args
|
||||
.get_one::<humantime::Duration>("start-timeout")
|
||||
.expect("invalid value for start-timeout");
|
||||
|
||||
NeonStorageControllerStartArgs {
|
||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
||||
base_port: base_port.copied(),
|
||||
start_timeout: *start_timeout,
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
|
||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
||||
let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
NeonStorageControllerStopArgs {
|
||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
@@ -1113,19 +1145,14 @@ async fn handle_storage_controller(
|
||||
let svc = StorageController::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", start_match)) => {
|
||||
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
|
||||
if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("stop", stop_match)) => {
|
||||
let immediate = stop_match
|
||||
.get_one::<String>("stop-mode")
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate).await {
|
||||
if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1228,7 +1255,12 @@ async fn handle_start_all(
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.start(retry_timeout).await {
|
||||
if let Err(e) = storage_controller
|
||||
.start(NeonStorageControllerStartArgs::with_default_instance_id(
|
||||
(*retry_timeout).into(),
|
||||
))
|
||||
.await
|
||||
{
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1358,10 +1390,21 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
eprintln!("neon broker stop failed: {e:#}");
|
||||
}
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
// Stop all storage controller instances. In the most common case there's only one,
|
||||
// but iterate though the base data directory in order to discover the instances.
|
||||
let storcon_instances = env
|
||||
.storage_controller_instances()
|
||||
.await
|
||||
.expect("Must inspect data dir");
|
||||
for (instance_id, _instance_dir_path) in storcon_instances {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.stop(immediate).await {
|
||||
eprintln!("storage controller stop failed: {e:#}");
|
||||
let stop_args = NeonStorageControllerStopArgs {
|
||||
instance_id,
|
||||
immediate,
|
||||
};
|
||||
|
||||
if let Err(e) = storage_controller.stop(stop_args).await {
|
||||
eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1501,6 +1544,18 @@ fn cli() -> Command {
|
||||
.action(ArgAction::SetTrue)
|
||||
.required(false);
|
||||
|
||||
let instance_id = Arg::new("instance-id")
|
||||
.long("instance-id")
|
||||
.help("Identifier used to distinguish storage controller instances (default 1)")
|
||||
.value_parser(value_parser!(u8))
|
||||
.required(false);
|
||||
|
||||
let base_port = Arg::new("base-port")
|
||||
.long("base-port")
|
||||
.help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
|
||||
.value_parser(value_parser!(u16))
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1609,9 +1664,12 @@ fn cli() -> Command {
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start storage controller")
|
||||
.arg(timeout_arg.clone()))
|
||||
.arg(timeout_arg.clone())
|
||||
.arg(instance_id.clone())
|
||||
.arg(base_port))
|
||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(instance_id))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
|
||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
//
|
||||
// This data structures represents neon_local CLI config
|
||||
@@ -156,6 +156,11 @@ pub struct NeonStorageControllerConf {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_warming_up: Duration,
|
||||
|
||||
pub start_as_candidate: bool,
|
||||
|
||||
/// Database url used when running multiple storage controller instances
|
||||
pub database_url: Option<SocketAddr>,
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
|
||||
@@ -174,6 +179,8 @@ impl Default for NeonStorageControllerConf {
|
||||
Self {
|
||||
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
||||
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
||||
start_as_candidate: false,
|
||||
database_url: None,
|
||||
split_threshold: None,
|
||||
max_secondary_lag_bytes: None,
|
||||
}
|
||||
@@ -392,6 +399,36 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
||||
/// for all storage controller instances
|
||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
||||
let mut instances = Vec::default();
|
||||
|
||||
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
|
||||
for dentry in dir {
|
||||
let dentry = dentry?;
|
||||
let is_dir = dentry.metadata()?.is_dir();
|
||||
let filename = dentry.file_name().into_string().unwrap();
|
||||
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
|
||||
Some(suffix) => suffix.parse::<u8>().ok(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let is_instance_dir = is_dir && parsed_instance_id.is_some();
|
||||
|
||||
if !is_instance_dir {
|
||||
continue;
|
||||
}
|
||||
|
||||
instances.push((
|
||||
parsed_instance_id.expect("Checked previously"),
|
||||
dentry.path(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(instances)
|
||||
}
|
||||
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
|
||||
@@ -3,6 +3,8 @@ use crate::{
|
||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||
};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
@@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, str::FromStr, time::Duration};
|
||||
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -29,12 +31,14 @@ use utils::{
|
||||
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
config: NeonStorageControllerConf,
|
||||
|
||||
// The listen addresses is learned when starting the storage controller,
|
||||
// hence the use of OnceLock to init it at the right time.
|
||||
listen: OnceLock<SocketAddr>,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
@@ -43,6 +47,36 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
|
||||
pub struct NeonStorageControllerStartArgs {
|
||||
pub instance_id: u8,
|
||||
pub base_port: Option<u16>,
|
||||
pub start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStartArgs {
|
||||
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
base_port: None,
|
||||
start_timeout,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NeonStorageControllerStopArgs {
|
||||
pub instance_id: u8,
|
||||
pub immediate: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStopArgs {
|
||||
pub fn with_default_instance_id(immediate: bool) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -67,23 +101,6 @@ pub struct InspectResponse {
|
||||
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
@@ -126,20 +143,28 @@ impl StorageController {
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
listen,
|
||||
private_key,
|
||||
public_key,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
config: env.storage_controller.clone(),
|
||||
listen: OnceLock::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||
.expect("non-Unicode path")
|
||||
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join(format!("storage_controller_{}", instance_id))
|
||||
}
|
||||
|
||||
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.storage_controller_instance_dir(instance_id)
|
||||
.join("storage_controller.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store storage controller state
|
||||
@@ -184,23 +209,23 @@ impl StorageController {
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
/// Create our database if it doesn't exist
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let createdb_path = pg_bin_dir.join("createdb");
|
||||
@@ -209,7 +234,7 @@ impl StorageController {
|
||||
"-h",
|
||||
"localhost",
|
||||
"-p",
|
||||
&format!("{}", self.postgres_port),
|
||||
&format!("{}", postgres_port),
|
||||
DB_NAME,
|
||||
])
|
||||
.output()
|
||||
@@ -230,13 +255,14 @@ impl StorageController {
|
||||
|
||||
pub async fn connect_to_database(
|
||||
&self,
|
||||
postgres_port: u16,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
tokio_postgres::Config::new()
|
||||
.host("localhost")
|
||||
.port(self.postgres_port)
|
||||
.port(postgres_port)
|
||||
// The user is the ambient operating system user name.
|
||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||
//
|
||||
@@ -252,72 +278,114 @@ impl StorageController {
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
|
||||
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
|
||||
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
|
||||
if err.kind() != std::io::ErrorKind::AlreadyExists {
|
||||
panic!("Failed to create instance dir {instance_dir:?}");
|
||||
}
|
||||
}
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
let (listen, postgres_port) = {
|
||||
if let Some(base_port) = start_args.base_port {
|
||||
(
|
||||
format!("127.0.0.1:{base_port}"),
|
||||
self.config
|
||||
.database_url
|
||||
.expect("--base-port requires NeonStorageControllerConf::database_url")
|
||||
.port(),
|
||||
)
|
||||
} else {
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
(listen, listen_url.port().unwrap() + 1)
|
||||
}
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
let socket_addr = listen
|
||||
.parse()
|
||||
.expect("listen address is a valid socket address");
|
||||
self.listen
|
||||
.set(socket_addr)
|
||||
.expect("StorageController::listen is only set here");
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
// Do we remove the pid file on stop?
|
||||
let pg_started = self.is_postgres_running().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
retry_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
if !pg_started {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
&start_args.start_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir, postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
self.setup_database(postgres_port).await?;
|
||||
}
|
||||
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
|
||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||
// This is used by the test suite.
|
||||
@@ -339,7 +407,7 @@ impl StorageController {
|
||||
}
|
||||
}
|
||||
};
|
||||
let (mut client, conn) = self.connect_to_database().await?;
|
||||
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
|
||||
let conn = tokio::spawn(conn);
|
||||
let tx = client.build_transaction();
|
||||
let tx = tx.start().await?;
|
||||
@@ -348,9 +416,20 @@ impl StorageController {
|
||||
drop(client);
|
||||
conn.await??;
|
||||
|
||||
let listen = self
|
||||
.listen
|
||||
.get()
|
||||
.expect("cell is set earlier in this function");
|
||||
let address_for_peers = Uri::builder()
|
||||
.scheme("http")
|
||||
.authority(format!("{}:{}", listen.ip(), listen.port()))
|
||||
.path_and_query("")
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
&listen.to_string(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
@@ -358,15 +437,27 @@ impl StorageController {
|
||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||
"--max-warming-up-interval",
|
||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||
"--address-for-peers",
|
||||
&address_for_peers.to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if self.config.start_as_candidate {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
|
||||
let peer_claims = Claims::new(None, Scope::Admin);
|
||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
||||
.expect("failed to generate jwt token");
|
||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key) = &self.public_key {
|
||||
@@ -394,15 +485,15 @@ impl StorageController {
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&instance_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
retry_timeout,
|
||||
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
|
||||
&start_args.start_timeout,
|
||||
|| async {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
@@ -415,8 +506,35 @@ impl StorageController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
|
||||
background_process::stop_process(
|
||||
stop_args.immediate,
|
||||
COMMAND,
|
||||
&self.pid_file(stop_args.instance_id),
|
||||
)?;
|
||||
|
||||
let storcon_instances = self.env.storage_controller_instances().await?;
|
||||
for (instance_id, instanced_dir_path) in storcon_instances {
|
||||
if instance_id == stop_args.instance_id {
|
||||
continue;
|
||||
}
|
||||
|
||||
let pid_file = instanced_dir_path.join("storage_controller.pid");
|
||||
let pid = tokio::fs::read_to_string(&pid_file)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
|
||||
})?
|
||||
.parse::<i32>()
|
||||
.expect("pid is valid i32");
|
||||
|
||||
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
|
||||
if other_proc_alive {
|
||||
// There is another storage controller instance running, so we return
|
||||
// and leave the database running.
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -429,27 +547,51 @@ impl StorageController {
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||
match self.is_postgres_running().await {
|
||||
Ok(false) => {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(true) => {
|
||||
anyhow::bail!("Failed to stop storage controller database");
|
||||
}
|
||||
Err(err) => {
|
||||
anyhow::bail!("Failed to stop storage controller database: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
const PG_NO_DATA_DIR: i32 = 4;
|
||||
const PG_STATUS_RUNNING: i32 = 0;
|
||||
match status_exitcode.code() {
|
||||
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
|
||||
Some(PG_NO_DATA_DIR) => Ok(false),
|
||||
Some(PG_STATUS_RUNNING) => Ok(true),
|
||||
Some(code) => Err(anyhow::anyhow!(
|
||||
"pg_ctl status returned unexpected status code: {:?}",
|
||||
code
|
||||
)),
|
||||
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||
let category = match path.find('/') {
|
||||
Some(idx) => &path[..idx],
|
||||
@@ -475,15 +617,31 @@ impl StorageController {
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
// In the special case of the `storage_controller start` subcommand, we wish
|
||||
// to use the API endpoint of the newly started storage controller in order
|
||||
// to pass the readiness check. In this scenario [`Self::listen`] will be set
|
||||
// (see [`Self::start`]).
|
||||
//
|
||||
// Otherwise, we infer the storage controller api endpoint from the configured
|
||||
// control plane API.
|
||||
let url = if let Some(socket_addr) = self.listen.get() {
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
socket_addr.ip().to_canonical(),
|
||||
socket_addr.port()
|
||||
))
|
||||
.unwrap()
|
||||
} else {
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
|
||||
@@ -622,6 +622,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
threshold: threshold.into(),
|
||||
},
|
||||
)),
|
||||
heatmap_period: Some("300s".to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
|
||||
@@ -22,7 +22,10 @@ feature-depth = 1
|
||||
[advisories]
|
||||
db-urls = ["https://github.com/rustsec/advisory-db"]
|
||||
yanked = "warn"
|
||||
ignore = []
|
||||
|
||||
[[advisories.ignore]]
|
||||
id = "RUSTSEC-2023-0071"
|
||||
reason = "the marvin attack only affects private key decryption, not public key signature verification"
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
|
||||
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
|
||||
during the restart at 2024-04-03 16:37 UTC.
|
||||
|
||||
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||
|
||||
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||
|
||||
265
docs/rfcs/036-physical-replication.md
Normal file
265
docs/rfcs/036-physical-replication.md
Normal file
@@ -0,0 +1,265 @@
|
||||
# Physical Replication
|
||||
|
||||
This RFC is a bit special in that we have already implemented physical
|
||||
replication a long time ago. However, we never properly wrote down all
|
||||
the decisions and assumptions, and in the last months when more users
|
||||
have started to use the feature, numerous issues have surfaced.
|
||||
|
||||
This RFC documents the design decisions that have been made.
|
||||
|
||||
## Summary
|
||||
|
||||
PostgreSQL has a feature called streaming replication, where a replica
|
||||
streams WAL from the primary and continuously applies it. It is also
|
||||
known as "physical replication", to distinguish it from logical
|
||||
replication. In PostgreSQL, a replica is initialized by taking a
|
||||
physical backup of the primary. In Neon, the replica is initialized
|
||||
from a slim "base backup" from the pageserver, just like a primary,
|
||||
and the primary and the replicas connect to the same pageserver,
|
||||
sharing the storage.
|
||||
|
||||
There are two kinds of read-only replicas in Neon:
|
||||
- replicas that follow the primary, and
|
||||
- "static" replicas that are pinned at a particular LSN.
|
||||
|
||||
A static replica is useful e.g. for performing time-travel queries and
|
||||
running one-off slow queries without affecting the primary. A replica
|
||||
that follows the primary can be used e.g. to scale out read-only
|
||||
workloads.
|
||||
|
||||
## Motivation
|
||||
|
||||
Read-only replicas allow offloading read-only queries. It's useful for
|
||||
isolation, if you want to make sure that read-only queries don't
|
||||
affect the primary, and it's also an easy way to provide guaranteed
|
||||
read-only access to an application, without having to mess with access
|
||||
controls.
|
||||
|
||||
## Non Goals (if relevant)
|
||||
|
||||
This RFC is all about WAL-based *physical* replication. Logical
|
||||
replication is a different feature.
|
||||
|
||||
Neon also has the capability to launch "static" read-only nodes which
|
||||
do not follow the primary, but are pinned to a particular LSN. They
|
||||
can be used for long-running one-off queries, or for Point-in-time
|
||||
queries. They work similarly to read replicas that follow the primary,
|
||||
but some things are simpler: there are no concerns about cache
|
||||
invalidation when the data changes on the primary, or worrying about
|
||||
transactions that are in-progress on the primary.
|
||||
|
||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||
|
||||
- Control plane launches the replica
|
||||
- Replica Postgres instance connects to the safekeepers, to stream the WAL
|
||||
- The primary does not know about the standby, except for the hot standby feedback
|
||||
- The primary and replicas all connect to the same pageservers
|
||||
|
||||
|
||||
# Context
|
||||
|
||||
Some useful things to know about hot standby and replicas in
|
||||
PostgreSQL.
|
||||
|
||||
## PostgreSQL startup sequence
|
||||
|
||||
"Running" and "start up" terms are little imprecise. PostgreSQL
|
||||
replica startup goes through several stages:
|
||||
|
||||
1. First, the process is started up, and various initialization steps
|
||||
are performed, like initializing shared memory. If you try to
|
||||
connect to the server in this stage, you get an error: ERROR: the
|
||||
database system is starting up. This stage happens very quickly, no
|
||||
|
||||
2. Then the server reads the checpoint record from the WAL and starts
|
||||
the WAL replay starting from the checkpoint. This works differently
|
||||
in Neon: we start the WAL replay at the basebackup LSN, not from a
|
||||
checkpoint! If you connect to the server in this state, you get an
|
||||
error: ERROR: the database system is not yet accepting
|
||||
connections. We proceed to the next stage, when the WAL replay sees
|
||||
a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
|
||||
can allow us to move directly to next stage, with all the caveats
|
||||
listed in this RFC.
|
||||
|
||||
3. When the running-xacts information is established, the server
|
||||
starts to accept connections normally.
|
||||
|
||||
From PostgreSQL's point of view, the server is already running in
|
||||
stage 2, even though it's not accepting connections yet. Our
|
||||
`compute_ctl` does not consider it as running until stage 3. If the
|
||||
transition from stage 2 to 3 doesn't happen fast enough, the control
|
||||
plane will mark the start operation as failed.
|
||||
|
||||
|
||||
## Decisions, Issues
|
||||
|
||||
### Cache invalidation in replica
|
||||
|
||||
When a read replica follows the primary in PostgreSQL, it needs to
|
||||
stream all the WAL from the primary and apply all the records, to keep
|
||||
the local copy of the data consistent with the primary. In Neon, the
|
||||
replica can fetch the updated page versions from the pageserver, so
|
||||
it's not necessary to apply all the WAL. However, it needs to ensure
|
||||
that any pages that are currently in the Postgres buffer cache, or the
|
||||
Local File Cache, are either updated, or thrown away so that the next
|
||||
read of the page will fetch the latest version.
|
||||
|
||||
We choose to apply the WAL records for pages that are already in the
|
||||
buffer cache, and skip records for other pages. Somewhat arbitrarily,
|
||||
we also apply records affecting catalog relations, fetching the old
|
||||
page version from the pageserver if necessary first. See
|
||||
`neon_redo_read_buffer_filter()` function.
|
||||
|
||||
The replica wouldn't necessarily need to see all the WAL records, only
|
||||
the records that apply to cached pages. For simplicity, we do stream
|
||||
all the WAL to the replica, and the replica simply ignores WAL records
|
||||
that require no action.
|
||||
|
||||
Like in PostgreSQL, the read replica maintains a "replay LSN", which
|
||||
is the LSN up to which the replica has received and replayed the
|
||||
WAL. The replica can lag behind the primary, if it cannot quite keep
|
||||
up with the primary, or if a long-running query conflicts with changes
|
||||
that are about to be applied, or even intentionally if the user wishes
|
||||
to see delayed data (see recovery_min_apply_delay). It's important
|
||||
that the replica sees a consistent view of the whole cluster at the
|
||||
replay LSN, when it's lagging behind.
|
||||
|
||||
In Neon, the replica connects to a safekeeper to get the WAL
|
||||
stream. That means that the safekeepers must be able to regurgitate
|
||||
the original WAL as far back as the replay LSN of any running read
|
||||
replica. (A static read-only node that does not follow the primary
|
||||
does not require a WAL stream however). The primary does not need to
|
||||
be running, and when it is, the replicas don't incur any extra
|
||||
overhead to the primary (see hot standby feedback though).
|
||||
|
||||
### In-progress transactions
|
||||
|
||||
In PostgreSQL, when a hot standby server starts up, it cannot
|
||||
immediately open up for queries (see [PostgreSQL startup
|
||||
sequence]). It first needs to establish a complete list of in-progress
|
||||
transactions, including subtransactions, that are running at the
|
||||
primary, at the current replay LSN. Normally that happens quickly,
|
||||
when the replica sees a "running-xacts" WAL record, because the
|
||||
primary writes a running-xacts WAL record at every checkpoint, and in
|
||||
PostgreSQL the replica always starts the WAL replay from a checkpoint
|
||||
REDO point. (A shutdown checkpoint WAL record also implies that all
|
||||
the non-prepared transactions have ended.) If there are a lot of
|
||||
subtransactions in progress, however, the standby might need to wait
|
||||
for old transactions to complete before it can open up for queries.
|
||||
|
||||
In Neon that problem is worse: a replica can start at any LSN, so
|
||||
there's no guarantee that it will see a running-xacts record any time
|
||||
soon. In particular, if the primary is not running when the replica is
|
||||
started, it might never see a running-xacts record.
|
||||
|
||||
To make things worse, we initially missed this issue, and always
|
||||
started accepting queries at replica startup, even if it didn't have
|
||||
the transaction information. That could lead to incorrect query
|
||||
results and data corruption later. However, as we fixed that, we
|
||||
introduced a new problem compared to what we had before: previously
|
||||
the replica would always start up, but after fixing that bug, it might
|
||||
not. In a superficial way, the old behavior was better (but could lead
|
||||
to serious issues later!). That made fixing that bug was very hard,
|
||||
because as we fixed it, we made things (superficially) worse for
|
||||
others.
|
||||
|
||||
See https://github.com/neondatabase/neon/pull/7288 which fixed the
|
||||
bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
|
||||
and https://github.com/neondatabase/neon/pull/8484 to try to claw back
|
||||
the cases that started to cause trouble as fixing it. As of this
|
||||
writing, there are still cases where a replica might not immediately
|
||||
start up, causing the control plane operation to fail, the remaining
|
||||
issues are tracked in https://github.com/neondatabase/neon/issues/6211.
|
||||
|
||||
One long-term fix for this is to switch to using so-called CSN
|
||||
snapshots in read replica. That would make it unnecessary to have the
|
||||
full in-progress transaction list in the replica at startup time. See
|
||||
https://commitfest.postgresql.org/48/4912/ for a work-in-progress
|
||||
patch to upstream to implement that.
|
||||
|
||||
Another thing we could do is to teach the control plane about that
|
||||
distinction between "starting up" and "running but haven't received
|
||||
running-xacts information yet", so that we could keep the replica
|
||||
waiting longer in that stage, and also give any client connections the
|
||||
same `ERROR: the database system is not yet accepting connections`
|
||||
error that you get in standalone PostgreSQL in that state.
|
||||
|
||||
|
||||
### Recovery conflicts and Hot standby feedback
|
||||
|
||||
It's possible that a tuple version is vacuumed away in the primary,
|
||||
even though it is still needed by a running transactions in the
|
||||
replica. This is called a "recovery conflict", and PostgreSQL provides
|
||||
various options for dealing with it. By default, the WAL replay will
|
||||
wait up to 30 s for the conflicting query to finish. After that, it
|
||||
will kill the running query, so that the WAL replay can proceed.
|
||||
|
||||
Another way to avoid the situation is to enable the
|
||||
[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
|
||||
option. When it is enabled, the primary will refrain from vacuuming
|
||||
tuples that are still needed in the primary. That means potentially
|
||||
bloating the primary, which violates the usual rule that read replicas
|
||||
don't affect the operations on the primary, which is why it's off by
|
||||
default. We leave it to users to decide if they want to turn it on,
|
||||
same as PostgreSQL.
|
||||
|
||||
Neon supports `hot_standby_feedback` by passing the feedback messages
|
||||
from the replica to the safekeepers, and from safekeepers to the
|
||||
primary.
|
||||
|
||||
### Relationship of settings between primary and replica
|
||||
|
||||
In order to enter hot standby mode, some configuration options need to
|
||||
be set to the same or larger values in the standby, compared to the
|
||||
primary. See [explanation in the PostgreSQL
|
||||
docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
|
||||
|
||||
In Neon, we have this problem too. To prevent customers from hitting
|
||||
it, the control plane automatically adjusts the settings of a replica,
|
||||
so that they match or exceed the primary's settings (see
|
||||
https://github.com/neondatabase/cloud/issues/14903). However, you
|
||||
can still hit the issue if the primary is restarted with larger
|
||||
settings, while the replica is running.
|
||||
|
||||
|
||||
### Interaction with Pageserver GC
|
||||
|
||||
The read replica can lag behind the primary. If there are recovery
|
||||
conflicts or the replica cannot keep up for some reason, the lag can
|
||||
in principle grow indefinitely. The replica will issue all GetPage
|
||||
requests to the pageservers at the current replay LSN, and needs to
|
||||
see the old page versions.
|
||||
|
||||
If the retention period in the pageserver is set to be small, it may
|
||||
have already garbage collected away the old page versions. That will
|
||||
cause read errors in the compute, and can mean that the replica cannot
|
||||
make progress with the replication anymore.
|
||||
|
||||
There is a mechanism for replica to pass information about its replay
|
||||
LSN to the pageserver, so that the pageserver refrains from GC'ing
|
||||
data that is still needed by the standby. It's called
|
||||
'standby_horizon' in the pageserver code, see
|
||||
https://github.com/neondatabase/neon/pull/7368. A separate "lease"
|
||||
mechanism also is in the works, where the replica could hold a lease
|
||||
on the old LSN, preventing the pageserver from advancing the GC
|
||||
horizon past that point. The difference is that the standby_horizon
|
||||
mechanism relies on a feedback message from replica to safekeeper,
|
||||
while the least API is exposed directly from the pageserver. A static
|
||||
read-only node is not connected to safekeepers, so it cannot use the
|
||||
standby_horizon mechanism.
|
||||
|
||||
|
||||
### Synchronous replication
|
||||
|
||||
We haven't put any effort into synchronous replication yet.
|
||||
|
||||
PostgreSQL provides multiple levels of synchronicity. In the weaker
|
||||
levels, a transaction is not acknowledged as committed to the client
|
||||
in the primary until the WAL has been streamed to a replica or flushed
|
||||
to disk there. Those modes don't make senses in Neon, because the
|
||||
safekeepers handle durability.
|
||||
|
||||
`synchronous_commit=remote_apply` mode would make sense. In that mode,
|
||||
the commit is not acknowledged to the client until it has been
|
||||
replayed in the replica. That ensures that after commit, you can see
|
||||
the commit in the replica too (aka. read-your-write consistency).
|
||||
@@ -313,20 +313,17 @@ pub struct MetadataHealthUpdateRequest {
|
||||
pub struct MetadataHealthUpdateResponse {}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListUnhealthyResponse {
|
||||
pub unhealthy_tenant_shards: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListOutdatedRequest {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub not_scrubbed_for: Duration,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
pub struct MetadataHealthListOutdatedResponse {
|
||||
pub health_records: Vec<MetadataHealthRecord>,
|
||||
}
|
||||
|
||||
@@ -143,8 +143,8 @@ pub use v14::xlog_utils::XLogFileName;
|
||||
|
||||
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||
|
||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
||||
dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
|
||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
|
||||
dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
|
||||
}
|
||||
|
||||
pub fn generate_wal_segment(
|
||||
|
||||
@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
|
||||
let properties_future = blob_client.get_properties().into_future();
|
||||
|
||||
let properties_future = tokio::time::timeout(self.timeout, properties_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = properties_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
|
||||
let data = match res {
|
||||
Ok(Ok(data)) => Ok(data),
|
||||
Ok(Err(sdk)) => Err(to_download_error(sdk)),
|
||||
Err(_timeout) => Err(DownloadError::Timeout),
|
||||
}?;
|
||||
|
||||
let properties = data.blob.properties;
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::from(properties.last_modified),
|
||||
size: properties.content_length,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -150,7 +150,7 @@ pub enum ListingMode {
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
#[derive(PartialEq, Eq, Debug, Clone)]
|
||||
pub struct ListingObject {
|
||||
pub key: RemotePath,
|
||||
pub last_modified: SystemTime,
|
||||
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
Ok(combined)
|
||||
}
|
||||
|
||||
/// Obtain metadata information about an object.
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// See [`RemoteStorage::head_object`].
|
||||
pub async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.head_object(key, cancel).await,
|
||||
Self::AwsS3(s) => s.head_object(key, cancel).await,
|
||||
Self::AzureBlob(s) => s.head_object(key, cancel).await,
|
||||
Self::Unreliable(s) => s.head_object(key, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
RequestKind::Head => &self.read,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let target_file_path = key.with_base(&self.storage_root);
|
||||
let metadata = file_metadata(&target_file_path).await?;
|
||||
Ok(ListingObject {
|
||||
key: key.clone(),
|
||||
last_modified: metadata.modified()?,
|
||||
size: metadata.len(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
|
||||
@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
Head = 6,
|
||||
}
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
@@ -27,6 +28,7 @@ impl RequestKind {
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
Head => "head_object",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -34,7 +36,8 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||
const REQUEST_KIND_COUNT: usize = 7;
|
||||
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
|
||||
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -23,7 +23,7 @@ use aws_config::{
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
operation::{get_object::GetObjectError, head_object::HeadObjectError},
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
||||
Client,
|
||||
};
|
||||
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let head_future = self
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(self.bucket_name())
|
||||
.key(self.relative_path_to_s3_object(key))
|
||||
.send();
|
||||
|
||||
let head_future = tokio::time::timeout(self.timeout, head_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = head_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let res = res.map_err(|_e| DownloadError::Timeout)?;
|
||||
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
let data = match res {
|
||||
Ok(object_output) => object_output,
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
return Err(DownloadError::NotFound);
|
||||
}
|
||||
Err(e) => {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
return Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("s3 head object"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"head_object doesn't contain last_modified or content_length"
|
||||
)))?;
|
||||
};
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
|
||||
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
|
||||
})?,
|
||||
size: size as u64,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
|
||||
#[derive(Debug, Hash, Eq, PartialEq)]
|
||||
enum RemoteOp {
|
||||
ListPrefixes(Option<RemotePath>),
|
||||
HeadObject(RemotePath),
|
||||
Upload(RemotePath),
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<crate::ListingObject, DownloadError> {
|
||||
self.attempt(RemoteOp::HeadObject(key.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.head_object(key, cancel).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -413,6 +413,10 @@ where
|
||||
let mut content = Vec::with_capacity(n_origins * 16 + 8);
|
||||
content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
|
||||
for (origin_id, origin_lsn) in repl_origins {
|
||||
info!(
|
||||
"Include origin_id={}, origin_lsn={} in basebackup",
|
||||
origin_id, origin_lsn
|
||||
);
|
||||
content.extend_from_slice(&origin_id.to_le_bytes());
|
||||
content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
|
||||
content.extend_from_slice(&origin_lsn.0.to_le_bytes());
|
||||
|
||||
@@ -50,7 +50,6 @@ pub mod defaults {
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
@@ -90,8 +89,7 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
@@ -478,7 +476,7 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
@@ -1065,7 +1063,7 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
@@ -1305,7 +1303,7 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
@@ -1378,7 +1376,7 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
|
||||
@@ -64,7 +64,7 @@ use crate::{
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
|
||||
},
|
||||
CancellableTask, DiskUsageEvictionTask,
|
||||
};
|
||||
@@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool {
|
||||
}
|
||||
|
||||
impl EvictionOrder {
|
||||
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
|
||||
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
@@ -644,6 +644,7 @@ pub(crate) struct EvictionCandidate {
|
||||
pub(crate) layer: EvictionLayer,
|
||||
pub(crate) last_activity_ts: SystemTime,
|
||||
pub(crate) relative_last_activity: finite_f32::FiniteF32,
|
||||
pub(crate) visibility: LayerVisibilityHint,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EvictionLayer {
|
||||
@@ -685,14 +686,22 @@ impl std::fmt::Debug for EvictionCandidate {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum MinResidentSizePartition {
|
||||
enum EvictionPartition {
|
||||
// A layer that is un-wanted by the tenant: evict all these first, before considering
|
||||
// any other layers
|
||||
EvictNow,
|
||||
|
||||
// Above the minimum size threshold: this layer is a candidate for eviction.
|
||||
Above,
|
||||
|
||||
// Below the minimum size threshold: this layer should only be evicted if all the
|
||||
// tenants' layers above the minimum size threshold have already been considered.
|
||||
Below,
|
||||
}
|
||||
|
||||
enum EvictionCandidates {
|
||||
Cancelled,
|
||||
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
|
||||
Finished(Vec<(EvictionPartition, EvictionCandidate)>),
|
||||
}
|
||||
|
||||
/// Gather the eviction candidates.
|
||||
@@ -890,8 +899,10 @@ async fn collect_eviction_candidates(
|
||||
max_layer_size
|
||||
};
|
||||
|
||||
// Sort layers most-recently-used first, then partition by
|
||||
// cumsum above/below min_resident_size.
|
||||
// Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
|
||||
// where the inputs are:
|
||||
// - whether the layer is visible
|
||||
// - whether the layer is above/below the min_resident_size cutline
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
@@ -908,12 +919,23 @@ async fn collect_eviction_candidates(
|
||||
candidate.relative_last_activity =
|
||||
eviction_order.relative_last_activity(total, i);
|
||||
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
} else {
|
||||
MinResidentSizePartition::Below
|
||||
let partition = match candidate.visibility {
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Covered layers are evicted first
|
||||
EvictionPartition::EvictNow
|
||||
}
|
||||
LayerVisibilityHint::Visible => {
|
||||
cumsum += i128::from(candidate.layer.get_file_size());
|
||||
|
||||
if cumsum > min_resident_size as i128 {
|
||||
EvictionPartition::Above
|
||||
} else {
|
||||
// The most recent layers below the min_resident_size threshold
|
||||
// are the last to be evicted.
|
||||
EvictionPartition::Below
|
||||
}
|
||||
}
|
||||
};
|
||||
cumsum += i128::from(candidate.layer.get_file_size());
|
||||
|
||||
(partition, candidate)
|
||||
});
|
||||
@@ -981,7 +1003,7 @@ async fn collect_eviction_candidates(
|
||||
// Secondary locations' layers are always considered above the min resident size,
|
||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||
// the layers have sufficiently old access times.
|
||||
MinResidentSizePartition::Above,
|
||||
EvictionPartition::Above,
|
||||
candidate,
|
||||
)
|
||||
});
|
||||
@@ -1009,7 +1031,9 @@ async fn collect_eviction_candidates(
|
||||
}
|
||||
}
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
debug_assert!(EvictionPartition::Above < EvictionPartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
eviction_order.sort(&mut candidates);
|
||||
@@ -1022,7 +1046,7 @@ async fn collect_eviction_candidates(
|
||||
///
|
||||
/// Returns the amount of candidates selected, with the planned usage.
|
||||
fn select_victims<U: Usage>(
|
||||
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
|
||||
candidates: &[(EvictionPartition, EvictionCandidate)],
|
||||
usage_pre: U,
|
||||
) -> VictimSelection<U> {
|
||||
let mut usage_when_switched = None;
|
||||
@@ -1034,7 +1058,7 @@ fn select_victims<U: Usage>(
|
||||
break;
|
||||
}
|
||||
|
||||
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
|
||||
if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
|
||||
usage_when_switched = Some((usage_planned, i));
|
||||
}
|
||||
|
||||
|
||||
@@ -178,10 +178,8 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
impl From<PageReconstructError> for ApiError {
|
||||
fn from(pre: PageReconstructError) -> ApiError {
|
||||
match pre {
|
||||
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
||||
PageReconstructError::MissingKey(e) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
|
||||
}
|
||||
PageReconstructError::Other(other) => ApiError::InternalServerError(other),
|
||||
PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()),
|
||||
PageReconstructError::Cancelled => ApiError::Cancelled,
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
|
||||
@@ -1,15 +1,10 @@
|
||||
use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct {
|
||||
max_concurrency: NonZeroUsize,
|
||||
},
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
impl Default for L0FlushConfig {
|
||||
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
|
||||
impl L0FlushGlobalState {
|
||||
pub fn new(config: L0FlushConfig) -> Self {
|
||||
match config {
|
||||
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
|
||||
L0FlushConfig::Direct { max_concurrency } => {
|
||||
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
||||
Self(Arc::new(Inner::Direct { semaphore }))
|
||||
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl L0FlushConfig {
|
||||
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
|
||||
use L0FlushConfig::*;
|
||||
match self {
|
||||
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
|
||||
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
|
||||
@@ -1803,6 +1803,15 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_secondary_heatmap_total_size",
|
||||
"The total size in bytes of all layers in the most recently downloaded heatmap.",
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
|
||||
@@ -287,10 +287,7 @@ impl Timeline {
|
||||
// then check if the database was already initialized.
|
||||
// get_rel_exists can be called before dbdir is created.
|
||||
let buf = version.get(self, DBDIR_KEY, ctx).await?;
|
||||
let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.dbdirs),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}?;
|
||||
let dbdirs = DbDirectory::des(&buf)?.dbdirs;
|
||||
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
|
||||
return Ok(false);
|
||||
}
|
||||
@@ -298,13 +295,8 @@ impl Timeline {
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
@@ -323,20 +315,16 @@ impl Timeline {
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let rels: HashSet<RelTag> =
|
||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: *relnode,
|
||||
forknum: *forknum,
|
||||
}));
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
let rels: HashSet<RelTag> =
|
||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: *relnode,
|
||||
forknum: *forknum,
|
||||
}));
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Get the whole SLRU segment
|
||||
@@ -398,13 +386,8 @@ impl Timeline {
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.segments.contains(&segno);
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
Ok(dir.segments.contains(&segno))
|
||||
}
|
||||
|
||||
/// Locate LSN, such that all transactions that committed before
|
||||
@@ -620,10 +603,7 @@ impl Timeline {
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.segments),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
Ok(SlruSegmentDirectory::des(&buf)?.segments)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_relmap_file(
|
||||
@@ -647,10 +627,7 @@ impl Timeline {
|
||||
// fetch directory entry
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
|
||||
match DbDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.dbdirs),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
Ok(DbDirectory::des(&buf)?.dbdirs)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_twophase_file(
|
||||
@@ -672,10 +649,7 @@ impl Timeline {
|
||||
// fetch directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||
|
||||
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.xids),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
Ok(TwoPhaseDirectory::des(&buf)?.xids)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_control_file(
|
||||
@@ -700,10 +674,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
||||
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.files),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
},
|
||||
Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
@@ -719,13 +690,14 @@ impl Timeline {
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
||||
.await
|
||||
.context("scan")?;
|
||||
.await?;
|
||||
let mut result = HashMap::new();
|
||||
let mut sz = 0;
|
||||
for (_, v) in kv {
|
||||
let v = v.context("get value")?;
|
||||
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
|
||||
let v = v?;
|
||||
let v = aux_file::decode_file_value_bytes(&v)
|
||||
.context("value decode")
|
||||
.map_err(PageReconstructError::Other)?;
|
||||
for (fname, content) in v {
|
||||
sz += fname.len();
|
||||
sz += content.len();
|
||||
@@ -793,11 +765,10 @@ impl Timeline {
|
||||
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
|
||||
let kv = self
|
||||
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
|
||||
.await
|
||||
.context("scan")?;
|
||||
.await?;
|
||||
let mut result = HashMap::new();
|
||||
for (k, v) in kv {
|
||||
let v = v.context("get value")?;
|
||||
let v = v?;
|
||||
let origin_id = k.field6 as RepOriginId;
|
||||
let origin_lsn = Lsn::des(&v).unwrap();
|
||||
if origin_lsn != Lsn::INVALID {
|
||||
@@ -1733,12 +1704,17 @@ impl<'a> DatadirModification<'a> {
|
||||
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
||||
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
||||
Err(
|
||||
PageReconstructError::Other(_)
|
||||
e @ (PageReconstructError::Other(_)
|
||||
| PageReconstructError::WalRedo(_)
|
||||
| PageReconstructError::MissingKey { .. },
|
||||
| PageReconstructError::MissingKey(_)),
|
||||
) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
if !matches!(e, PageReconstructError::MissingKey(_)) {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
tracing::warn!("treating error as if it was a missing key: {}", e);
|
||||
}
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
@@ -1893,7 +1869,7 @@ impl<'a> DatadirModification<'a> {
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
};
|
||||
|
||||
@@ -393,7 +393,7 @@ struct PageServerTask {
|
||||
|
||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
@@ -405,7 +405,7 @@ struct PageServerTask {
|
||||
pub fn spawn<F>(
|
||||
runtime: &tokio::runtime::Handle,
|
||||
kind: TaskKind,
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
future: F,
|
||||
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
|
||||
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
|
||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
@@ -573,13 +573,8 @@ pub async fn shutdown_tasks(
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
if log_all {
|
||||
if tenant_shard_id.is_none() {
|
||||
// there are quite few of these
|
||||
info!(name = task.name, kind = ?task_kind, "stopping global task");
|
||||
} else {
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
|
||||
@@ -798,7 +798,7 @@ impl Tenant {
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Attach,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
"attach tenant",
|
||||
async move {
|
||||
@@ -4491,10 +4491,13 @@ mod tests {
|
||||
|
||||
// This needs to traverse to the parent, and fails.
|
||||
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
|
||||
assert!(err.to_string().starts_with(&format!(
|
||||
"Bad state on timeline {}: Broken",
|
||||
tline.timeline_id
|
||||
)));
|
||||
assert!(
|
||||
err.to_string().starts_with(&format!(
|
||||
"bad state on timeline {}: Broken",
|
||||
tline.timeline_id
|
||||
)),
|
||||
"{err}"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ use tracing::warn;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
@@ -186,11 +187,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
/// You need to make sure that the internal buffer is empty, otherwise
|
||||
/// data will be written in wrong order.
|
||||
#[inline(always)]
|
||||
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all_unbuffered<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
src_buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
) -> (FullSlice<Buf>, Result<(), Error>) {
|
||||
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
|
||||
let nbytes = match res {
|
||||
Ok(nbytes) => nbytes,
|
||||
@@ -204,8 +205,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
|
||||
let buf = std::mem::take(&mut self.buf);
|
||||
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
|
||||
let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
let mut buf = slice.into_raw_slice().into_inner();
|
||||
buf.clear();
|
||||
self.buf = buf;
|
||||
Ok(())
|
||||
@@ -222,19 +224,30 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
}
|
||||
|
||||
/// Internal, possibly buffered, write function
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
src_buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
) -> (FullSlice<Buf>, Result<(), Error>) {
|
||||
let src_buf = src_buf.into_raw_slice();
|
||||
let src_buf_bounds = src_buf.bounds();
|
||||
let restore = move |src_buf_slice: Slice<_>| {
|
||||
FullSlice::must_new(Slice::from_buf_bounds(
|
||||
src_buf_slice.into_inner(),
|
||||
src_buf_bounds,
|
||||
))
|
||||
};
|
||||
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
return self.write_all_unbuffered(src_buf, ctx).await;
|
||||
return self
|
||||
.write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
|
||||
.await;
|
||||
}
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
let src_buf_len = src_buf.bytes_init();
|
||||
if src_buf_len == 0 {
|
||||
return (Slice::into_inner(src_buf.slice_full()), Ok(()));
|
||||
return (restore(src_buf), Ok(()));
|
||||
}
|
||||
let mut src_buf = src_buf.slice(0..src_buf_len);
|
||||
// First try to copy as much as we can into the buffer
|
||||
@@ -245,7 +258,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
// Then, if the buffer is full, flush it out
|
||||
if self.buf.len() == Self::CAPACITY {
|
||||
if let Err(e) = self.flush_buffer(ctx).await {
|
||||
return (Slice::into_inner(src_buf), Err(e));
|
||||
return (restore(src_buf), Err(e));
|
||||
}
|
||||
}
|
||||
// Finally, write the tail of src_buf:
|
||||
@@ -258,27 +271,29 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
let copied = self.write_into_buffer(&src_buf);
|
||||
// We just verified above that src_buf fits into our internal buffer.
|
||||
assert_eq!(copied, src_buf.len());
|
||||
Slice::into_inner(src_buf)
|
||||
restore(src_buf)
|
||||
} else {
|
||||
let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
|
||||
let (src_buf, res) = self
|
||||
.write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
return (src_buf, Err(e));
|
||||
}
|
||||
src_buf
|
||||
}
|
||||
} else {
|
||||
Slice::into_inner(src_buf)
|
||||
restore(src_buf)
|
||||
};
|
||||
(src_buf, Ok(()))
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
pub async fn write_blob<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
srcbuf: B,
|
||||
srcbuf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
) -> (FullSlice<Buf>, Result<u64, Error>) {
|
||||
let (buf, res) = self
|
||||
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||
.await;
|
||||
@@ -287,43 +302,40 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
pub(crate) async fn write_blob_maybe_compressed<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
srcbuf: B,
|
||||
srcbuf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
algorithm: ImageCompressionAlgorithm,
|
||||
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
|
||||
) -> (FullSlice<Buf>, Result<(u64, CompressionInfo), Error>) {
|
||||
let offset = self.offset;
|
||||
let mut compression_info = CompressionInfo {
|
||||
written_compressed: false,
|
||||
compressed_size: None,
|
||||
};
|
||||
|
||||
let len = srcbuf.bytes_init();
|
||||
let len = srcbuf.len();
|
||||
|
||||
let mut io_buf = self.io_buf.take().expect("we always put it back below");
|
||||
io_buf.clear();
|
||||
let mut compressed_buf = None;
|
||||
let ((io_buf, hdr_res), srcbuf) = async {
|
||||
let ((io_buf_slice, hdr_res), srcbuf) = async {
|
||||
if len < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
io_buf.put_u8(len as u8);
|
||||
(
|
||||
self.write_all(io_buf, ctx).await,
|
||||
srcbuf.slice_full().into_inner(),
|
||||
)
|
||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if len > MAX_SUPPORTED_LEN {
|
||||
return (
|
||||
(
|
||||
io_buf,
|
||||
io_buf.slice_len(),
|
||||
Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({len} bytes)"),
|
||||
)),
|
||||
),
|
||||
srcbuf.slice_full().into_inner(),
|
||||
srcbuf,
|
||||
);
|
||||
}
|
||||
let (high_bit_mask, len_written, srcbuf) = match algorithm {
|
||||
@@ -336,8 +348,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
} else {
|
||||
async_compression::tokio::write::ZstdEncoder::new(Vec::new())
|
||||
};
|
||||
let slice = srcbuf.slice_full();
|
||||
encoder.write_all(&slice[..]).await.unwrap();
|
||||
encoder.write_all(&srcbuf[..]).await.unwrap();
|
||||
encoder.shutdown().await.unwrap();
|
||||
let compressed = encoder.into_inner();
|
||||
compression_info.compressed_size = Some(compressed.len());
|
||||
@@ -345,31 +356,29 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
compression_info.written_compressed = true;
|
||||
let compressed_len = compressed.len();
|
||||
compressed_buf = Some(compressed);
|
||||
(BYTE_ZSTD, compressed_len, slice.into_inner())
|
||||
(BYTE_ZSTD, compressed_len, srcbuf)
|
||||
} else {
|
||||
(BYTE_UNCOMPRESSED, len, slice.into_inner())
|
||||
(BYTE_UNCOMPRESSED, len, srcbuf)
|
||||
}
|
||||
}
|
||||
ImageCompressionAlgorithm::Disabled => {
|
||||
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
|
||||
}
|
||||
ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf),
|
||||
};
|
||||
let mut len_buf = (len_written as u32).to_be_bytes();
|
||||
assert_eq!(len_buf[0] & 0xf0, 0);
|
||||
len_buf[0] |= high_bit_mask;
|
||||
io_buf.extend_from_slice(&len_buf[..]);
|
||||
(self.write_all(io_buf, ctx).await, srcbuf)
|
||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
||||
}
|
||||
}
|
||||
.await;
|
||||
self.io_buf = Some(io_buf);
|
||||
self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner());
|
||||
match hdr_res {
|
||||
Ok(_) => (),
|
||||
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
|
||||
Err(e) => return (srcbuf, Err(e)),
|
||||
}
|
||||
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
|
||||
let (_buf, res) = self.write_all(compressed_buf, ctx).await;
|
||||
(Slice::into_inner(srcbuf.slice(..)), res)
|
||||
let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await;
|
||||
(srcbuf, res)
|
||||
} else {
|
||||
self.write_all(srcbuf, ctx).await
|
||||
};
|
||||
@@ -432,21 +441,21 @@ pub(crate) mod tests {
|
||||
let (_, res) = if compression {
|
||||
let res = wtr
|
||||
.write_blob_maybe_compressed(
|
||||
blob.clone(),
|
||||
blob.clone().slice_len(),
|
||||
ctx,
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||
)
|
||||
.await;
|
||||
(res.0, res.1.map(|(off, _)| off))
|
||||
} else {
|
||||
wtr.write_blob(blob.clone(), ctx).await
|
||||
wtr.write_blob(blob.clone().slice_len(), ctx).await
|
||||
};
|
||||
let offs = res?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
// Write out one page worth of zeros so that we can
|
||||
// read again with read_blk
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
|
||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await;
|
||||
let offs = res?;
|
||||
println!("Writing final blob at offs={offs}");
|
||||
wtr.flush_buffer(ctx).await?;
|
||||
|
||||
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
|
||||
mod zero_padded_read_write;
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -52,12 +51,10 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let prewarm = conf.l0_flush.prewarm_on_write();
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, prewarm, gate_guard),
|
||||
rw: page_caching::RW::new(file, gate_guard),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
||||
//!
|
||||
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::io::{self};
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tracing::*;
|
||||
|
||||
@@ -17,33 +18,17 @@ use super::zero_padded_read_write;
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
|
||||
/// should we pre-warm the [`crate::page_cache`] with the contents?
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PrewarmOnWrite {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
) -> Self {
|
||||
pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
|
||||
page_cache_file_id,
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -83,10 +68,10 @@ impl RW {
|
||||
let vec = Vec::with_capacity(size);
|
||||
|
||||
// read from disk what we've already flushed
|
||||
let writer = self.rw.as_writer();
|
||||
let flushed_range = writer.written_range();
|
||||
let mut vec = writer
|
||||
.file
|
||||
let file_size_tracking_writer = self.rw.as_writer();
|
||||
let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
|
||||
let mut vec = file_size_tracking_writer
|
||||
.as_inner()
|
||||
.read_exact_at(
|
||||
vec.slice(0..(flushed_range.end - flushed_range.start)),
|
||||
u64::try_from(flushed_range.start).unwrap(),
|
||||
@@ -121,7 +106,7 @@ impl RW {
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.rw.as_writer().file.path,
|
||||
self.rw.as_writer().as_inner().path,
|
||||
e,
|
||||
),
|
||||
)
|
||||
@@ -131,7 +116,7 @@ impl RW {
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = writer
|
||||
.file
|
||||
.as_inner()
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
@@ -153,153 +138,16 @@ impl Drop for RW {
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
|
||||
let path = &self.rw.as_writer().as_inner().path;
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.rw.as_writer().file.path,
|
||||
e
|
||||
);
|
||||
error!("could not remove ephemeral file '{path}': {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PreWarmingWriter {
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
nwritten_blocks: u32,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
impl PreWarmingWriter {
|
||||
fn new(
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
) -> Self {
|
||||
Self {
|
||||
prewarm_on_write,
|
||||
nwritten_blocks: 0,
|
||||
page_cache_file_id,
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the byte range within `file` that has been written though `write_all`.
|
||||
///
|
||||
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
|
||||
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
|
||||
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
|
||||
struct Wrapper(Range<usize>);
|
||||
impl Deref for Wrapper {
|
||||
type Target = Range<usize>;
|
||||
fn deref(&self) -> &Range<usize> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
Wrapper(0..nwritten_blocks * PAGE_SZ)
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
|
||||
async fn write_all<
|
||||
B: tokio_epoll_uring::BoundedBuf<Buf = Buf>,
|
||||
Buf: tokio_epoll_uring::IoBuf + Send,
|
||||
>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let buf = buf.slice(..);
|
||||
let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
|
||||
let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) {
|
||||
Some(buf.to_vec())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let buflen = buf.len();
|
||||
assert_eq!(
|
||||
buflen % PAGE_SZ,
|
||||
0,
|
||||
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
|
||||
);
|
||||
|
||||
// Do the IO.
|
||||
let iobuf = match self.file.write_all(buf, ctx).await {
|
||||
(iobuf, Ok(nwritten)) => {
|
||||
assert_eq!(nwritten, buflen);
|
||||
iobuf
|
||||
}
|
||||
(_, Err(e)) => {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
// order error before path because path is long and error is short
|
||||
format!(
|
||||
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
|
||||
self.nwritten_blocks, buflen, e, self.file.path,
|
||||
),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
// Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf)
|
||||
let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds);
|
||||
if let Some(check_bounds_stuff_works) = check_bounds_stuff_works {
|
||||
assert_eq!(&check_bounds_stuff_works, &*buf);
|
||||
}
|
||||
|
||||
let nblocks = buflen / PAGE_SZ;
|
||||
let nblocks32 = u32::try_from(nblocks).unwrap();
|
||||
|
||||
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
|
||||
// Pre-warm page cache with the contents.
|
||||
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
|
||||
// benefits the code that writes InMemoryLayer=>L0 layers.
|
||||
|
||||
let cache = page_cache::get();
|
||||
static CTX: Lazy<RequestContext> = Lazy::new(|| {
|
||||
RequestContext::new(
|
||||
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
|
||||
crate::context::DownloadBehavior::Error,
|
||||
)
|
||||
});
|
||||
for blknum_in_buffer in 0..nblocks {
|
||||
let blk_in_buffer =
|
||||
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
|
||||
let blknum = self
|
||||
.nwritten_blocks
|
||||
.checked_add(blknum_in_buffer as u32)
|
||||
.unwrap();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
|
||||
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
|
||||
}
|
||||
Ok(v) => match v {
|
||||
page_cache::ReadBufResult::Found(_guard) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
|
||||
and this function takes &mut self, so, no concurrent read_blk is possible");
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(blk_in_buffer);
|
||||
let _ = write_guard.mark_valid();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
|
||||
Ok((buflen, buf.into_inner()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct Buffer<const N: usize> {
|
||||
allocation: Box<[u8; N]>,
|
||||
@@ -60,10 +62,10 @@ impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Bu
|
||||
self.written
|
||||
}
|
||||
|
||||
fn flush(self) -> tokio_epoll_uring::Slice<Self> {
|
||||
fn flush(self) -> FullSlice<Self> {
|
||||
self.invariants();
|
||||
let written = self.written;
|
||||
tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
|
||||
FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
|
||||
}
|
||||
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
|
||||
|
||||
@@ -565,7 +565,7 @@ mod tests {
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
|
||||
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
|
||||
@@ -574,7 +574,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
|
||||
0, 0, 0, 15, // pg_version (4 bytes)
|
||||
0, 0, 0, 16, // pg_version (4 bytes)
|
||||
/* padding bytes */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
|
||||
task_mgr::spawn(
|
||||
&self.runtime,
|
||||
TaskKind::RemoteUploadTask,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"remote upload",
|
||||
async move {
|
||||
|
||||
@@ -23,6 +23,8 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::Generation;
|
||||
#[cfg_attr(target_os = "macos", allow(unused_imports))]
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
|
||||
@@ -219,9 +221,7 @@ async fn download_object<'a>(
|
||||
Ok(chunk) => chunk,
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
buffered
|
||||
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx)
|
||||
.await?;
|
||||
buffered.write_buffered(chunk.slice_len(), ctx).await?;
|
||||
}
|
||||
let size_tracking = buffered.flush_and_into_inner(ctx).await?;
|
||||
Ok(size_tracking.into_inner())
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
};
|
||||
|
||||
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
|
||||
|
||||
// Sum of layer sizes on local disk
|
||||
pub(super) resident_size_metric: UIntGauge,
|
||||
|
||||
// Sum of layer sizes in the most recently downloaded heatmap
|
||||
pub(super) heatmap_total_size_metric: UIntGauge,
|
||||
}
|
||||
|
||||
impl Drop for SecondaryTenant {
|
||||
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
|
||||
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
|
||||
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,6 +133,10 @@ impl SecondaryTenant {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
Arc::new(Self {
|
||||
tenant_shard_id,
|
||||
// todo: shall we make this a descendent of the
|
||||
@@ -145,6 +154,7 @@ impl SecondaryTenant {
|
||||
progress: std::sync::Mutex::default(),
|
||||
|
||||
resident_size_metric,
|
||||
heatmap_total_size_metric,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ use crate::{
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
},
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::{layer::local_layer_path, LayerName},
|
||||
storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint},
|
||||
tasks::{warn_when_period_overrun, BackgroundLoopKind},
|
||||
},
|
||||
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
|
||||
@@ -296,6 +296,9 @@ impl SecondaryDetail {
|
||||
}),
|
||||
last_activity_ts: ods.access_time,
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
// Secondary location layers are presumed visible, because Covered layers
|
||||
// are excluded from the heatmap
|
||||
visibility: LayerVisibilityHint::Visible,
|
||||
}
|
||||
}));
|
||||
|
||||
@@ -826,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
|
||||
// Also expose heatmap bytes_total as a metric
|
||||
self.secondary_state
|
||||
.heatmap_total_size_metric
|
||||
.set(heatmap_stats.bytes);
|
||||
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
|
||||
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(super) timeline_id: TimelineId,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
|
||||
pub(super) layers: Vec<HeatMapLayer>,
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(crate) name: LayerName,
|
||||
pub(crate) metadata: LayerFileMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
|
||||
@@ -42,6 +42,7 @@ use crate::tenant::vectored_blob_io::{
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
@@ -63,6 +64,7 @@ use std::os::unix::fs::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -436,19 +438,28 @@ impl DeltaLayerWriterInner {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let (_, res) = self
|
||||
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx)
|
||||
.put_value_bytes(
|
||||
key,
|
||||
lsn,
|
||||
Value::ser(&val)?.slice_len(),
|
||||
val.will_init(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
res
|
||||
}
|
||||
|
||||
async fn put_value_bytes(
|
||||
async fn put_value_bytes<Buf>(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Vec<u8>,
|
||||
val: FullSlice<Buf>,
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
) -> (FullSlice<Buf>, anyhow::Result<()>)
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
{
|
||||
assert!(
|
||||
self.lsn_range.start <= lsn,
|
||||
"lsn_start={}, lsn={}",
|
||||
@@ -514,7 +525,7 @@ impl DeltaLayerWriterInner {
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
for buf in block_buf.blocks {
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
}
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
@@ -534,7 +545,7 @@ impl DeltaLayerWriterInner {
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
@@ -646,14 +657,17 @@ impl DeltaLayerWriter {
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn put_value_bytes(
|
||||
pub async fn put_value_bytes<Buf>(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Vec<u8>,
|
||||
val: FullSlice<Buf>,
|
||||
will_init: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||
) -> (FullSlice<Buf>, anyhow::Result<()>)
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
{
|
||||
self.inner
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
@@ -743,7 +757,7 @@ impl DeltaLayer {
|
||||
// TODO: could use smallvec here, but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1020,7 +1034,7 @@ impl DeltaLayerInner {
|
||||
for (_, blob_meta) in read.blobs_at.as_slice() {
|
||||
reconstruct_state.on_key_error(
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
PageReconstructError::Other(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
kind
|
||||
@@ -1047,7 +1061,7 @@ impl DeltaLayerInner {
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to deserialize blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
@@ -1291,12 +1305,12 @@ impl DeltaLayerInner {
|
||||
.put_value_bytes(
|
||||
key,
|
||||
lsn,
|
||||
std::mem::take(&mut per_blob_copy),
|
||||
std::mem::take(&mut per_blob_copy).slice_len(),
|
||||
will_init,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
per_blob_copy = tmp;
|
||||
per_blob_copy = tmp.into_raw_slice().into_inner();
|
||||
|
||||
res?;
|
||||
|
||||
@@ -1871,7 +1885,7 @@ pub(crate) mod test {
|
||||
|
||||
for entry in entries {
|
||||
let (_, res) = writer
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx)
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@ use crate::tenant::vectored_blob_io::{
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
@@ -354,7 +355,7 @@ impl ImageLayer {
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -786,7 +787,7 @@ impl ImageLayerWriterInner {
|
||||
self.num_keys += 1;
|
||||
let (_img, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(img, ctx, compression)
|
||||
.write_blob_maybe_compressed(img.slice_len(), ctx, compression)
|
||||
.await;
|
||||
// TODO: re-use the buffer for `img` further upstack
|
||||
let (off, compression_info) = res?;
|
||||
@@ -838,7 +839,7 @@ impl ImageLayerWriterInner {
|
||||
.await?;
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
for buf in block_buf.blocks {
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
}
|
||||
|
||||
@@ -858,7 +859,7 @@ impl ImageLayerWriterInner {
|
||||
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
let (_buf, res) = file.write_all(buf.slice_len(), ctx).await;
|
||||
res?;
|
||||
|
||||
let metadata = file
|
||||
|
||||
@@ -12,7 +12,8 @@ use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
@@ -248,9 +249,7 @@ impl InMemoryLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completly unused
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let end_str = self.end_lsn_or_max();
|
||||
|
||||
println!(
|
||||
@@ -258,39 +257,6 @@ impl InMemoryLayer {
|
||||
self.timeline_id, self.start_lsn, end_str,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -535,7 +501,6 @@ impl InMemoryLayer {
|
||||
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
|
||||
@@ -567,28 +532,6 @@ impl InMemoryLayer {
|
||||
.await?;
|
||||
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
}
|
||||
}
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -620,11 +563,17 @@ impl InMemoryLayer {
|
||||
// => https://github.com/neondatabase/neon/issues/8183
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let res;
|
||||
(buf, res) = delta_layer_writer
|
||||
.put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx)
|
||||
let (tmp, res) = delta_layer_writer
|
||||
.put_value_bytes(
|
||||
Key::from_compact(*key),
|
||||
*lsn,
|
||||
buf.slice_len(),
|
||||
will_init,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
res?;
|
||||
buf = tmp.into_raw_slice().into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,7 +312,9 @@ impl Layer {
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
|
||||
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
|
||||
GetVectoredError::Cancelled
|
||||
}
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
|
||||
@@ -208,6 +208,8 @@ impl SplitDeltaLayerWriter {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
@@ -229,7 +231,10 @@ mod tests {
|
||||
}
|
||||
|
||||
fn get_large_img() -> Bytes {
|
||||
vec![0; 8192].into()
|
||||
let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
|
||||
let mut data = vec![0; 8192];
|
||||
rng.fill_bytes(&mut data);
|
||||
data.into()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -98,7 +98,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("compactor for tenant {tenant_shard_id}"),
|
||||
{
|
||||
@@ -121,7 +121,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("garbage collector for tenant {tenant_shard_id}"),
|
||||
{
|
||||
@@ -144,7 +144,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::IngestHousekeeping,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
|
||||
{
|
||||
|
||||
@@ -511,7 +511,7 @@ pub(crate) struct TimelineVisitOutcome {
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum PageReconstructError {
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
Other(anyhow::Error),
|
||||
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(WaitLsnError),
|
||||
@@ -527,6 +527,22 @@ pub(crate) enum PageReconstructError {
|
||||
MissingKey(MissingKeyError),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageReconstructError {
|
||||
fn from(value: anyhow::Error) -> Self {
|
||||
// with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
|
||||
match value.downcast::<PageReconstructError>() {
|
||||
Ok(pre) => pre,
|
||||
Err(other) => PageReconstructError::Other(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<utils::bin_ser::DeserializeError> for PageReconstructError {
|
||||
fn from(value: utils::bin_ser::DeserializeError) -> Self {
|
||||
PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure"))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<layer_manager::Shutdown> for PageReconstructError {
|
||||
fn from(_: layer_manager::Shutdown) -> Self {
|
||||
PageReconstructError::Cancelled
|
||||
@@ -546,6 +562,7 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
shard: ShardNumber,
|
||||
@@ -585,11 +602,8 @@ impl PageReconstructError {
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Other(_) => false,
|
||||
AncestorLsnTimeout(_) => false,
|
||||
Cancelled => true,
|
||||
WalRedo(_) => false,
|
||||
MissingKey { .. } => false,
|
||||
Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -599,11 +613,11 @@ pub(crate) enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
GetVectoredError(GetVectoredError),
|
||||
#[error("read failed")]
|
||||
GetVectoredError(#[source] GetVectoredError),
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(PageReconstructError),
|
||||
#[error("reconstruction failed")]
|
||||
PageReconstructError(#[source] PageReconstructError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
@@ -627,10 +641,10 @@ pub(crate) enum FlushLayerError {
|
||||
|
||||
// Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush
|
||||
// loop via a watch channel, where we can only borrow it.
|
||||
#[error(transparent)]
|
||||
#[error("create image layers (shared)")]
|
||||
CreateImageLayersError(Arc<CreateImageLayersError>),
|
||||
|
||||
#[error(transparent)]
|
||||
#[error("other (shared)")]
|
||||
Other(#[from] Arc<anyhow::Error>),
|
||||
}
|
||||
|
||||
@@ -663,34 +677,46 @@ pub(crate) enum GetVectoredError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
|
||||
#[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
|
||||
Oversized(u64),
|
||||
|
||||
#[error("Requested at invalid LSN: {0}")]
|
||||
#[error("requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
|
||||
#[error("Requested key not found: {0}")]
|
||||
#[error("requested key not found: {0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
|
||||
#[error(transparent)]
|
||||
GetReadyAncestorError(GetReadyAncestorError),
|
||||
#[error("ancestry walk")]
|
||||
GetReadyAncestorError(#[source] GetReadyAncestorError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<GetReadyAncestorError> for GetVectoredError {
|
||||
fn from(value: GetReadyAncestorError) -> Self {
|
||||
use GetReadyAncestorError::*;
|
||||
match value {
|
||||
Cancelled => GetVectoredError::Cancelled,
|
||||
AncestorLsnTimeout(_) | BadState { .. } => {
|
||||
GetVectoredError::GetReadyAncestorError(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetReadyAncestorError {
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
#[error("ancestor LSN wait error")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
#[error("Bad state on timeline {timeline_id}: {state:?}")]
|
||||
#[error("bad state on timeline {timeline_id}: {state:?}")]
|
||||
BadState {
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineState,
|
||||
},
|
||||
|
||||
#[error("Cancelled")]
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
@@ -1619,6 +1645,20 @@ impl Timeline {
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
if try_freeze_and_flush {
|
||||
if let Some((open, frozen)) = self
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
|
||||
.ok()
|
||||
.filter(|(open, frozen)| *open || *frozen > 0)
|
||||
{
|
||||
tracing::info!(?open, frozen, "flushing and freezing on shutdown");
|
||||
} else {
|
||||
// this is double-shutdown, ignore it
|
||||
}
|
||||
|
||||
// we shut down walreceiver above, so, we won't add anything more
|
||||
// to the InMemoryLayer; freeze it and wait for all frozen layers
|
||||
// to reach the disk & upload queue, then shut the upload queue and
|
||||
@@ -2241,7 +2281,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
async move {
|
||||
@@ -2595,7 +2635,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::InitialLogicalSizeCalculation,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"initial size calculation",
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
@@ -2763,7 +2803,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"ondemand logical size calculation",
|
||||
async move {
|
||||
@@ -2937,11 +2977,7 @@ impl Timeline {
|
||||
LayerVisibilityHint::Visible => {
|
||||
// Layer is visible to one or more read LSNs: elegible for inclusion in layer map
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
Some(HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
))
|
||||
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
|
||||
@@ -2950,7 +2986,23 @@ impl Timeline {
|
||||
}
|
||||
});
|
||||
|
||||
let layers = resident.collect();
|
||||
let mut layers = resident.collect::<Vec<_>>();
|
||||
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be in the resident many minutes
|
||||
// or hours later:
|
||||
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
|
||||
// only exist for a few minutes before being compacted into L1s.
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
|
||||
});
|
||||
|
||||
let layers = layers
|
||||
.into_iter()
|
||||
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
|
||||
.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
@@ -3046,8 +3098,7 @@ impl Timeline {
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
timeline_owned = timeline
|
||||
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::GetReadyAncestorError)?;
|
||||
.await?;
|
||||
timeline = &*timeline_owned;
|
||||
};
|
||||
|
||||
@@ -3944,7 +3995,7 @@ impl Timeline {
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(CreateImageLayersError::PageReconstructError(err));
|
||||
return Err(CreateImageLayersError::from(err));
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -4004,7 +4055,7 @@ impl Timeline {
|
||||
let mut total_kb_retrieved = 0;
|
||||
let mut total_keys_retrieved = 0;
|
||||
for (k, v) in data {
|
||||
let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
|
||||
let v = v?;
|
||||
total_kb_retrieved += KEY_SIZE + v.len();
|
||||
total_keys_retrieved += 1;
|
||||
new_data.insert(k, v);
|
||||
@@ -4477,6 +4528,7 @@ impl DurationRecorder {
|
||||
/// the layer descriptor requires the user to provide the ranges, which should cover all
|
||||
/// keys specified in the `data` field.
|
||||
#[cfg(test)]
|
||||
#[derive(Clone)]
|
||||
pub struct DeltaLayerTestDesc {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub key_range: Range<Key>,
|
||||
@@ -4506,6 +4558,13 @@ impl DeltaLayerTestDesc {
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn layer_name(&self) -> LayerName {
|
||||
LayerName::Delta(super::storage_layer::DeltaLayerName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -4515,7 +4574,12 @@ impl Timeline {
|
||||
new_images: &[ResidentLayer],
|
||||
layers_to_remove: &[Layer],
|
||||
) -> Result<(), CompactionError> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut guard = tokio::select! {
|
||||
guard = self.layers.write() => guard,
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
};
|
||||
|
||||
let mut duplicated_layers = HashSet::new();
|
||||
|
||||
@@ -5098,7 +5162,7 @@ impl Timeline {
|
||||
let task_id = task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::DownloadAllRemoteLayers,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"download all remote layers task",
|
||||
async move {
|
||||
@@ -5261,6 +5325,7 @@ impl Timeline {
|
||||
layer: layer.to_owned().into(),
|
||||
last_activity_ts,
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
visibility: layer.visibility(),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
@@ -5723,12 +5788,110 @@ fn is_send() {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::key::Key;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::tenant::{
|
||||
harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
|
||||
use crate::{
|
||||
repository::Value,
|
||||
tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
},
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_heatmap_generation() {
|
||||
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
|
||||
|
||||
let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x20),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x11),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x20),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x11),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
let l0_delta = DeltaLayerTestDesc::new(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
Key::from_hex("000000000000000000000000000000000000").unwrap()
|
||||
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x25),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
let delta_layers = vec![
|
||||
covered_delta.clone(),
|
||||
visible_delta.clone(),
|
||||
l0_delta.clone(),
|
||||
];
|
||||
|
||||
let image_layer = (
|
||||
Lsn(0x40),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
test_img("bar"),
|
||||
)],
|
||||
);
|
||||
let image_layers = vec![image_layer];
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Layer visibility is an input to heatmap generation, so refresh it first
|
||||
timeline.update_layer_visibility().await.unwrap();
|
||||
|
||||
let heatmap = timeline
|
||||
.generate_heatmap()
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
|
||||
|
||||
// L0 should come last
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
let mut last_lsn = Lsn::MAX;
|
||||
for layer in heatmap.layers {
|
||||
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
let layer_lsn = match &layer.name {
|
||||
LayerName::Delta(d) => d.lsn_range.end,
|
||||
LayerName::Image(i) => i.lsn,
|
||||
};
|
||||
|
||||
// Apart from L0s, newest Layers should come first
|
||||
if !LayerMap::is_l0(layer.name.key_range()) {
|
||||
assert!(layer_lsn <= last_lsn);
|
||||
last_lsn = layer_lsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn two_layer_eviction_attempts_at_the_same_time() {
|
||||
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
|
||||
|
||||
@@ -1048,11 +1048,22 @@ impl Timeline {
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
let mut next_hole = 0; // index of next hole in holes vector
|
||||
|
||||
let mut keys = 0;
|
||||
|
||||
while let Some((key, lsn, value)) = all_values_iter
|
||||
.next(ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
keys += 1;
|
||||
|
||||
if keys % 32_768 == 0 && self.cancel.is_cancelled() {
|
||||
// avoid hitting the cancellation token on every key. in benches, we end up
|
||||
// shuffling an order of million keys per layer, this means we'll check it
|
||||
// around tens of times per layer.
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
@@ -1157,6 +1168,8 @@ impl Timeline {
|
||||
.await
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
|
||||
keys = 0;
|
||||
}
|
||||
|
||||
writer
|
||||
@@ -2325,7 +2338,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
key_range,
|
||||
))
|
||||
} else {
|
||||
// The current compaction implementatin only ever requests the key space
|
||||
// The current compaction implementation only ever requests the key space
|
||||
// at the compaction end LSN.
|
||||
anyhow::bail!("keyspace not available for requested lsn");
|
||||
}
|
||||
|
||||
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
async move {
|
||||
|
||||
@@ -30,7 +30,8 @@ use crate::{
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
|
||||
storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -59,7 +60,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Eviction,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
&format!(
|
||||
"layer eviction for {}/{}",
|
||||
@@ -241,7 +242,22 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
no_activity_for > p.threshold
|
||||
match layer.visibility() {
|
||||
LayerVisibilityHint::Visible => {
|
||||
// Usual case: a visible layer might be read any time, and we will keep it
|
||||
// resident until it hits our configured TTL threshold.
|
||||
no_activity_for > p.threshold
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Covered layers: this is probably a layer that was recently covered by
|
||||
// an image layer during compaction. We don't evict it immediately, but
|
||||
// it doesn't stay resident for the full `threshold`: we just keep it
|
||||
// for a shorter time in case
|
||||
// - it is used for Timestamp->LSN lookups
|
||||
// - a new branch is created in recent history which will read this layer
|
||||
no_activity_for > p.period
|
||||
}
|
||||
}
|
||||
})
|
||||
.cloned()
|
||||
.for_each(|layer| {
|
||||
|
||||
@@ -17,6 +17,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ};
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::File;
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
@@ -50,6 +51,7 @@ pub(crate) mod owned_buffers_io {
|
||||
//! but for the time being we're proving out the primitives in the neon.git repo
|
||||
//! for faster iteration.
|
||||
|
||||
pub(crate) mod io_buf_ext;
|
||||
pub(crate) mod slice;
|
||||
pub(crate) mod write;
|
||||
pub(crate) mod util {
|
||||
@@ -637,24 +639,24 @@ impl VirtualFile {
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
pub async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
pub async fn write_all_at<Buf: IoBuf + Send>(
|
||||
&self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
mut offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return (Slice::into_inner(buf.slice_full()), Ok(()));
|
||||
}
|
||||
let mut buf = buf.slice(0..buf_len);
|
||||
) -> (FullSlice<Buf>, Result<(), Error>) {
|
||||
let buf = buf.into_raw_slice();
|
||||
let bounds = buf.bounds();
|
||||
let restore =
|
||||
|buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
|
||||
let mut buf = buf;
|
||||
while !buf.is_empty() {
|
||||
let res;
|
||||
(buf, res) = self.write_at(buf, offset, ctx).await;
|
||||
let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await;
|
||||
buf = tmp.into_raw_slice();
|
||||
match res {
|
||||
Ok(0) => {
|
||||
return (
|
||||
Slice::into_inner(buf),
|
||||
restore(buf),
|
||||
Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
@@ -666,33 +668,33 @@ impl VirtualFile {
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||
Err(e) => return (restore(buf), Err(e)),
|
||||
}
|
||||
}
|
||||
(Slice::into_inner(buf), Ok(()))
|
||||
(restore(buf), Ok(()))
|
||||
}
|
||||
|
||||
/// Writes `buf.slice(0..buf.bytes_init())`.
|
||||
/// Returns the IoBuf that is underlying the BoundedBuf `buf`.
|
||||
/// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
|
||||
/// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
|
||||
pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
/// Writes `buf` to the file at the current offset.
|
||||
///
|
||||
/// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller.
|
||||
pub async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> (B::Buf, Result<usize, Error>) {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
return (Slice::into_inner(buf.slice_full()), Ok(0));
|
||||
}
|
||||
let mut buf = buf.slice(0..nbytes);
|
||||
) -> (FullSlice<Buf>, Result<usize, Error>) {
|
||||
let buf = buf.into_raw_slice();
|
||||
let bounds = buf.bounds();
|
||||
let restore =
|
||||
|buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds));
|
||||
let nbytes = buf.len();
|
||||
let mut buf = buf;
|
||||
while !buf.is_empty() {
|
||||
let res;
|
||||
(buf, res) = self.write(buf, ctx).await;
|
||||
let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await;
|
||||
buf = tmp.into_raw_slice();
|
||||
match res {
|
||||
Ok(0) => {
|
||||
return (
|
||||
Slice::into_inner(buf),
|
||||
restore(buf),
|
||||
Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
@@ -703,17 +705,17 @@ impl VirtualFile {
|
||||
buf = buf.slice(n..);
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||
Err(e) => return (restore(buf), Err(e)),
|
||||
}
|
||||
}
|
||||
(Slice::into_inner(buf), Ok(nbytes))
|
||||
(restore(buf), Ok(nbytes))
|
||||
}
|
||||
|
||||
async fn write<B: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: Slice<B>,
|
||||
buf: FullSlice<B>,
|
||||
ctx: &RequestContext,
|
||||
) -> (Slice<B>, Result<usize, std::io::Error>) {
|
||||
) -> (FullSlice<B>, Result<usize, std::io::Error>) {
|
||||
let pos = self.pos;
|
||||
let (buf, res) = self.write_at(buf, pos, ctx).await;
|
||||
let n = match res {
|
||||
@@ -754,12 +756,24 @@ impl VirtualFile {
|
||||
})
|
||||
}
|
||||
|
||||
/// The function aborts the process if the error is fatal.
|
||||
async fn write_at<B: IoBuf + Send>(
|
||||
&self,
|
||||
buf: Slice<B>,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (Slice<B>, Result<usize, Error>) {
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
|
||||
let result = result.maybe_fatal_err("write_at");
|
||||
(slice, result)
|
||||
}
|
||||
|
||||
async fn write_at_inner<B: IoBuf + Send>(
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
Err(e) => return (buf, Err(e)),
|
||||
@@ -1093,11 +1107,11 @@ impl Drop for VirtualFile {
|
||||
|
||||
impl OwnedAsyncWriter for VirtualFile {
|
||||
#[inline(always)]
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
|
||||
res.map(move |v| (v, buf))
|
||||
}
|
||||
@@ -1159,7 +1173,8 @@ mod tests {
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
use super::*;
|
||||
use owned_buffers_io::slice::SliceExt;
|
||||
use owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use owned_buffers_io::slice::SliceMutExt;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
@@ -1193,9 +1208,9 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all_at<Buf: IoBuf + Send>(
|
||||
&self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), Error> {
|
||||
@@ -1204,13 +1219,7 @@ mod tests {
|
||||
let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
|
||||
res
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
file.write_all_at(&buf.slice(0..buf_len), offset)
|
||||
}
|
||||
MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset),
|
||||
}
|
||||
}
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
@@ -1219,9 +1228,9 @@ mod tests {
|
||||
MaybeVirtualFile::File(file) => file.seek(pos),
|
||||
}
|
||||
}
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), Error> {
|
||||
match self {
|
||||
@@ -1229,13 +1238,7 @@ mod tests {
|
||||
let (_buf, res) = file.write_all(buf, ctx).await;
|
||||
res.map(|_| ())
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
let buf_len = buf.bytes_init();
|
||||
if buf_len == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
file.write_all(&buf.slice(0..buf_len))
|
||||
}
|
||||
MaybeVirtualFile::File(file) => file.write_all(&buf[..]),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1347,7 +1350,9 @@ mod tests {
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
file_a.write_all(b"foobar".to_vec(), &ctx).await?;
|
||||
file_a
|
||||
.write_all(b"foobar".to_vec().slice_len(), &ctx)
|
||||
.await?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
let _ = file_a.read_string(&ctx).await.unwrap_err();
|
||||
@@ -1356,7 +1361,10 @@ mod tests {
|
||||
let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
|
||||
let _ = file_a
|
||||
.write_all(b"bar".to_vec().slice_len(), &ctx)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", file_a.read_string(&ctx).await?);
|
||||
@@ -1399,8 +1407,12 @@ mod tests {
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
|
||||
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
|
||||
file_b
|
||||
.write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
|
||||
.await?;
|
||||
file_b
|
||||
.write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
|
||||
.await?;
|
||||
|
||||
assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#[cfg(target_os = "linux")]
|
||||
pub(super) mod tokio_epoll_uring_ext;
|
||||
|
||||
use tokio_epoll_uring::{IoBuf, Slice};
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tracing::Instrument;
|
||||
|
||||
pub(crate) use super::api::IoEngineKind;
|
||||
@@ -107,7 +107,10 @@ use std::{
|
||||
sync::atomic::{AtomicU8, Ordering},
|
||||
};
|
||||
|
||||
use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata};
|
||||
use super::{
|
||||
owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt},
|
||||
FileGuard, Metadata,
|
||||
};
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
|
||||
@@ -206,8 +209,8 @@ impl IoEngine {
|
||||
&self,
|
||||
file_guard: FileGuard,
|
||||
offset: u64,
|
||||
buf: Slice<B>,
|
||||
) -> ((FileGuard, Slice<B>), std::io::Result<usize>) {
|
||||
buf: FullSlice<B>,
|
||||
) -> ((FileGuard, FullSlice<B>), std::io::Result<usize>) {
|
||||
match self {
|
||||
IoEngine::NotSet => panic!("not initialized"),
|
||||
IoEngine::StdFs => {
|
||||
@@ -217,8 +220,12 @@ impl IoEngine {
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.write(file_guard, offset, buf).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
let ((file_guard, slice), res) =
|
||||
system.write(file_guard, offset, buf.into_raw_slice()).await;
|
||||
(
|
||||
(file_guard, FullSlice::must_new(slice)),
|
||||
res.map_err(epoll_uring_error_to_std),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
78
pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
Normal file
78
pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
//! See [`FullSlice`].
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::ops::{Deref, Range};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
|
||||
/// The true owned equivalent for Rust [`slice`]. Use this for the write path.
|
||||
///
|
||||
/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`,
|
||||
/// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that
|
||||
/// [`<FullSlice as Deref<Target = [u8]>>::len`] is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`].
|
||||
///
|
||||
pub struct FullSlice<B> {
|
||||
slice: Slice<B>,
|
||||
}
|
||||
|
||||
impl<B> FullSlice<B>
|
||||
where
|
||||
B: IoBuf,
|
||||
{
|
||||
pub(crate) fn must_new(slice: Slice<B>) -> Self {
|
||||
assert_eq!(slice.bytes_init(), slice.bytes_total());
|
||||
FullSlice { slice }
|
||||
}
|
||||
pub(crate) fn into_raw_slice(self) -> Slice<B> {
|
||||
let FullSlice { slice: s } = self;
|
||||
s
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> Deref for FullSlice<B>
|
||||
where
|
||||
B: IoBuf,
|
||||
{
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &[u8] {
|
||||
let rust_slice = &self.slice[..];
|
||||
assert_eq!(rust_slice.len(), self.slice.bytes_init());
|
||||
assert_eq!(rust_slice.len(), self.slice.bytes_total());
|
||||
rust_slice
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait IoBufExt {
|
||||
/// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`.
|
||||
fn slice_len(self) -> FullSlice<Self>
|
||||
where
|
||||
Self: Sized;
|
||||
}
|
||||
|
||||
macro_rules! impl_io_buf_ext {
|
||||
($T:ty) => {
|
||||
impl IoBufExt for $T {
|
||||
#[inline(always)]
|
||||
fn slice_len(self) -> FullSlice<Self> {
|
||||
let len = self.len();
|
||||
let s = if len == 0 {
|
||||
// `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion,
|
||||
// causing a panic if len == 0.
|
||||
// The Slice::from_buf_bounds has the correct assertion (<= instead of <).
|
||||
// => https://github.com/neondatabase/tokio-epoll-uring/issues/46
|
||||
let slice = self.slice_full();
|
||||
let mut bounds: Range<_> = slice.bounds();
|
||||
bounds.end = bounds.start;
|
||||
Slice::from_buf_bounds(slice.into_inner(), bounds)
|
||||
} else {
|
||||
self.slice(0..len)
|
||||
};
|
||||
FullSlice::must_new(s)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
impl_io_buf_ext!(Bytes);
|
||||
impl_io_buf_ext!(BytesMut);
|
||||
impl_io_buf_ext!(Vec<u8>);
|
||||
@@ -3,14 +3,14 @@ use tokio_epoll_uring::BoundedBufMut;
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::Slice;
|
||||
|
||||
pub(crate) trait SliceExt {
|
||||
pub(crate) trait SliceMutExt {
|
||||
/// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO.
|
||||
///
|
||||
/// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]`
|
||||
fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8];
|
||||
}
|
||||
|
||||
impl<B> SliceExt for Slice<B>
|
||||
impl<B> SliceMutExt for Slice<B>
|
||||
where
|
||||
B: IoBufMut,
|
||||
{
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter},
|
||||
};
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
|
||||
pub struct Writer<W> {
|
||||
dst: W,
|
||||
@@ -35,11 +38,11 @@ where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
#[inline(always)]
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
|
||||
self.bytes_amount += u64::try_from(nwritten).unwrap();
|
||||
Ok((nwritten, buf))
|
||||
|
||||
@@ -1,16 +1,18 @@
|
||||
use bytes::BytesMut;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
|
||||
use super::io_buf_ext::{FullSlice, IoBufExt};
|
||||
|
||||
/// A trait for doing owned-buffer write IO.
|
||||
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
|
||||
pub trait OwnedAsyncWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)>;
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)>;
|
||||
}
|
||||
|
||||
/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
|
||||
@@ -79,9 +81,11 @@ where
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub async fn write_buffered<S: IoBuf + Send>(
|
||||
&mut self,
|
||||
chunk: Slice<S>,
|
||||
chunk: FullSlice<S>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, S)> {
|
||||
) -> std::io::Result<(usize, FullSlice<S>)> {
|
||||
let chunk = chunk.into_raw_slice();
|
||||
|
||||
let chunk_len = chunk.len();
|
||||
// avoid memcpy for the middle of the chunk
|
||||
if chunk.len() >= self.buf().cap() {
|
||||
@@ -94,7 +98,10 @@ where
|
||||
.pending(),
|
||||
0
|
||||
);
|
||||
let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?;
|
||||
let (nwritten, chunk) = self
|
||||
.writer
|
||||
.write_all(FullSlice::must_new(chunk), ctx)
|
||||
.await?;
|
||||
assert_eq!(nwritten, chunk_len);
|
||||
return Ok((nwritten, chunk));
|
||||
}
|
||||
@@ -114,7 +121,7 @@ where
|
||||
}
|
||||
}
|
||||
assert!(slice.is_empty(), "by now we should have drained the chunk");
|
||||
Ok((chunk_len, chunk.into_inner()))
|
||||
Ok((chunk_len, FullSlice::must_new(chunk)))
|
||||
}
|
||||
|
||||
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
|
||||
@@ -150,9 +157,12 @@ where
|
||||
self.buf = Some(buf);
|
||||
return Ok(());
|
||||
}
|
||||
let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?;
|
||||
let slice = buf.flush();
|
||||
let (nwritten, slice) = self.writer.write_all(slice, ctx).await?;
|
||||
assert_eq!(nwritten, buf_len);
|
||||
self.buf = Some(Buffer::reuse_after_flush(io_buf));
|
||||
self.buf = Some(Buffer::reuse_after_flush(
|
||||
slice.into_raw_slice().into_inner(),
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -172,9 +182,9 @@ pub trait Buffer {
|
||||
/// Number of bytes in the buffer.
|
||||
fn pending(&self) -> usize;
|
||||
|
||||
/// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data
|
||||
/// Turns `self` into a [`FullSlice`] of the pending data
|
||||
/// so we can use [`tokio_epoll_uring`] to write it to disk.
|
||||
fn flush(self) -> Slice<Self::IoBuf>;
|
||||
fn flush(self) -> FullSlice<Self::IoBuf>;
|
||||
|
||||
/// After the write to disk is done and we have gotten back the slice,
|
||||
/// [`BufferedWriter`] uses this method to re-use the io buffer.
|
||||
@@ -198,12 +208,8 @@ impl Buffer for BytesMut {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn flush(self) -> Slice<BytesMut> {
|
||||
if self.is_empty() {
|
||||
return self.slice_full();
|
||||
}
|
||||
let len = self.len();
|
||||
self.slice(0..len)
|
||||
fn flush(self) -> FullSlice<BytesMut> {
|
||||
self.slice_len()
|
||||
}
|
||||
|
||||
fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
|
||||
@@ -213,18 +219,13 @@ impl Buffer for BytesMut {
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Vec<u8> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
_: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
return Ok((0, Slice::into_inner(buf.slice_full())));
|
||||
}
|
||||
let buf = buf.slice(0..nbytes);
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
self.extend_from_slice(&buf[..]);
|
||||
Ok((buf.len(), Slice::into_inner(buf)))
|
||||
Ok((buf.len(), buf))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,19 +242,13 @@ mod tests {
|
||||
writes: Vec<Vec<u8>>,
|
||||
}
|
||||
impl OwnedAsyncWriter for RecorderWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
buf: FullSlice<Buf>,
|
||||
_: &RequestContext,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
self.writes.push(vec![]);
|
||||
return Ok((0, Slice::into_inner(buf.slice_full())));
|
||||
}
|
||||
let buf = buf.slice(0..nbytes);
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
self.writes.push(Vec::from(&buf[..]));
|
||||
Ok((buf.len(), Slice::into_inner(buf)))
|
||||
Ok((buf.len(), buf))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -264,7 +259,7 @@ mod tests {
|
||||
macro_rules! write {
|
||||
($writer:ident, $data:literal) => {{
|
||||
$writer
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx())
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx())
|
||||
.await?;
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -235,6 +235,7 @@ impl WalIngest {
|
||||
&parsed_xact,
|
||||
info == pg_constants::XLOG_XACT_COMMIT,
|
||||
decoded.origin_id,
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -248,6 +249,7 @@ impl WalIngest {
|
||||
&parsed_xact,
|
||||
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
|
||||
decoded.origin_id,
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -408,11 +410,16 @@ impl WalIngest {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_REPLORIGIN_SET {
|
||||
let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf);
|
||||
info!(
|
||||
"Set replication origin_id={}, origin_lsn={}",
|
||||
xlrec.node_id, xlrec.remote_lsn
|
||||
);
|
||||
modification
|
||||
.set_replorigin(xlrec.node_id, xlrec.remote_lsn)
|
||||
.await?
|
||||
} else if info == pg_constants::XLOG_REPLORIGIN_DROP {
|
||||
let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf);
|
||||
info!("Drop replication origin_id={}", xlrec.node_id);
|
||||
modification.drop_replorigin(xlrec.node_id).await?
|
||||
}
|
||||
}
|
||||
@@ -515,7 +522,7 @@ impl WalIngest {
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
// compression of WAL is not yet supported: fall back to storing the original WAL record
|
||||
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
|
||||
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
|
||||
// do not materialize null pages because them most likely be soon replaced with real data
|
||||
&& blk.bimg_len != 0
|
||||
{
|
||||
@@ -1220,6 +1227,7 @@ impl WalIngest {
|
||||
parsed: &XlXactParsedRecord,
|
||||
is_commit: bool,
|
||||
origin_id: u16,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Record update of CLOG pages
|
||||
@@ -1286,6 +1294,10 @@ impl WalIngest {
|
||||
}
|
||||
}
|
||||
if origin_id != 0 {
|
||||
info!(
|
||||
"Commit at {} origin_id={}, origin_lsn={}",
|
||||
lsn, origin_id, parsed.origin_lsn
|
||||
);
|
||||
modification
|
||||
.set_replorigin(origin_id, parsed.origin_lsn)
|
||||
.await?;
|
||||
@@ -1702,7 +1714,7 @@ async fn get_relsize(
|
||||
modification: &DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<BlockNumber> {
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let nblocks = if !modification
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(modification), ctx)
|
||||
|
||||
@@ -1018,7 +1018,7 @@ pub fn decode_wal_record(
|
||||
);
|
||||
|
||||
let blk_img_is_compressed =
|
||||
postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?;
|
||||
postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version);
|
||||
|
||||
if blk_img_is_compressed {
|
||||
debug!("compressed block image , pg_version = {}", pg_version);
|
||||
|
||||
@@ -41,6 +41,8 @@
|
||||
|
||||
#include "hll.h"
|
||||
|
||||
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
|
||||
|
||||
/*
|
||||
* Local file cache is used to temporary store relations pages in local file system.
|
||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||
@@ -51,19 +53,43 @@
|
||||
*
|
||||
* Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
|
||||
* its consistency.
|
||||
|
||||
*
|
||||
* ## Holes
|
||||
*
|
||||
* The LFC can be resized on the fly, up to a maximum size that's determined
|
||||
* at server startup (neon.max_file_cache_size). After server startup, we
|
||||
* expand the underlying file when needed, until it reaches the soft limit
|
||||
* (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
|
||||
* the LFC by punching holes in the underlying file with a
|
||||
* fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
|
||||
* shrink, but the disk space it uses does.
|
||||
*
|
||||
* Each hole is tracked by a dummy FileCacheEntry, which are kept in the
|
||||
* 'holes' linked list. They are entered into the chunk hash table, with a
|
||||
* special key where the blockNumber is used to store the 'offset' of the
|
||||
* hole, and all other fields are zero. Holes are never looked up in the hash
|
||||
* table, we only enter them there to have a FileCacheEntry that we can keep
|
||||
* in the linked list. If the soft limit is raised again, we reuse the holes
|
||||
* before extending the nominal size of the file.
|
||||
*/
|
||||
|
||||
/* Local file storage allocation chunk.
|
||||
* Should be power of two and not less than 32. Using larger than page chunks can
|
||||
* Should be power of two. Using larger than page chunks can
|
||||
* 1. Reduce hash-map memory footprint: 8TB database contains billion pages
|
||||
* and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
|
||||
* 1Mb chunks can reduce hash map size to 320Mb.
|
||||
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
|
||||
*/
|
||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||
/*
|
||||
* Smaller chunk seems to be better for OLTP workload
|
||||
*/
|
||||
// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */
|
||||
#define MB ((uint64)1024*1024)
|
||||
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||
#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)
|
||||
|
||||
typedef struct FileCacheEntry
|
||||
{
|
||||
@@ -71,8 +97,8 @@ typedef struct FileCacheEntry
|
||||
uint32 hash;
|
||||
uint32 offset;
|
||||
uint32 access_count;
|
||||
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
|
||||
dlist_node lru_node; /* LRU list node */
|
||||
uint32 bitmap[CHUNK_BITMAP_SIZE];
|
||||
dlist_node list_node; /* LRU/holes list node */
|
||||
} FileCacheEntry;
|
||||
|
||||
typedef struct FileCacheControl
|
||||
@@ -87,6 +113,7 @@ typedef struct FileCacheControl
|
||||
uint64 writes;
|
||||
dlist_head lru; /* double linked list for LRU replacement
|
||||
* algorithm */
|
||||
dlist_head holes; /* double linked list of punched holes */
|
||||
HyperLogLogState wss_estimation; /* estimation of working set size */
|
||||
} FileCacheControl;
|
||||
|
||||
@@ -135,6 +162,7 @@ lfc_disable(char const *op)
|
||||
lfc_ctl->used = 0;
|
||||
lfc_ctl->limit = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
dlist_init(&lfc_ctl->holes);
|
||||
|
||||
if (lfc_desc > 0)
|
||||
{
|
||||
@@ -214,18 +242,18 @@ lfc_shmem_startup(void)
|
||||
if (!found)
|
||||
{
|
||||
int fd;
|
||||
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
|
||||
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
|
||||
info.keysize = sizeof(BufferTag);
|
||||
info.entrysize = sizeof(FileCacheEntry);
|
||||
|
||||
/*
|
||||
* lfc_size+1 because we add new element to hash table before eviction
|
||||
* n_chunks+1 because we add new element to hash table before eviction
|
||||
* of victim
|
||||
*/
|
||||
lfc_hash = ShmemInitHash("lfc_hash",
|
||||
lfc_size + 1, lfc_size + 1,
|
||||
n_chunks + 1, n_chunks + 1,
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
lfc_ctl->generation = 0;
|
||||
@@ -235,6 +263,7 @@ lfc_shmem_startup(void)
|
||||
lfc_ctl->misses = 0;
|
||||
lfc_ctl->writes = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
dlist_init(&lfc_ctl->holes);
|
||||
|
||||
/* Initialize hyper-log-log structure for estimating working set size */
|
||||
initSHLL(&lfc_ctl->wss_estimation);
|
||||
@@ -310,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
* Shrink cache by throwing away least recently accessed chunks and
|
||||
* returning their space to file system
|
||||
*/
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *hole;
|
||||
uint32 offset = victim->offset;
|
||||
uint32 hash;
|
||||
bool found;
|
||||
BufferTag holetag;
|
||||
|
||||
Assert(victim->access_count == 0);
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
#ifdef FALLOC_FL_PUNCH_HOLE
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
|
||||
neon_log(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
/* We remove the old entry, and re-enter a hole to the hash table */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
|
||||
memset(&holetag, 0, sizeof(holetag));
|
||||
holetag.blockNum = offset;
|
||||
hash = get_hash_value(lfc_hash, &holetag);
|
||||
hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
|
||||
hole->hash = hash;
|
||||
hole->offset = offset;
|
||||
hole->access_count = 0;
|
||||
CriticalAssert(!found);
|
||||
dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
|
||||
|
||||
lfc_ctl->used -= 1;
|
||||
}
|
||||
lfc_ctl->limit = new_size;
|
||||
@@ -409,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
@@ -440,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -470,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
bool has_remaining_pages;
|
||||
|
||||
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
|
||||
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
|
||||
{
|
||||
if (entry->bitmap[i] != 0)
|
||||
{
|
||||
@@ -485,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
*/
|
||||
if (!has_remaining_pages)
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
|
||||
dlist_delete(&entry->list_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->list_node);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -525,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -551,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
/* Unlink entry from LRU list to pin it for the duration of IO operation */
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_delete(&entry->list_node);
|
||||
generation = lfc_ctl->generation;
|
||||
entry_offset = entry->offset;
|
||||
|
||||
@@ -569,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
if (lfc_ctl->generation == generation)
|
||||
{
|
||||
Assert(LFC_ENABLED());
|
||||
CriticalAssert(LFC_ENABLED());
|
||||
lfc_ctl->hits += 1;
|
||||
pgBufferUsage.file_cache.hits += 1;
|
||||
Assert(entry->access_count > 0);
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
if (--entry->access_count == 0)
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
|
||||
}
|
||||
else
|
||||
result = false;
|
||||
@@ -613,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -632,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
* operation
|
||||
*/
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_delete(&entry->list_node);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -655,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
Assert(victim->access_count == 0);
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
neon_log(DEBUG2, "Swap file cache page");
|
||||
}
|
||||
else if (!dlist_is_empty(&lfc_ctl->holes))
|
||||
{
|
||||
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
|
||||
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
|
||||
uint32 offset = hole->offset;
|
||||
bool found;
|
||||
|
||||
hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
|
||||
CriticalAssert(found);
|
||||
|
||||
lfc_ctl->used += 1;
|
||||
entry->offset = offset; /* reuse the hole */
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_ctl->used += 1;
|
||||
@@ -689,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
|
||||
if (lfc_ctl->generation == generation)
|
||||
{
|
||||
Assert(LFC_ENABLED());
|
||||
CriticalAssert(LFC_ENABLED());
|
||||
/* Place entry to the head of LRU list */
|
||||
Assert(entry->access_count > 0);
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
if (--entry->access_count == 0)
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
|
||||
|
||||
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
|
||||
}
|
||||
@@ -708,7 +774,6 @@ typedef struct
|
||||
} NeonGetStatsCtx;
|
||||
|
||||
#define NUM_NEON_GET_STATS_COLS 2
|
||||
#define NUM_NEON_GET_STATS_ROWS 3
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
|
||||
Datum
|
||||
@@ -744,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
INT8OID, -1, 0);
|
||||
|
||||
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
||||
funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
@@ -778,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->writes;
|
||||
break;
|
||||
case 4:
|
||||
key = "file_cache_size";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
@@ -901,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
|
||||
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
|
||||
n_pages += pg_popcount32(entry->bitmap[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,6 +192,13 @@ LogicalSlotsMonitorMain(Datum main_arg)
|
||||
{
|
||||
XLogRecPtr cutoff_lsn;
|
||||
|
||||
/* In case of a SIGHUP, just reload the configuration. */
|
||||
if (ConfigReloadPending)
|
||||
{
|
||||
ConfigReloadPending = false;
|
||||
ProcessConfigFile(PGC_SIGHUP);
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are too many .snap files, just drop all logical slots to
|
||||
* prevent aux files bloat.
|
||||
|
||||
@@ -54,6 +54,10 @@
|
||||
|
||||
#define BufTagGetNRelFileInfo(tag) tag.rnode
|
||||
|
||||
#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
|
||||
|
||||
#define InvalidRelFileNumber InvalidOid
|
||||
|
||||
#define SMgrRelGetRelInfo(reln) \
|
||||
(reln->smgr_rnode.node)
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ testing = []
|
||||
[dependencies]
|
||||
ahash.workspace = true
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
atomic-take.workspace = true
|
||||
@@ -73,7 +74,7 @@ rustls.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sha2 = { workspace = true, features = ["asm"] }
|
||||
sha2 = { workspace = true, features = ["asm", "oid"] }
|
||||
smol_str.workspace = true
|
||||
smallvec.workspace = true
|
||||
socket2.workspace = true
|
||||
@@ -103,6 +104,14 @@ x509-parser.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
redis.workspace = true
|
||||
|
||||
# jwt stuff
|
||||
jose-jwa = "0.1.2"
|
||||
jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
|
||||
signature = "2"
|
||||
ecdsa = "0.16"
|
||||
p256 = "0.13"
|
||||
rsa = "0.9"
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -113,38 +113,36 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
|
||||
|
||||
impl UserFacingError for AuthError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use AuthErrorImpl::*;
|
||||
match self.0.as_ref() {
|
||||
Link(e) => e.to_string_client(),
|
||||
GetAuthInfo(e) => e.to_string_client(),
|
||||
Sasl(e) => e.to_string_client(),
|
||||
AuthFailed(_) => self.to_string(),
|
||||
BadAuthMethod(_) => self.to_string(),
|
||||
MalformedPassword(_) => self.to_string(),
|
||||
MissingEndpointName => self.to_string(),
|
||||
Io(_) => "Internal error".to_string(),
|
||||
IpAddressNotAllowed(_) => self.to_string(),
|
||||
TooManyConnections => self.to_string(),
|
||||
UserTimeout(_) => self.to_string(),
|
||||
AuthErrorImpl::Link(e) => e.to_string_client(),
|
||||
AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
|
||||
AuthErrorImpl::Sasl(e) => e.to_string_client(),
|
||||
AuthErrorImpl::AuthFailed(_) => self.to_string(),
|
||||
AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
|
||||
AuthErrorImpl::MalformedPassword(_) => self.to_string(),
|
||||
AuthErrorImpl::MissingEndpointName => self.to_string(),
|
||||
AuthErrorImpl::Io(_) => "Internal error".to_string(),
|
||||
AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
|
||||
AuthErrorImpl::TooManyConnections => self.to_string(),
|
||||
AuthErrorImpl::UserTimeout(_) => self.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ReportableError for AuthError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
use AuthErrorImpl::*;
|
||||
match self.0.as_ref() {
|
||||
Link(e) => e.get_error_kind(),
|
||||
GetAuthInfo(e) => e.get_error_kind(),
|
||||
Sasl(e) => e.get_error_kind(),
|
||||
AuthFailed(_) => crate::error::ErrorKind::User,
|
||||
BadAuthMethod(_) => crate::error::ErrorKind::User,
|
||||
MalformedPassword(_) => crate::error::ErrorKind::User,
|
||||
MissingEndpointName => crate::error::ErrorKind::User,
|
||||
Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
|
||||
TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
UserTimeout(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::Link(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::Sasl(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
mod classic;
|
||||
mod hacks;
|
||||
pub mod jwt;
|
||||
mod link;
|
||||
|
||||
use std::net::IpAddr;
|
||||
@@ -79,9 +80,8 @@ pub trait TestBackend: Send + Sync + 'static {
|
||||
|
||||
impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, _) => match &**api {
|
||||
Self::Console(api, _) => match &**api {
|
||||
ConsoleBackend::Console(endpoint) => {
|
||||
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
|
||||
}
|
||||
@@ -92,7 +92,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||
#[cfg(test)]
|
||||
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
|
||||
},
|
||||
Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||
Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -101,10 +101,9 @@ impl<T, D> BackendType<'_, T, D> {
|
||||
/// Very similar to [`std::option::Option::as_ref`].
|
||||
/// This helps us pass structured config to async tasks.
|
||||
pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
|
||||
Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
|
||||
Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
|
||||
Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -114,10 +113,9 @@ impl<'a, T, D> BackendType<'a, T, D> {
|
||||
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
|
||||
/// a function to a contained value.
|
||||
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => Console(c, f(x)),
|
||||
Link(c, x) => Link(c, x),
|
||||
Self::Console(c, x) => BackendType::Console(c, f(x)),
|
||||
Self::Link(c, x) => BackendType::Link(c, x),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -125,10 +123,9 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
|
||||
/// Very similar to [`std::option::Option::transpose`].
|
||||
/// This is most useful for error handling.
|
||||
pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(c, x) => x.map(|x| Console(c, x)),
|
||||
Link(c, x) => Ok(Link(c, x)),
|
||||
Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
|
||||
Self::Link(c, x) => Ok(BackendType::Link(c, x)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -292,7 +289,9 @@ async fn auth_quirks(
|
||||
ctx.set_endpoint_id(res.info.endpoint.clone());
|
||||
let password = match res.keys {
|
||||
ComputeCredentialKeys::Password(p) => p,
|
||||
_ => unreachable!("password hack should return a password"),
|
||||
ComputeCredentialKeys::AuthKeys(_) => {
|
||||
unreachable!("password hack should return a password")
|
||||
}
|
||||
};
|
||||
(res.info, Some(password))
|
||||
}
|
||||
@@ -399,21 +398,17 @@ async fn authenticate_with_secret(
|
||||
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
/// Get compute endpoint name from the credentials.
|
||||
pub fn get_endpoint(&self) -> Option<EndpointId> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(_, user_info) => user_info.endpoint_id.clone(),
|
||||
Link(_, _) => Some("link".into()),
|
||||
Self::Console(_, user_info) => user_info.endpoint_id.clone(),
|
||||
Self::Link(_, _) => Some("link".into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get username from the credentials.
|
||||
pub fn get_user(&self) -> &str {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(_, user_info) => &user_info.user,
|
||||
Link(_, _) => "link",
|
||||
Self::Console(_, user_info) => &user_info.user,
|
||||
Self::Link(_, _) => "link",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -427,10 +422,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
config: &'static AuthenticationConfig,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
|
||||
use BackendType::*;
|
||||
|
||||
let res = match self {
|
||||
Console(api, user_info) => {
|
||||
Self::Console(api, user_info) => {
|
||||
info!(
|
||||
user = &*user_info.user,
|
||||
project = user_info.endpoint(),
|
||||
@@ -450,7 +443,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
BackendType::Console(api, credentials)
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url, _) => {
|
||||
Self::Link(url, _) => {
|
||||
info!("performing link authentication");
|
||||
|
||||
let info = link::authenticate(ctx, &url, client).await?;
|
||||
@@ -469,10 +462,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
||||
Link(_, _) => Ok(Cached::new_uncached(None)),
|
||||
Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
||||
Self::Link(_, _) => Ok(Cached::new_uncached(None)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -480,10 +472,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -494,18 +485,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Link(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
|
||||
match self {
|
||||
BackendType::Console(_, creds) => Some(&creds.keys),
|
||||
BackendType::Link(_, _) => None,
|
||||
Self::Console(_, creds) => Some(&creds.keys),
|
||||
Self::Link(_, _) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -516,18 +505,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
|
||||
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
|
||||
match self {
|
||||
BackendType::Console(_, creds) => Some(&creds.keys),
|
||||
BackendType::Link(_, _) => None,
|
||||
Self::Console(_, creds) => Some(&creds.keys),
|
||||
Self::Link(_, _) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
556
proxy/src/auth/backend/jwt.rs
Normal file
556
proxy/src/auth/backend/jwt.rs
Normal file
@@ -0,0 +1,556 @@
|
||||
use std::{future::Future, sync::Arc, time::Duration};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use arc_swap::ArcSwapOption;
|
||||
use dashmap::DashMap;
|
||||
use jose_jwk::crypto::KeyInfo;
|
||||
use signature::Verifier;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
|
||||
|
||||
// TODO(conrad): make these configurable.
|
||||
const MIN_RENEW: Duration = Duration::from_secs(30);
|
||||
const AUTO_RENEW: Duration = Duration::from_secs(300);
|
||||
const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||
|
||||
/// How to get the JWT auth rules
|
||||
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct FetchAuthRulesFromCplane {
|
||||
#[allow(dead_code)]
|
||||
endpoint: EndpointIdInt,
|
||||
}
|
||||
|
||||
impl FetchAuthRules for FetchAuthRulesFromCplane {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Err(anyhow::anyhow!("not yet implemented"))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AuthRules {
|
||||
jwks_urls: Vec<url::Url>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
|
||||
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntryLock {
|
||||
cached: ArcSwapOption<JwkCacheEntry>,
|
||||
lookup: tokio::sync::Semaphore,
|
||||
}
|
||||
|
||||
impl Default for JwkCacheEntryLock {
|
||||
fn default() -> Self {
|
||||
JwkCacheEntryLock {
|
||||
cached: ArcSwapOption::empty(),
|
||||
lookup: tokio::sync::Semaphore::new(1),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen only every 5 minutes or so
|
||||
last_retrieved: Instant,
|
||||
|
||||
/// cplane will return multiple JWKs urls that we need to scrape.
|
||||
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntryLock {
|
||||
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
|
||||
JwkRenewalPermit::acquire_permit(self).await
|
||||
}
|
||||
|
||||
fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
|
||||
JwkRenewalPermit::try_acquire_permit(self)
|
||||
}
|
||||
|
||||
async fn renew_jwks<F: FetchAuthRules>(
|
||||
&self,
|
||||
_permit: JwkRenewalPermit<'_>,
|
||||
client: &reqwest::Client,
|
||||
auth_rules: &F,
|
||||
) -> anyhow::Result<Arc<JwkCacheEntry>> {
|
||||
// double check that no one beat us to updating the cache.
|
||||
let now = Instant::now();
|
||||
let guard = self.cached.load_full();
|
||||
if let Some(cached) = guard {
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
if last_update < Duration::from_secs(300) {
|
||||
return Ok(cached);
|
||||
}
|
||||
}
|
||||
|
||||
let rules = auth_rules.fetch_auth_rules().await?;
|
||||
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
|
||||
rules.jwks_urls.len(),
|
||||
ahash::RandomState::new(),
|
||||
);
|
||||
// TODO(conrad): run concurrently
|
||||
// TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
|
||||
for url in rules.jwks_urls {
|
||||
let req = client.get(url.clone());
|
||||
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
|
||||
match req.send().await.and_then(|r| r.error_for_status()) {
|
||||
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
|
||||
// I expect these failures would be quite sparse.
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
|
||||
Ok(r) => {
|
||||
let resp: http::Response<reqwest::Body> = r.into();
|
||||
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
|
||||
resp.into_body(),
|
||||
MAX_JWK_BODY_SIZE,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
|
||||
Ok(jwks) => {
|
||||
key_sets.insert(url, jwks);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let entry = Arc::new(JwkCacheEntry {
|
||||
last_retrieved: now,
|
||||
key_sets,
|
||||
});
|
||||
self.cached.swap(Some(Arc::clone(&entry)));
|
||||
|
||||
Ok(entry)
|
||||
}
|
||||
|
||||
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
client: &reqwest::Client,
|
||||
fetch: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
|
||||
let now = Instant::now();
|
||||
let guard = self.cached.load_full();
|
||||
|
||||
// if we have no cached JWKs, try and get some
|
||||
let Some(cached) = guard else {
|
||||
let permit = self.acquire_permit().await;
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
};
|
||||
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
|
||||
// check if the cached JWKs need updating.
|
||||
if last_update > MAX_RENEW {
|
||||
let permit = self.acquire_permit().await;
|
||||
|
||||
// it's been too long since we checked the keys. wait for them to update.
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
}
|
||||
|
||||
// every 5 minutes we should spawn a job to eagerly update the token.
|
||||
if last_update > AUTO_RENEW {
|
||||
if let Some(permit) = self.try_acquire_permit() {
|
||||
tracing::debug!("JWKs should be renewed. Renewal permit acquired");
|
||||
let permit = permit.into_owned();
|
||||
let entry = self.clone();
|
||||
let client = client.clone();
|
||||
let fetch = fetch.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
|
||||
tracing::warn!(error=?e, "could not fetch JWKs in background job");
|
||||
}
|
||||
});
|
||||
} else {
|
||||
tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(cached)
|
||||
}
|
||||
|
||||
async fn check_jwt<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
jwt: String,
|
||||
client: &reqwest::Client,
|
||||
fetch: &F,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// JWT compact form is defined to be
|
||||
// <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
|
||||
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
|
||||
|
||||
let (header_payload, signature) = jwt
|
||||
.rsplit_once(".")
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let (header, _payload) = header_payload
|
||||
.split_once(".")
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
ensure!(header.typ == "JWT");
|
||||
let kid = header.kid.context("missing key id")?;
|
||||
|
||||
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let jwk = loop {
|
||||
let jwk = guard
|
||||
.key_sets
|
||||
.values()
|
||||
.flat_map(|jwks| &jwks.keys)
|
||||
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
|
||||
|
||||
match jwk {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let permit = self.acquire_permit().await;
|
||||
guard = self.renew_jwks(permit, client, fetch).await?;
|
||||
}
|
||||
_ => {
|
||||
bail!("jwk not found");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ensure!(
|
||||
jwk.is_supported(&header.alg),
|
||||
"signature algorithm not supported"
|
||||
);
|
||||
|
||||
match &jwk.key {
|
||||
jose_jwk::Key::Ec(key) => {
|
||||
verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
|
||||
}
|
||||
jose_jwk::Key::Rsa(key) => {
|
||||
verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
|
||||
}
|
||||
key => bail!("unsupported key type {key:?}"),
|
||||
};
|
||||
|
||||
// TODO(conrad): verify iss, exp, nbf, etc...
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl JwkCache {
|
||||
pub async fn check_jwt(
|
||||
&self,
|
||||
endpoint: EndpointIdInt,
|
||||
jwt: String,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// try with just a read lock first
|
||||
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
|
||||
let entry = match entry {
|
||||
Some(entry) => entry,
|
||||
None => {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(endpoint).or_default();
|
||||
Arc::clone(&*entry)
|
||||
}
|
||||
};
|
||||
|
||||
let fetch = FetchAuthRulesFromCplane { endpoint };
|
||||
entry.check_jwt(jwt, &self.client, &fetch).await
|
||||
}
|
||||
}
|
||||
|
||||
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
|
||||
use ecdsa::Signature;
|
||||
use signature::Verifier;
|
||||
|
||||
match key.crv {
|
||||
jose_jwk::EcCurves::P256 => {
|
||||
let pk =
|
||||
p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
|
||||
let key = p256::ecdsa::VerifyingKey::from(&pk);
|
||||
let sig = Signature::from_slice(sig)?;
|
||||
key.verify(data, &sig)?;
|
||||
}
|
||||
key => bail!("unsupported ec key type {key:?}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn verify_rsa_signature(
|
||||
data: &[u8],
|
||||
sig: &[u8],
|
||||
key: &jose_jwk::Rsa,
|
||||
alg: &Option<jose_jwa::Algorithm>,
|
||||
) -> anyhow::Result<()> {
|
||||
use jose_jwa::{Algorithm, Signing};
|
||||
use rsa::{
|
||||
pkcs1v15::{Signature, VerifyingKey},
|
||||
RsaPublicKey,
|
||||
};
|
||||
|
||||
let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
|
||||
|
||||
match alg {
|
||||
Some(Algorithm::Signing(Signing::Rs256)) => {
|
||||
let key = VerifyingKey::<sha2::Sha256>::new(key);
|
||||
let sig = Signature::try_from(sig)?;
|
||||
key.verify(data, &sig)?;
|
||||
}
|
||||
_ => bail!("invalid RSA signing algorithm"),
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
|
||||
#[derive(serde::Deserialize, serde::Serialize)]
|
||||
struct JWTHeader<'a> {
|
||||
/// must be "JWT"
|
||||
typ: &'a str,
|
||||
/// must be a supported alg
|
||||
alg: jose_jwa::Algorithm,
|
||||
/// key id, must be provided for our usecase
|
||||
kid: Option<&'a str>,
|
||||
}
|
||||
|
||||
struct JwkRenewalPermit<'a> {
|
||||
inner: Option<JwkRenewalPermitInner<'a>>,
|
||||
}
|
||||
|
||||
enum JwkRenewalPermitInner<'a> {
|
||||
Owned(Arc<JwkCacheEntryLock>),
|
||||
Borrowed(&'a Arc<JwkCacheEntryLock>),
|
||||
}
|
||||
|
||||
impl JwkRenewalPermit<'_> {
|
||||
fn into_owned(mut self) -> JwkRenewalPermit<'static> {
|
||||
JwkRenewalPermit {
|
||||
inner: self.inner.take().map(JwkRenewalPermitInner::into_owned),
|
||||
}
|
||||
}
|
||||
|
||||
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
|
||||
match from.lookup.acquire().await {
|
||||
Ok(permit) => {
|
||||
permit.forget();
|
||||
JwkRenewalPermit {
|
||||
inner: Some(JwkRenewalPermitInner::Borrowed(from)),
|
||||
}
|
||||
}
|
||||
Err(_) => panic!("semaphore should not be closed"),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
|
||||
match from.lookup.try_acquire() {
|
||||
Ok(permit) => {
|
||||
permit.forget();
|
||||
Some(JwkRenewalPermit {
|
||||
inner: Some(JwkRenewalPermitInner::Borrowed(from)),
|
||||
})
|
||||
}
|
||||
Err(tokio::sync::TryAcquireError::NoPermits) => None,
|
||||
Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl JwkRenewalPermitInner<'_> {
|
||||
fn into_owned(self) -> JwkRenewalPermitInner<'static> {
|
||||
match self {
|
||||
JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p),
|
||||
JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for JwkRenewalPermit<'_> {
|
||||
fn drop(&mut self) {
|
||||
let entry = match &self.inner {
|
||||
None => return,
|
||||
Some(JwkRenewalPermitInner::Owned(p)) => p,
|
||||
Some(JwkRenewalPermitInner::Borrowed(p)) => *p,
|
||||
};
|
||||
entry.lookup.add_permits(1);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
|
||||
|
||||
use base64::URL_SAFE_NO_PAD;
|
||||
use bytes::Bytes;
|
||||
use http::Response;
|
||||
use http_body_util::Full;
|
||||
use hyper1::service::service_fn;
|
||||
use hyper_util::rt::TokioIo;
|
||||
use rand::rngs::OsRng;
|
||||
use signature::Signer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
|
||||
let sk = p256::SecretKey::random(&mut OsRng);
|
||||
let pk = sk.public_key().into();
|
||||
let jwk = jose_jwk::Jwk {
|
||||
key: jose_jwk::Key::Ec(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
(sk, jwk)
|
||||
}
|
||||
|
||||
fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
|
||||
let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
|
||||
let pk = sk.to_public_key().into();
|
||||
let jwk = jose_jwk::Jwk {
|
||||
key: jose_jwk::Key::Rsa(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
(sk, jwk)
|
||||
}
|
||||
|
||||
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
|
||||
let header = JWTHeader {
|
||||
typ: "JWT",
|
||||
alg: jose_jwa::Algorithm::Signing(sig),
|
||||
kid: Some(&kid),
|
||||
};
|
||||
let body = typed_json::json! {{
|
||||
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
|
||||
}};
|
||||
|
||||
let header =
|
||||
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
|
||||
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{header}.{body}")
|
||||
}
|
||||
|
||||
fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
|
||||
let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
|
||||
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
|
||||
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
|
||||
use rsa::pkcs1v15::SigningKey;
|
||||
use rsa::signature::SignatureEncoding;
|
||||
|
||||
let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
|
||||
let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
|
||||
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn renew() {
|
||||
let (rs1, jwk1) = new_rsa_jwk("1".into());
|
||||
let (rs2, jwk2) = new_rsa_jwk("2".into());
|
||||
let (ec1, jwk3) = new_ec_jwk("3".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("4".into());
|
||||
|
||||
let jwt1 = new_rsa_jwt("1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("3".into(), ec1);
|
||||
let jwt4 = new_ec_jwt("4".into(), ec2);
|
||||
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
let bar_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk2, jwk4],
|
||||
};
|
||||
|
||||
let service = service_fn(move |req| {
|
||||
let foo_jwks = foo_jwks.clone();
|
||||
let bar_jwks = bar_jwks.clone();
|
||||
async move {
|
||||
let jwks = match req.uri().path() {
|
||||
"/foo" => &foo_jwks,
|
||||
"/bar" => &bar_jwks,
|
||||
_ => {
|
||||
return Response::builder()
|
||||
.status(404)
|
||||
.body(Full::new(Bytes::new()));
|
||||
}
|
||||
};
|
||||
let body = serde_json::to_vec(jwks).unwrap();
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.body(Full::new(Bytes::from(body)))
|
||||
}
|
||||
});
|
||||
|
||||
let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
|
||||
let server = hyper1::server::conn::http1::Builder::new();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
let (s, _) = listener.accept().await.unwrap();
|
||||
let serve = server.serve_connection(TokioIo::new(s), service.clone());
|
||||
tokio::spawn(serve.into_future());
|
||||
}
|
||||
});
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Fetch(SocketAddr);
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Ok(AuthRules {
|
||||
jwks_urls: vec![
|
||||
format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
jwk_cache
|
||||
.check_jwt(jwt1, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt2, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt3, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt4, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
@@ -89,10 +89,12 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
sni: Option<&str>,
|
||||
common_names: Option<&HashSet<String>>,
|
||||
) -> Result<Self, ComputeUserInfoParseError> {
|
||||
use ComputeUserInfoParseError::*;
|
||||
|
||||
// Some parameters are stored in the startup message.
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let get_param = |key| {
|
||||
params
|
||||
.get(key)
|
||||
.ok_or(ComputeUserInfoParseError::MissingKey(key))
|
||||
};
|
||||
let user: RoleName = get_param("user")?.into();
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
@@ -122,11 +124,14 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
let endpoint = match (endpoint_option, endpoint_from_domain) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
(Some(option), Some(domain)) if option != domain => {
|
||||
Some(Err(InconsistentProjectNames { domain, option }))
|
||||
Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
|
||||
domain,
|
||||
option,
|
||||
}))
|
||||
}
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
|
||||
false => Err(MalformedProjectName(name)),
|
||||
false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
|
||||
true => Ok(name),
|
||||
}),
|
||||
}
|
||||
@@ -186,7 +191,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
|
||||
impl<'de> serde::de::Visitor<'de> for StrVisitor {
|
||||
type Value = IpPattern;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
|
||||
}
|
||||
|
||||
|
||||
@@ -173,9 +173,6 @@ struct ProxyCliArgs {
|
||||
/// cache for `role_secret` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
role_secret_cache: String,
|
||||
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_ip_check_for_http: bool,
|
||||
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
|
||||
#[clap(long)]
|
||||
redis_notifications: Option<String>,
|
||||
@@ -661,6 +658,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
accept_websockets: true,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
|
||||
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
|
||||
|
||||
2
proxy/src/cache/common.rs
vendored
2
proxy/src/cache/common.rs
vendored
@@ -24,7 +24,7 @@ impl<C: Cache> Cache for &C {
|
||||
type LookupInfo<Key> = C::LookupInfo<Key>;
|
||||
|
||||
fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
|
||||
C::invalidate(self, info)
|
||||
C::invalidate(self, info);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
2
proxy/src/cache/timed_lru.rs
vendored
2
proxy/src/cache/timed_lru.rs
vendored
@@ -58,7 +58,7 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
|
||||
type LookupInfo<Key> = LookupInfo<Key>;
|
||||
|
||||
fn invalidate(&self, info: &Self::LookupInfo<K>) {
|
||||
self.invalidate_raw(info)
|
||||
self.invalidate_raw(info);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -44,11 +44,10 @@ pub enum ConnectionError {
|
||||
|
||||
impl UserFacingError for ConnectionError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use ConnectionError::*;
|
||||
match self {
|
||||
// This helps us drop irrelevant library-specific prefixes.
|
||||
// TODO: propagate severity level and other parameters.
|
||||
Postgres(err) => match err.as_db_error() {
|
||||
ConnectionError::Postgres(err) => match err.as_db_error() {
|
||||
Some(err) => {
|
||||
let msg = err.message();
|
||||
|
||||
@@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError {
|
||||
}
|
||||
None => err.to_string(),
|
||||
},
|
||||
WakeComputeError(err) => err.to_string_client(),
|
||||
TooManyConnectionAttempts(_) => {
|
||||
ConnectionError::WakeComputeError(err) => err.to_string_client(),
|
||||
ConnectionError::TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
_ => COULD_NOT_CONNECT.to_owned(),
|
||||
@@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
|
||||
struct AcceptEverythingVerifier;
|
||||
impl ServerCertVerifier for AcceptEverythingVerifier {
|
||||
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
|
||||
use rustls::SignatureScheme::*;
|
||||
use rustls::SignatureScheme;
|
||||
// The schemes for which `SignatureScheme::supported_in_tls13` returns true.
|
||||
vec![
|
||||
ECDSA_NISTP521_SHA512,
|
||||
ECDSA_NISTP384_SHA384,
|
||||
ECDSA_NISTP256_SHA256,
|
||||
RSA_PSS_SHA512,
|
||||
RSA_PSS_SHA384,
|
||||
RSA_PSS_SHA256,
|
||||
ED25519,
|
||||
SignatureScheme::ECDSA_NISTP521_SHA512,
|
||||
SignatureScheme::ECDSA_NISTP384_SHA384,
|
||||
SignatureScheme::ECDSA_NISTP256_SHA256,
|
||||
SignatureScheme::RSA_PSS_SHA512,
|
||||
SignatureScheme::RSA_PSS_SHA384,
|
||||
SignatureScheme::RSA_PSS_SHA256,
|
||||
SignatureScheme::ED25519,
|
||||
]
|
||||
}
|
||||
fn verify_server_cert(
|
||||
|
||||
@@ -52,6 +52,7 @@ pub struct TlsConfig {
|
||||
}
|
||||
|
||||
pub struct HttpConfig {
|
||||
pub accept_websockets: bool,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
pub cancel_set: CancelSet,
|
||||
pub client_conn_threshold: u64,
|
||||
@@ -155,7 +156,7 @@ pub enum TlsServerEndPoint {
|
||||
}
|
||||
|
||||
impl TlsServerEndPoint {
|
||||
pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
|
||||
pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result<Self> {
|
||||
let sha256_oids = [
|
||||
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
|
||||
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
|
||||
@@ -278,7 +279,7 @@ impl CertResolver {
|
||||
impl rustls::server::ResolvesServerCert for CertResolver {
|
||||
fn resolve(
|
||||
&self,
|
||||
client_hello: rustls::server::ClientHello,
|
||||
client_hello: rustls::server::ClientHello<'_>,
|
||||
) -> Option<Arc<rustls::sign::CertifiedKey>> {
|
||||
self.resolve(client_hello.server_name()).map(|x| x.0)
|
||||
}
|
||||
@@ -559,7 +560,7 @@ impl RetryConfig {
|
||||
match key {
|
||||
"num_retries" => num_retries = Some(value.parse()?),
|
||||
"base_retry_wait_duration" => {
|
||||
base_retry_wait_duration = Some(humantime::parse_duration(value)?)
|
||||
base_retry_wait_duration = Some(humantime::parse_duration(value)?);
|
||||
}
|
||||
"retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
|
||||
unknown => bail!("unknown key: {unknown}"),
|
||||
|
||||
@@ -22,16 +22,15 @@ impl ConsoleError {
|
||||
self.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.error_info.as_ref())
|
||||
.map(|e| e.reason)
|
||||
.unwrap_or(Reason::Unknown)
|
||||
.map_or(Reason::Unknown, |e| e.reason)
|
||||
}
|
||||
|
||||
pub fn get_user_facing_message(&self) -> String {
|
||||
use super::provider::errors::REQUEST_FAILED;
|
||||
self.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.user_facing_message.as_ref())
|
||||
.map(|m| m.message.clone().into())
|
||||
.unwrap_or_else(|| {
|
||||
.map_or_else(|| {
|
||||
// Ask @neondatabase/control-plane for review before adding more.
|
||||
match self.http_status_code {
|
||||
http::StatusCode::NOT_FOUND => {
|
||||
@@ -48,19 +47,18 @@ impl ConsoleError {
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
})
|
||||
}, |m| m.message.clone().into())
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ConsoleError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let msg = self
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let msg: &str = self
|
||||
.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.user_facing_message.as_ref())
|
||||
.map(|m| m.message.as_ref())
|
||||
.unwrap_or_else(|| &self.error);
|
||||
write!(f, "{}", msg)
|
||||
.map_or_else(|| self.error.as_ref(), |m| m.message.as_ref());
|
||||
write!(f, "{msg}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -286,7 +284,7 @@ pub struct DatabaseInfo {
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
impl fmt::Debug for DatabaseInfo {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("DatabaseInfo")
|
||||
.field("host", &self.host)
|
||||
.field("port", &self.port)
|
||||
@@ -373,7 +371,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
});
|
||||
let _: KickSession = serde_json::from_str(&json.to_string())?;
|
||||
let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -93,7 +93,8 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
|
||||
}
|
||||
|
||||
fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
|
||||
let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
|
||||
let resp: KickSession<'_> =
|
||||
serde_json::from_str(query).context("Failed to parse query as json")?;
|
||||
|
||||
let span = info_span!("event", session_id = resp.session_id);
|
||||
let _enter = span.enter();
|
||||
|
||||
@@ -26,7 +26,7 @@ use tracing::info;
|
||||
pub mod errors {
|
||||
use crate::{
|
||||
console::messages::{self, ConsoleError, Reason},
|
||||
error::{io_error, ReportableError, UserFacingError},
|
||||
error::{io_error, ErrorKind, ReportableError, UserFacingError},
|
||||
proxy::retry::CouldRetry,
|
||||
};
|
||||
use thiserror::Error;
|
||||
@@ -51,21 +51,19 @@ pub mod errors {
|
||||
impl ApiError {
|
||||
/// Returns HTTP status code if it's the reason for failure.
|
||||
pub fn get_reason(&self) -> messages::Reason {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
Console(e) => e.get_reason(),
|
||||
_ => messages::Reason::Unknown,
|
||||
ApiError::Console(e) => e.get_reason(),
|
||||
ApiError::Transport(_) => messages::Reason::Unknown,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for ApiError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use ApiError::*;
|
||||
match self {
|
||||
// To minimize risks, only select errors are forwarded to users.
|
||||
Console(c) => c.get_user_facing_message(),
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
ApiError::Console(c) => c.get_user_facing_message(),
|
||||
ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -73,57 +71,53 @@ pub mod errors {
|
||||
impl ReportableError for ApiError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
ApiError::Console(e) => {
|
||||
use crate::error::ErrorKind::*;
|
||||
match e.get_reason() {
|
||||
Reason::RoleProtected => User,
|
||||
Reason::ResourceNotFound => User,
|
||||
Reason::ProjectNotFound => User,
|
||||
Reason::EndpointNotFound => User,
|
||||
Reason::BranchNotFound => User,
|
||||
Reason::RateLimitExceeded => ServiceRateLimit,
|
||||
Reason::NonDefaultBranchComputeTimeExceeded => User,
|
||||
Reason::ActiveTimeQuotaExceeded => User,
|
||||
Reason::ComputeTimeQuotaExceeded => User,
|
||||
Reason::WrittenDataQuotaExceeded => User,
|
||||
Reason::DataTransferQuotaExceeded => User,
|
||||
Reason::LogicalSizeQuotaExceeded => User,
|
||||
Reason::ConcurrencyLimitReached => ControlPlane,
|
||||
Reason::LockAlreadyTaken => ControlPlane,
|
||||
Reason::RunningOperations => ControlPlane,
|
||||
Reason::Unknown => match &e {
|
||||
ConsoleError {
|
||||
http_status_code:
|
||||
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
error,
|
||||
..
|
||||
} if error.contains(
|
||||
"compute time quota of non-primary branches is exceeded",
|
||||
) =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::LOCKED,
|
||||
error,
|
||||
..
|
||||
} if error.contains("quota exceeded")
|
||||
|| error.contains("the limit for current plan reached") =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
|
||||
..
|
||||
} => crate::error::ErrorKind::ServiceRateLimit,
|
||||
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
|
||||
},
|
||||
}
|
||||
}
|
||||
ApiError::Console(e) => match e.get_reason() {
|
||||
Reason::RoleProtected => ErrorKind::User,
|
||||
Reason::ResourceNotFound => ErrorKind::User,
|
||||
Reason::ProjectNotFound => ErrorKind::User,
|
||||
Reason::EndpointNotFound => ErrorKind::User,
|
||||
Reason::BranchNotFound => ErrorKind::User,
|
||||
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
|
||||
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User,
|
||||
Reason::ActiveTimeQuotaExceeded => ErrorKind::User,
|
||||
Reason::ComputeTimeQuotaExceeded => ErrorKind::User,
|
||||
Reason::WrittenDataQuotaExceeded => ErrorKind::User,
|
||||
Reason::DataTransferQuotaExceeded => ErrorKind::User,
|
||||
Reason::LogicalSizeQuotaExceeded => ErrorKind::User,
|
||||
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
|
||||
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
|
||||
Reason::RunningOperations => ErrorKind::ControlPlane,
|
||||
Reason::Unknown => match &e {
|
||||
ConsoleError {
|
||||
http_status_code:
|
||||
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
error,
|
||||
..
|
||||
} if error
|
||||
.contains("compute time quota of non-primary branches is exceeded") =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::LOCKED,
|
||||
error,
|
||||
..
|
||||
} if error.contains("quota exceeded")
|
||||
|| error.contains("the limit for current plan reached") =>
|
||||
{
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ConsoleError {
|
||||
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
|
||||
..
|
||||
} => crate::error::ErrorKind::ServiceRateLimit,
|
||||
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
|
||||
},
|
||||
},
|
||||
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
|
||||
}
|
||||
}
|
||||
@@ -170,12 +164,11 @@ pub mod errors {
|
||||
|
||||
impl UserFacingError for GetAuthInfoError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use GetAuthInfoError::*;
|
||||
match self {
|
||||
// We absolutely should not leak any secrets!
|
||||
BadSecret => REQUEST_FAILED.to_owned(),
|
||||
Self::BadSecret => REQUEST_FAILED.to_owned(),
|
||||
// However, API might return a meaningful error.
|
||||
ApiError(e) => e.to_string_client(),
|
||||
Self::ApiError(e) => e.to_string_client(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -183,8 +176,8 @@ pub mod errors {
|
||||
impl ReportableError for GetAuthInfoError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
|
||||
GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
|
||||
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
|
||||
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -213,17 +206,16 @@ pub mod errors {
|
||||
|
||||
impl UserFacingError for WakeComputeError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use WakeComputeError::*;
|
||||
match self {
|
||||
// We shouldn't show user the address even if it's broken.
|
||||
// Besides, user is unlikely to care about this detail.
|
||||
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
|
||||
Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
|
||||
// However, API might return a meaningful error.
|
||||
ApiError(e) => e.to_string_client(),
|
||||
Self::ApiError(e) => e.to_string_client(),
|
||||
|
||||
TooManyConnections => self.to_string(),
|
||||
Self::TooManyConnections => self.to_string(),
|
||||
|
||||
TooManyConnectionAttempts(_) => {
|
||||
Self::TooManyConnectionAttempts(_) => {
|
||||
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
|
||||
}
|
||||
}
|
||||
@@ -233,10 +225,10 @@ pub mod errors {
|
||||
impl ReportableError for WakeComputeError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
|
||||
WakeComputeError::ApiError(e) => e.get_error_kind(),
|
||||
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
|
||||
Self::ApiError(e) => e.get_error_kind(),
|
||||
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -244,10 +236,10 @@ pub mod errors {
|
||||
impl CouldRetry for WakeComputeError {
|
||||
fn could_retry(&self) -> bool {
|
||||
match self {
|
||||
WakeComputeError::BadComputeAddress(_) => false,
|
||||
WakeComputeError::ApiError(e) => e.could_retry(),
|
||||
WakeComputeError::TooManyConnections => false,
|
||||
WakeComputeError::TooManyConnectionAttempts(_) => false,
|
||||
Self::BadComputeAddress(_) => false,
|
||||
Self::ApiError(e) => e.could_retry(),
|
||||
Self::TooManyConnections => false,
|
||||
Self::TooManyConnectionAttempts(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -366,13 +358,14 @@ impl Api for ConsoleBackend {
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
match self {
|
||||
Console(api) => api.get_role_secret(ctx, user_info).await,
|
||||
Self::Console(api) => api.get_role_secret(ctx, user_info).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Postgres(api) => api.get_role_secret(ctx, user_info).await,
|
||||
Self::Postgres(api) => api.get_role_secret(ctx, user_info).await,
|
||||
#[cfg(test)]
|
||||
Test(_) => unreachable!("this function should never be called in the test backend"),
|
||||
Self::Test(_) => {
|
||||
unreachable!("this function should never be called in the test backend")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -381,13 +374,12 @@ impl Api for ConsoleBackend {
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
match self {
|
||||
Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
#[cfg(test)]
|
||||
Test(api) => api.get_allowed_ips_and_secret(),
|
||||
Self::Test(api) => api.get_allowed_ips_and_secret(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,14 +388,12 @@ impl Api for ConsoleBackend {
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
|
||||
use ConsoleBackend::*;
|
||||
|
||||
match self {
|
||||
Console(api) => api.wake_compute(ctx, user_info).await,
|
||||
Self::Console(api) => api.wake_compute(ctx, user_info).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Postgres(api) => api.wake_compute(ctx, user_info).await,
|
||||
Self::Postgres(api) => api.wake_compute(ctx, user_info).await,
|
||||
#[cfg(test)]
|
||||
Test(api) => api.wake_compute(),
|
||||
Self::Test(api) => api.wake_compute(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -549,7 +539,7 @@ impl WakeComputePermit {
|
||||
!self.permit.is_disabled()
|
||||
}
|
||||
pub fn release(self, outcome: Outcome) {
|
||||
self.permit.release(outcome)
|
||||
self.permit.release(outcome);
|
||||
}
|
||||
pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
|
||||
match res {
|
||||
|
||||
@@ -166,7 +166,7 @@ impl RequestMonitoring {
|
||||
pub fn set_project(&self, x: MetricsAuxInfo) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
if this.endpoint_id.is_none() {
|
||||
this.set_endpoint_id(x.endpoint_id.as_str().into())
|
||||
this.set_endpoint_id(x.endpoint_id.as_str().into());
|
||||
}
|
||||
this.branch = Some(x.branch_id);
|
||||
this.project = Some(x.project_id);
|
||||
@@ -260,7 +260,7 @@ impl RequestMonitoring {
|
||||
.cold_start_info
|
||||
}
|
||||
|
||||
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
|
||||
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
|
||||
LatencyTimerPause {
|
||||
ctx: self,
|
||||
start: tokio::time::Instant::now(),
|
||||
@@ -273,7 +273,7 @@ impl RequestMonitoring {
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.latency_timer
|
||||
.success()
|
||||
.success();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -328,7 +328,7 @@ impl RequestMonitoringInner {
|
||||
fn has_private_peer_addr(&self) -> bool {
|
||||
match self.peer_addr {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
_ => false,
|
||||
IpAddr::V6(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -736,7 +736,7 @@ mod tests {
|
||||
while let Some(r) = s.next().await {
|
||||
tx.send(r).unwrap();
|
||||
}
|
||||
time::sleep(time::Duration::from_secs(70)).await
|
||||
time::sleep(time::Duration::from_secs(70)).await;
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -6,6 +6,12 @@ pub mod health_server;
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use http_body_util::BodyExt;
|
||||
use hyper1::body::Body;
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
@@ -96,6 +102,33 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
|
||||
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
|
||||
limit: usize,
|
||||
) -> anyhow::Result<D> {
|
||||
// We could use `b.limited().collect().await.to_bytes()` here
|
||||
// but this ends up being slightly more efficient as far as I can tell.
|
||||
|
||||
// check the lower bound of the size hint.
|
||||
// in reqwest, this value is influenced by the Content-Length header.
|
||||
let lower_bound = match usize::try_from(b.size_hint().lower()) {
|
||||
Ok(bound) if bound <= limit => bound,
|
||||
_ => bail!("Content length exceeds limit of {limit} bytes"),
|
||||
};
|
||||
let mut bytes = Vec::with_capacity(lower_bound);
|
||||
|
||||
while let Some(frame) = b.frame().await.transpose()? {
|
||||
if let Ok(data) = frame.into_data() {
|
||||
if bytes.len() + data.len() > limit {
|
||||
bail!("Content length exceeds limit of {limit} bytes")
|
||||
}
|
||||
bytes.extend_from_slice(&data);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(serde_json::from_slice::<D>(&bytes)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
|
||||
impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
|
||||
type Value = InternedString<Id>;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
formatter.write_str("a string")
|
||||
}
|
||||
|
||||
|
||||
@@ -252,7 +252,7 @@ impl Drop for HttpEndpointPoolsGuard<'_> {
|
||||
}
|
||||
|
||||
impl HttpEndpointPools {
|
||||
pub fn guard(&self) -> HttpEndpointPoolsGuard {
|
||||
pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> {
|
||||
self.http_pool_endpoints_registered_total.inc();
|
||||
HttpEndpointPoolsGuard {
|
||||
dec: &self.http_pool_endpoints_unregistered_total,
|
||||
|
||||
@@ -184,7 +184,7 @@ impl CopyBuffer {
|
||||
}
|
||||
Poll::Pending
|
||||
}
|
||||
res => res.map_err(ErrorDirection::Write),
|
||||
res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -82,9 +82,8 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let mut stream = PqStream::new(Stream::from_raw(stream));
|
||||
loop {
|
||||
let msg = stream.read_startup_packet().await?;
|
||||
use FeStartupPacket::*;
|
||||
match msg {
|
||||
SslRequest { direct } => match stream.get_ref() {
|
||||
FeStartupPacket::SslRequest { direct } => match stream.get_ref() {
|
||||
Stream::Raw { .. } if !tried_ssl => {
|
||||
tried_ssl = true;
|
||||
|
||||
@@ -139,7 +138,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
let tls_stream = accept.await.inspect_err(|_| {
|
||||
if record_handshake_error {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc()
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
}
|
||||
})?;
|
||||
|
||||
@@ -182,7 +181,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
_ => return Err(HandshakeError::ProtocolViolation),
|
||||
},
|
||||
GssEncRequest => match stream.get_ref() {
|
||||
FeStartupPacket::GssEncRequest => match stream.get_ref() {
|
||||
Stream::Raw { .. } if !tried_gss => {
|
||||
tried_gss = true;
|
||||
|
||||
@@ -191,7 +190,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
_ => return Err(HandshakeError::ProtocolViolation),
|
||||
},
|
||||
StartupMessage { params, version }
|
||||
FeStartupPacket::StartupMessage { params, version }
|
||||
if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
|
||||
{
|
||||
// Check that the config has been consumed during upgrade
|
||||
@@ -211,7 +210,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
break Ok(HandshakeData::Startup(stream, params));
|
||||
}
|
||||
// downgrade protocol version
|
||||
StartupMessage { params, version }
|
||||
FeStartupPacket::StartupMessage { params, version }
|
||||
if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
|
||||
{
|
||||
warn!(?version, "unsupported minor version");
|
||||
@@ -241,7 +240,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
);
|
||||
break Ok(HandshakeData::Startup(stream, params));
|
||||
}
|
||||
StartupMessage { version, .. } => {
|
||||
FeStartupPacket::StartupMessage { version, .. } => {
|
||||
warn!(
|
||||
?version,
|
||||
session_type = "normal",
|
||||
@@ -249,7 +248,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
);
|
||||
return Err(HandshakeError::ProtocolViolation);
|
||||
}
|
||||
CancelRequest(cancel_key_data) => {
|
||||
FeStartupPacket::CancelRequest(cancel_key_data) => {
|
||||
info!(session_type = "cancellation", "successful handshake");
|
||||
break Ok(HandshakeData::Cancel(cancel_key_data));
|
||||
}
|
||||
|
||||
@@ -68,7 +68,7 @@ async fn proxy_mitm(
|
||||
end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap();
|
||||
continue;
|
||||
}
|
||||
end_client.send(message).await.unwrap()
|
||||
end_client.send(message).await.unwrap();
|
||||
}
|
||||
_ => break,
|
||||
}
|
||||
@@ -88,7 +88,7 @@ async fn proxy_mitm(
|
||||
end_server.send(buf.freeze()).await.unwrap();
|
||||
continue;
|
||||
}
|
||||
end_server.send(message).await.unwrap()
|
||||
end_server.send(message).await.unwrap();
|
||||
}
|
||||
_ => break,
|
||||
}
|
||||
|
||||
@@ -237,7 +237,7 @@ impl Token {
|
||||
}
|
||||
|
||||
pub fn release(mut self, outcome: Outcome) {
|
||||
self.release_mut(Some(outcome))
|
||||
self.release_mut(Some(outcome));
|
||||
}
|
||||
|
||||
pub fn release_mut(&mut self, outcome: Option<Outcome>) {
|
||||
@@ -249,7 +249,7 @@ impl Token {
|
||||
|
||||
impl Drop for Token {
|
||||
fn drop(&mut self) {
|
||||
self.release_mut(None)
|
||||
self.release_mut(None);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,9 +25,8 @@ pub struct Aimd {
|
||||
|
||||
impl LimitAlgorithm for Aimd {
|
||||
fn update(&self, old_limit: usize, sample: Sample) -> usize {
|
||||
use Outcome::*;
|
||||
match sample.outcome {
|
||||
Success => {
|
||||
Outcome::Success => {
|
||||
let utilisation = sample.in_flight as f32 / old_limit as f32;
|
||||
|
||||
if utilisation > self.utilisation {
|
||||
@@ -42,7 +41,7 @@ impl LimitAlgorithm for Aimd {
|
||||
old_limit
|
||||
}
|
||||
}
|
||||
Overload => {
|
||||
Outcome::Overload => {
|
||||
let limit = old_limit as f32 * self.dec;
|
||||
|
||||
// Floor instead of round, so the limit reduces even with small numbers.
|
||||
|
||||
@@ -98,7 +98,7 @@ impl ConnectionWithCredentialsProvider {
|
||||
info!("Establishing a new connection...");
|
||||
self.con = None;
|
||||
if let Some(f) = self.refresh_token_task.take() {
|
||||
f.abort()
|
||||
f.abort();
|
||||
}
|
||||
let mut con = self
|
||||
.get_client()
|
||||
|
||||
@@ -108,7 +108,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
}
|
||||
#[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
|
||||
async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
|
||||
use Notification::*;
|
||||
let payload: String = msg.get_payload()?;
|
||||
tracing::debug!(?payload, "received a message payload");
|
||||
|
||||
@@ -124,7 +123,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
};
|
||||
tracing::debug!(?msg, "received a message");
|
||||
match msg {
|
||||
Cancel(cancel_session) => {
|
||||
Notification::Cancel(cancel_session) => {
|
||||
tracing::Span::current().record(
|
||||
"session_id",
|
||||
tracing::field::display(cancel_session.session_id),
|
||||
@@ -153,12 +152,12 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
}
|
||||
_ => {
|
||||
invalidate_cache(self.cache.clone(), msg.clone());
|
||||
if matches!(msg, AllowedIpsUpdate { .. }) {
|
||||
if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
.inc(RedisEventsCount::AllowedIpsUpdate);
|
||||
} else if matches!(msg, PasswordUpdate { .. }) {
|
||||
} else if matches!(msg, Notification::PasswordUpdate { .. }) {
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.redis_events_count
|
||||
@@ -180,16 +179,16 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
}
|
||||
|
||||
fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
|
||||
use Notification::*;
|
||||
match msg {
|
||||
AllowedIpsUpdate { allowed_ips_update } => {
|
||||
cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
|
||||
Notification::AllowedIpsUpdate { allowed_ips_update } => {
|
||||
cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
|
||||
}
|
||||
PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
|
||||
password_update.project_id,
|
||||
password_update.role_name,
|
||||
),
|
||||
Cancel(_) => unreachable!("cancel message should be handled separately"),
|
||||
Notification::PasswordUpdate { password_update } => cache
|
||||
.invalidate_role_secret_for_project(
|
||||
password_update.project_id,
|
||||
password_update.role_name,
|
||||
),
|
||||
Notification::Cancel(_) => unreachable!("cancel message should be handled separately"),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -42,10 +42,9 @@ pub enum Error {
|
||||
|
||||
impl UserFacingError for Error {
|
||||
fn to_string_client(&self) -> String {
|
||||
use Error::*;
|
||||
match self {
|
||||
ChannelBindingFailed(m) => m.to_string(),
|
||||
ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
|
||||
Self::ChannelBindingFailed(m) => (*m).to_string(),
|
||||
Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
|
||||
_ => "authentication protocol violation".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,11 +13,10 @@ pub enum ChannelBinding<T> {
|
||||
|
||||
impl<T> ChannelBinding<T> {
|
||||
pub fn and_then<R, E>(self, f: impl FnOnce(T) -> Result<R, E>) -> Result<ChannelBinding<R>, E> {
|
||||
use ChannelBinding::*;
|
||||
Ok(match self {
|
||||
NotSupportedClient => NotSupportedClient,
|
||||
NotSupportedServer => NotSupportedServer,
|
||||
Required(x) => Required(f(x)?),
|
||||
Self::NotSupportedClient => ChannelBinding::NotSupportedClient,
|
||||
Self::NotSupportedServer => ChannelBinding::NotSupportedServer,
|
||||
Self::Required(x) => ChannelBinding::Required(f(x)?),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -25,11 +24,10 @@ impl<T> ChannelBinding<T> {
|
||||
impl<'a> ChannelBinding<&'a str> {
|
||||
// NB: FromStr doesn't work with lifetimes
|
||||
pub fn parse(input: &'a str) -> Option<Self> {
|
||||
use ChannelBinding::*;
|
||||
Some(match input {
|
||||
"n" => NotSupportedClient,
|
||||
"y" => NotSupportedServer,
|
||||
other => Required(other.strip_prefix("p=")?),
|
||||
"n" => Self::NotSupportedClient,
|
||||
"y" => Self::NotSupportedServer,
|
||||
other => Self::Required(other.strip_prefix("p=")?),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -40,17 +38,16 @@ impl<T: std::fmt::Display> ChannelBinding<T> {
|
||||
&self,
|
||||
get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
|
||||
) -> Result<std::borrow::Cow<'static, str>, E> {
|
||||
use ChannelBinding::*;
|
||||
Ok(match self {
|
||||
NotSupportedClient => {
|
||||
Self::NotSupportedClient => {
|
||||
// base64::encode("n,,")
|
||||
"biws".into()
|
||||
}
|
||||
NotSupportedServer => {
|
||||
Self::NotSupportedServer => {
|
||||
// base64::encode("y,,")
|
||||
"eSws".into()
|
||||
}
|
||||
Required(mode) => {
|
||||
Self::Required(mode) => {
|
||||
use std::io::Write;
|
||||
let mut cbind_input = vec![];
|
||||
write!(&mut cbind_input, "p={mode},,",).unwrap();
|
||||
|
||||
@@ -42,10 +42,9 @@ pub(super) enum ServerMessage<T> {
|
||||
|
||||
impl<'a> ServerMessage<&'a str> {
|
||||
pub(super) fn to_reply(&self) -> BeMessage<'a> {
|
||||
use BeAuthenticationSaslMessage::*;
|
||||
BeMessage::AuthenticationSasl(match self {
|
||||
ServerMessage::Continue(s) => Continue(s.as_bytes()),
|
||||
ServerMessage::Final(s) => Final(s.as_bytes()),
|
||||
ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()),
|
||||
ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,12 +137,12 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn round_trip() {
|
||||
run_round_trip_test("pencil", "pencil").await
|
||||
run_round_trip_test("pencil", "pencil").await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[should_panic(expected = "password doesn't match")]
|
||||
async fn failure() {
|
||||
run_round_trip_test("pencil", "eraser").await
|
||||
run_round_trip_test("pencil", "eraser").await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,8 +98,6 @@ mod tests {
|
||||
// q% of counts will be within p of the actual value
|
||||
let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
|
||||
|
||||
dbg!(sketch.buckets.len());
|
||||
|
||||
// insert a bunch of entries in a random order
|
||||
let mut ids2 = ids.clone();
|
||||
while !ids2.is_empty() {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user