mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-13 13:20:38 +00:00
Compare commits
132 Commits
release-pr
...
hackathon/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
05a8ec269a | ||
|
|
fcab61bdcd | ||
|
|
9e3ead3689 | ||
|
|
8dc069037b | ||
|
|
0a363c3dce | ||
|
|
aeca15008c | ||
|
|
43846b72fa | ||
|
|
cb060548fb | ||
|
|
bae793ffcd | ||
|
|
26b5fcdc50 | ||
|
|
97582178cb | ||
|
|
842be0ba74 | ||
|
|
982b376ea2 | ||
|
|
e158df4e86 | ||
|
|
723c0971e8 | ||
|
|
c8f67eed8f | ||
|
|
2d885ac07a | ||
|
|
89c5e80b3f | ||
|
|
93ec7503e0 | ||
|
|
7d7d1f354b | ||
|
|
16c200d6d9 | ||
|
|
3dbd34aa78 | ||
|
|
fa3fc73c1b | ||
|
|
ac5815b594 | ||
|
|
30583cb626 | ||
|
|
c1a51416db | ||
|
|
8eab7009c1 | ||
|
|
11cf16e3f3 | ||
|
|
af6f63617e | ||
|
|
e287f36a05 | ||
|
|
cbcd4058ed | ||
|
|
e86fef05dd | ||
|
|
a1323231bc | ||
|
|
06e840b884 | ||
|
|
cf11c8ab6a | ||
|
|
04f99a87bf | ||
|
|
fd12dd942f | ||
|
|
ebddda5b7f | ||
|
|
efe03d5a1c | ||
|
|
850421ec06 | ||
|
|
6dfbf49128 | ||
|
|
708322ce3c | ||
|
|
99fa1c3600 | ||
|
|
0205ce1849 | ||
|
|
1a9b54f1d9 | ||
|
|
3f43823a9b | ||
|
|
a046717a24 | ||
|
|
7a1397cf37 | ||
|
|
75310fe441 | ||
|
|
ecfa3d9de9 | ||
|
|
3d9001d83f | ||
|
|
1a874a3e86 | ||
|
|
c4fe6641c1 | ||
|
|
c7187be8a1 | ||
|
|
83dd7f559c | ||
|
|
80512e2779 | ||
|
|
3916810f20 | ||
|
|
c43e664ff5 | ||
|
|
b37da32c6f | ||
|
|
3b317cae07 | ||
|
|
bf0531d107 | ||
|
|
15e90cc427 | ||
|
|
9746b6ea31 | ||
|
|
516ac0591e | ||
|
|
3ec785f30d | ||
|
|
05caaab850 | ||
|
|
cacb1ae333 | ||
|
|
df971f995c | ||
|
|
e58e045ebb | ||
|
|
20f82f9169 | ||
|
|
72aa6b02da | ||
|
|
022fad65eb | ||
|
|
8eaa8ad358 | ||
|
|
653a6532a2 | ||
|
|
18bfc43fa7 | ||
|
|
7ce49fe6e3 | ||
|
|
a8fbc63be2 | ||
|
|
96b5c4d33d | ||
|
|
c7481402a0 | ||
|
|
a644f01b6a | ||
|
|
c2f8fdccd7 | ||
|
|
cfa45ff5ee | ||
|
|
acc075071d | ||
|
|
9627747d35 | ||
|
|
63a0d0d039 | ||
|
|
793b5061ec | ||
|
|
a889a49e06 | ||
|
|
5eb7322d08 | ||
|
|
c0ba18a112 | ||
|
|
992a951b5e | ||
|
|
c5ef779801 | ||
|
|
2d10306f7a | ||
|
|
9b9f90c562 | ||
|
|
52cb33770b | ||
|
|
12850dd5e9 | ||
|
|
5d527133a3 | ||
|
|
09362b6363 | ||
|
|
7820c572e7 | ||
|
|
bf03713fa1 | ||
|
|
0f65684263 | ||
|
|
97241776aa | ||
|
|
2dd53e7ae0 | ||
|
|
d6eede515a | ||
|
|
d48229f50f | ||
|
|
cdfdcd3e5d | ||
|
|
06795c6b9a | ||
|
|
701cb61b57 | ||
|
|
0aa1450936 | ||
|
|
b65a95f12e | ||
|
|
c1cb7a0fa0 | ||
|
|
f4cac1f30f | ||
|
|
612b643315 | ||
|
|
bcc68a7866 | ||
|
|
73286e6b9f | ||
|
|
bc8cfe1b55 | ||
|
|
6a74bcadec | ||
|
|
e62cd9e121 | ||
|
|
e80ab8fd6a | ||
|
|
d8ca495eae | ||
|
|
dbdb8a1187 | ||
|
|
f7ab3ffcb7 | ||
|
|
2f8d548a12 | ||
|
|
66db381dc9 | ||
|
|
6744ed19d8 | ||
|
|
ae63ac7488 | ||
|
|
6eb638f4b3 | ||
|
|
7a485b599b | ||
|
|
b1c457898b | ||
|
|
1a9d559be8 | ||
|
|
0e6c0d47a5 | ||
|
|
d645645fab | ||
|
|
7c74112b2a |
1
.devcontainer/Dockerfile.devcontainer
Normal file
1
.devcontainer/Dockerfile.devcontainer
Normal file
@@ -0,0 +1 @@
|
|||||||
|
FROM neondatabase/build-tools:pinned
|
||||||
23
.devcontainer/devcontainer.json
Normal file
23
.devcontainer/devcontainer.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
// https://containers.dev/implementors/json_reference/
|
||||||
|
{
|
||||||
|
"name": "Neon",
|
||||||
|
"build": {
|
||||||
|
"context": "..",
|
||||||
|
"dockerfile": "Dockerfile.devcontainer"
|
||||||
|
},
|
||||||
|
|
||||||
|
"postCreateCommand": {
|
||||||
|
"build neon": "BUILD_TYPE=debug CARGO_BUILD_FLAGS='--features=testing' mold -run make -s -j`nproc`",
|
||||||
|
"install python deps": "./scripts/pysync"
|
||||||
|
},
|
||||||
|
|
||||||
|
"customizations": {
|
||||||
|
"vscode": {
|
||||||
|
"extensions": [
|
||||||
|
"charliermarsh.ruff",
|
||||||
|
"github.vscode-github-actions",
|
||||||
|
"rust-lang.rust-analyzer"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
6
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
6
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
|
||||||
|
blank_issues_enabled: true
|
||||||
|
contact_links:
|
||||||
|
- name: Feature request
|
||||||
|
url: https://console.neon.tech/app/projects?modal=feedback
|
||||||
|
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
|
||||||
7
.github/actionlint.yml
vendored
7
.github/actionlint.yml
vendored
@@ -7,6 +7,13 @@ self-hosted-runner:
|
|||||||
- small-arm64
|
- small-arm64
|
||||||
- us-east-2
|
- us-east-2
|
||||||
config-variables:
|
config-variables:
|
||||||
|
- AZURE_DEV_CLIENT_ID
|
||||||
|
- AZURE_DEV_REGISTRY_NAME
|
||||||
|
- AZURE_DEV_SUBSCRIPTION_ID
|
||||||
|
- AZURE_PROD_CLIENT_ID
|
||||||
|
- AZURE_PROD_REGISTRY_NAME
|
||||||
|
- AZURE_PROD_SUBSCRIPTION_ID
|
||||||
|
- AZURE_TENANT_ID
|
||||||
- BENCHMARK_PROJECT_ID_PUB
|
- BENCHMARK_PROJECT_ID_PUB
|
||||||
- BENCHMARK_PROJECT_ID_SUB
|
- BENCHMARK_PROJECT_ID_SUB
|
||||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ runs:
|
|||||||
if: inputs.build_type != 'remote'
|
if: inputs.build_type != 'remote'
|
||||||
uses: ./.github/actions/download
|
uses: ./.github/actions/download
|
||||||
with:
|
with:
|
||||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||||
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
||||||
prefix: latest
|
prefix: latest
|
||||||
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
||||||
@@ -211,13 +211,13 @@ runs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Upload compatibility snapshot
|
- name: Upload compatibility snapshot
|
||||||
if: github.ref_name == 'release'
|
# Note, that we use `github.base_ref` which is a target branch for a PR
|
||||||
|
if: github.event_name == 'pull_request' && github.base_ref == 'release'
|
||||||
uses: ./.github/actions/upload
|
uses: ./.github/actions/upload
|
||||||
with:
|
with:
|
||||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
|
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||||
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
|
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
|
||||||
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
|
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
|
||||||
prefix: latest
|
|
||||||
|
|
||||||
- name: Upload test results
|
- name: Upload test results
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
|
|||||||
@@ -216,8 +216,14 @@ jobs:
|
|||||||
#nextest does not yet support running doctests
|
#nextest does not yet support running doctests
|
||||||
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||||
|
|
||||||
|
# run all non-pageserver tests
|
||||||
|
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
|
||||||
|
|
||||||
|
# run pageserver tests with different settings
|
||||||
for io_engine in std-fs tokio-epoll-uring ; do
|
for io_engine in std-fs tokio-epoll-uring ; do
|
||||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
for io_buffer_alignment in 0 1 512 ; do
|
||||||
|
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||||
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
# Run separate tests for real S3
|
# Run separate tests for real S3
|
||||||
|
|||||||
56
.github/workflows/_push-to-acr.yml
vendored
Normal file
56
.github/workflows/_push-to-acr.yml
vendored
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
name: Push images to ACR
|
||||||
|
on:
|
||||||
|
workflow_call:
|
||||||
|
inputs:
|
||||||
|
client_id:
|
||||||
|
description: Client ID of Azure managed identity or Entra app
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
image_tag:
|
||||||
|
description: Tag for the container image
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
images:
|
||||||
|
description: Images to push
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
registry_name:
|
||||||
|
description: Name of the container registry
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
subscription_id:
|
||||||
|
description: Azure subscription ID
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
tenant_id:
|
||||||
|
description: Azure tenant ID
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
push-to-acr:
|
||||||
|
runs-on: ubuntu-22.04
|
||||||
|
permissions:
|
||||||
|
contents: read # This is required for actions/checkout
|
||||||
|
id-token: write # This is required for Azure Login to work.
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Azure login
|
||||||
|
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||||
|
with:
|
||||||
|
client-id: ${{ inputs.client_id }}
|
||||||
|
subscription-id: ${{ inputs.subscription_id }}
|
||||||
|
tenant-id: ${{ inputs.tenant_id }}
|
||||||
|
|
||||||
|
- name: Login to ACR
|
||||||
|
run: |
|
||||||
|
az acr login --name=${{ inputs.registry_name }}
|
||||||
|
|
||||||
|
- name: Copy docker images to ACR ${{ inputs.registry_name }}
|
||||||
|
run: |
|
||||||
|
images='${{ inputs.images }}'
|
||||||
|
for image in ${images}; do
|
||||||
|
docker buildx imagetools create \
|
||||||
|
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
|
||||||
|
neondatabase/${image}:${{ inputs.image_tag }}
|
||||||
|
done
|
||||||
151
.github/workflows/build_and_test.yml
vendored
151
.github/workflows/build_and_test.yml
vendored
@@ -286,6 +286,7 @@ jobs:
|
|||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||||
|
SYNC_AFTER_EACH_TEST: true
|
||||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||||
# while coverage is currently collected for the debug ones
|
# while coverage is currently collected for the debug ones
|
||||||
|
|
||||||
@@ -793,9 +794,6 @@ jobs:
|
|||||||
docker compose -f ./docker-compose/docker-compose.yml down
|
docker compose -f ./docker-compose/docker-compose.yml down
|
||||||
|
|
||||||
promote-images:
|
promote-images:
|
||||||
permissions:
|
|
||||||
contents: read # This is required for actions/checkout
|
|
||||||
id-token: write # This is required for Azure Login to work.
|
|
||||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
@@ -822,28 +820,6 @@ jobs:
|
|||||||
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||||
done
|
done
|
||||||
|
|
||||||
- name: Azure login
|
|
||||||
if: github.ref_name == 'main'
|
|
||||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
|
||||||
with:
|
|
||||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
|
||||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
|
||||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
|
||||||
|
|
||||||
- name: Login to ACR
|
|
||||||
if: github.ref_name == 'main'
|
|
||||||
run: |
|
|
||||||
az acr login --name=neoneastus2
|
|
||||||
|
|
||||||
- name: Copy docker images to ACR-dev
|
|
||||||
if: github.ref_name == 'main'
|
|
||||||
run: |
|
|
||||||
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
|
|
||||||
docker buildx imagetools create \
|
|
||||||
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
|
|
||||||
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Add latest tag to images
|
- name: Add latest tag to images
|
||||||
if: github.ref_name == 'main'
|
if: github.ref_name == 'main'
|
||||||
run: |
|
run: |
|
||||||
@@ -881,6 +857,30 @@ jobs:
|
|||||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
|
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
|
||||||
done
|
done
|
||||||
|
|
||||||
|
push-to-acr-dev:
|
||||||
|
if: github.ref_name == 'main'
|
||||||
|
needs: [ tag, promote-images ]
|
||||||
|
uses: ./.github/workflows/_push-to-acr.yml
|
||||||
|
with:
|
||||||
|
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
|
||||||
|
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||||
|
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
|
||||||
|
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
|
||||||
|
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||||
|
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||||
|
|
||||||
|
push-to-acr-prod:
|
||||||
|
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||||
|
needs: [ tag, promote-images ]
|
||||||
|
uses: ./.github/workflows/_push-to-acr.yml
|
||||||
|
with:
|
||||||
|
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
|
||||||
|
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||||
|
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
|
||||||
|
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
|
||||||
|
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
|
||||||
|
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||||
|
|
||||||
trigger-custom-extensions-build-and-wait:
|
trigger-custom-extensions-build-and-wait:
|
||||||
needs: [ check-permissions, tag ]
|
needs: [ check-permissions, tag ]
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
@@ -956,8 +956,8 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
|
|
||||||
deploy:
|
deploy:
|
||||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
|
||||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
|
||||||
|
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, small ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||||
@@ -1055,43 +1055,88 @@ jobs:
|
|||||||
generate_release_notes: true,
|
generate_release_notes: true,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||||
promote-compatibility-data:
|
promote-compatibility-data:
|
||||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
|
needs: [ deploy ]
|
||||||
if: github.ref_name == 'release'
|
if: github.ref_name == 'release'
|
||||||
|
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: ubuntu-22.04
|
||||||
container:
|
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
|
||||||
options: --init
|
|
||||||
steps:
|
steps:
|
||||||
- name: Promote compatibility snapshot for the release
|
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
|
||||||
|
id: fetch-last-release-pr-info
|
||||||
|
env:
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
run: |
|
||||||
|
branch_name_and_pr_number=$(gh pr list \
|
||||||
|
--repo "${GITHUB_REPOSITORY}" \
|
||||||
|
--base release \
|
||||||
|
--state merged \
|
||||||
|
--limit 10 \
|
||||||
|
--json mergeCommit,headRefName,number \
|
||||||
|
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
|
||||||
|
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
|
||||||
|
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
|
||||||
|
|
||||||
|
run_id=$(gh run list \
|
||||||
|
--repo "${GITHUB_REPOSITORY}" \
|
||||||
|
--workflow build_and_test.yml \
|
||||||
|
--branch "${branch_name}" \
|
||||||
|
--json databaseId \
|
||||||
|
--limit 1 \
|
||||||
|
--jq '.[].databaseId')
|
||||||
|
|
||||||
|
last_commit_sha=$(gh pr view "${pr_number}" \
|
||||||
|
--repo "${GITHUB_REPOSITORY}" \
|
||||||
|
--json commits \
|
||||||
|
--jq '.commits[-1].oid')
|
||||||
|
|
||||||
|
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
|
||||||
|
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
|
||||||
|
|
||||||
|
- name: Promote compatibility snapshot and Neon artifact
|
||||||
env:
|
env:
|
||||||
BUCKET: neon-github-public-dev
|
BUCKET: neon-github-public-dev
|
||||||
PREFIX: artifacts/latest
|
AWS_REGION: eu-central-1
|
||||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
|
||||||
|
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
|
||||||
run: |
|
run: |
|
||||||
# Update compatibility snapshot for the release
|
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
|
||||||
for pg_version in v14 v15 v16; do
|
new_prefix="artifacts/latest"
|
||||||
for build_type in debug release; do
|
|
||||||
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
|
|
||||||
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
|
|
||||||
|
|
||||||
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
|
files_to_promote=()
|
||||||
|
files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
|
||||||
|
|
||||||
|
for arch in X64 ARM64; do
|
||||||
|
for build_type in debug release; do
|
||||||
|
neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
|
||||||
|
s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
|
||||||
|
if [ -z "${s3_key}" ]; then
|
||||||
|
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
||||||
|
|
||||||
|
for pg_version in v14 v15 v16; do
|
||||||
|
# We run less tests for debug builds, so we don't need to promote them
|
||||||
|
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
|
||||||
|
s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
|
||||||
|
if [ -z "${s3_key}" ]; then
|
||||||
|
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
||||||
|
done
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
# Update Neon artifact for the release (reuse already uploaded artifact)
|
for f in "${files_to_promote[@]}"; do
|
||||||
for build_type in debug release; do
|
time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
|
||||||
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
|
|
||||||
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
|
|
||||||
|
|
||||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
|
||||||
if [ -z "${S3_KEY}" ]; then
|
|
||||||
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
|
|
||||||
done
|
done
|
||||||
|
|
||||||
pin-build-tools-image:
|
pin-build-tools-image:
|
||||||
|
|||||||
34
.github/workflows/label-for-external-users.yml
vendored
34
.github/workflows/label-for-external-users.yml
vendored
@@ -7,6 +7,11 @@ on:
|
|||||||
pull_request_target:
|
pull_request_target:
|
||||||
types:
|
types:
|
||||||
- opened
|
- opened
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
github-actor:
|
||||||
|
description: 'GitHub username. If empty, the username of the current user will be used'
|
||||||
|
required: false
|
||||||
|
|
||||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||||
permissions: {}
|
permissions: {}
|
||||||
@@ -26,12 +31,31 @@ jobs:
|
|||||||
id: check-user
|
id: check-user
|
||||||
env:
|
env:
|
||||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||||
|
ACTOR: ${{ inputs.github-actor || github.actor }}
|
||||||
run: |
|
run: |
|
||||||
if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
|
expected_error="User does not exist or is not a member of the organization"
|
||||||
is_member=true
|
output_file=output.txt
|
||||||
else
|
|
||||||
is_member=false
|
for i in $(seq 1 10); do
|
||||||
fi
|
if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
|
||||||
|
-H "Accept: application/vnd.github+json" \
|
||||||
|
-H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
|
||||||
|
|
||||||
|
is_member=true
|
||||||
|
break
|
||||||
|
elif grep -q "${expected_error}" ${output_file}; then
|
||||||
|
is_member=false
|
||||||
|
break
|
||||||
|
elif [ $i -eq 10 ]; then
|
||||||
|
title="Failed to get memmbership status for ${ACTOR}"
|
||||||
|
message="The latest GitHub API error message: '$(cat ${output_file})'"
|
||||||
|
echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
|
||||||
|
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
|
||||||
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
||||||
|
|
||||||
|
|||||||
203
Cargo.lock
generated
203
Cargo.lock
generated
@@ -915,27 +915,30 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bindgen"
|
name = "bindgen"
|
||||||
version = "0.65.1"
|
version = "0.70.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
|
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 1.3.2",
|
"bitflags 2.4.1",
|
||||||
"cexpr",
|
"cexpr",
|
||||||
"clang-sys",
|
"clang-sys",
|
||||||
"lazy_static",
|
"itertools 0.12.1",
|
||||||
"lazycell",
|
|
||||||
"log",
|
"log",
|
||||||
"peeking_take_while",
|
"prettyplease 0.2.17",
|
||||||
"prettyplease 0.2.6",
|
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"regex",
|
"regex",
|
||||||
"rustc-hash",
|
"rustc-hash",
|
||||||
"shlex",
|
"shlex",
|
||||||
"syn 2.0.52",
|
"syn 2.0.52",
|
||||||
"which",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bit_field"
|
||||||
|
version = "0.10.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
version = "1.3.2"
|
version = "1.3.2"
|
||||||
@@ -1186,9 +1189,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "comfy-table"
|
name = "comfy-table"
|
||||||
version = "6.1.4"
|
version = "7.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
|
checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"crossterm",
|
"crossterm",
|
||||||
"strum",
|
"strum",
|
||||||
@@ -1243,7 +1246,7 @@ dependencies = [
|
|||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-opentelemetry",
|
"tracing-opentelemetry",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
@@ -1327,7 +1330,6 @@ name = "control_plane"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
|
||||||
"camino",
|
"camino",
|
||||||
"clap",
|
"clap",
|
||||||
"comfy-table",
|
"comfy-table",
|
||||||
@@ -1358,8 +1360,8 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml 0.7.4",
|
"toml",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
@@ -1483,25 +1485,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossterm"
|
name = "crossterm"
|
||||||
version = "0.25.0"
|
version = "0.27.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
|
checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 1.3.2",
|
"bitflags 2.4.1",
|
||||||
"crossterm_winapi",
|
"crossterm_winapi",
|
||||||
"libc",
|
"libc",
|
||||||
"mio",
|
|
||||||
"parking_lot 0.12.1",
|
"parking_lot 0.12.1",
|
||||||
"signal-hook",
|
|
||||||
"signal-hook-mio",
|
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossterm_winapi"
|
name = "crossterm_winapi"
|
||||||
version = "0.9.0"
|
version = "0.9.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
|
checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
@@ -1672,9 +1671,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "diesel"
|
name = "diesel"
|
||||||
version = "2.2.1"
|
version = "2.2.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.4.1",
|
"bitflags 2.4.1",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -2722,6 +2721,12 @@ dependencies = [
|
|||||||
"hashbrown 0.14.5",
|
"hashbrown 0.14.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "indoc"
|
||||||
|
version = "2.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "infer"
|
name = "infer"
|
||||||
version = "0.2.3"
|
version = "0.2.3"
|
||||||
@@ -2938,23 +2943,6 @@ dependencies = [
|
|||||||
"spin 0.5.2",
|
"spin 0.5.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "lazycell"
|
|
||||||
version = "1.3.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "leaky-bucket"
|
|
||||||
version = "1.0.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
|
|
||||||
dependencies = [
|
|
||||||
"parking_lot 0.12.1",
|
|
||||||
"tokio",
|
|
||||||
"tracing",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.150"
|
version = "0.2.150"
|
||||||
@@ -3153,7 +3141,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
|
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
"toml 0.8.14",
|
"toml",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3669,7 +3657,7 @@ dependencies = [
|
|||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
@@ -3683,6 +3671,7 @@ dependencies = [
|
|||||||
"async-compression",
|
"async-compression",
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
"bit_field",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"camino",
|
"camino",
|
||||||
@@ -3706,8 +3695,8 @@ dependencies = [
|
|||||||
"humantime",
|
"humantime",
|
||||||
"humantime-serde",
|
"humantime-serde",
|
||||||
"hyper 0.14.26",
|
"hyper 0.14.26",
|
||||||
|
"indoc",
|
||||||
"itertools 0.10.5",
|
"itertools 0.10.5",
|
||||||
"leaky-bucket",
|
|
||||||
"md5",
|
"md5",
|
||||||
"metrics",
|
"metrics",
|
||||||
"nix 0.27.1",
|
"nix 0.27.1",
|
||||||
@@ -3732,6 +3721,7 @@ dependencies = [
|
|||||||
"reqwest 0.12.4",
|
"reqwest 0.12.4",
|
||||||
"rpds",
|
"rpds",
|
||||||
"scopeguard",
|
"scopeguard",
|
||||||
|
"send-future",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_path_to_error",
|
"serde_path_to_error",
|
||||||
@@ -3754,7 +3744,7 @@ dependencies = [
|
|||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-tar",
|
"tokio-tar",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"twox-hash",
|
"twox-hash",
|
||||||
"url",
|
"url",
|
||||||
@@ -3771,6 +3761,7 @@ dependencies = [
|
|||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
"camino",
|
||||||
"chrono",
|
"chrono",
|
||||||
"const_format",
|
"const_format",
|
||||||
"enum-map",
|
"enum-map",
|
||||||
@@ -3778,11 +3769,16 @@ dependencies = [
|
|||||||
"humantime",
|
"humantime",
|
||||||
"humantime-serde",
|
"humantime-serde",
|
||||||
"itertools 0.10.5",
|
"itertools 0.10.5",
|
||||||
|
"nix 0.27.1",
|
||||||
|
"postgres_backend",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
|
"remote_storage",
|
||||||
|
"reqwest 0.12.4",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_with",
|
"serde_with",
|
||||||
|
"storage_broker",
|
||||||
"strum",
|
"strum",
|
||||||
"strum_macros",
|
"strum_macros",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
@@ -3794,7 +3790,6 @@ name = "pageserver_client"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures",
|
"futures",
|
||||||
"pageserver_api",
|
"pageserver_api",
|
||||||
@@ -3912,8 +3907,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parquet"
|
name = "parquet"
|
||||||
version = "51.0.0"
|
version = "53.0.0"
|
||||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ahash",
|
"ahash",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -3932,8 +3928,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parquet_derive"
|
name = "parquet_derive"
|
||||||
version = "51.0.0"
|
version = "53.0.0"
|
||||||
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"parquet",
|
"parquet",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
@@ -3970,12 +3967,6 @@ dependencies = [
|
|||||||
"sha2",
|
"sha2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "peeking_take_while"
|
|
||||||
version = "0.1.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pem"
|
name = "pem"
|
||||||
version = "3.0.3"
|
version = "3.0.3"
|
||||||
@@ -4129,7 +4120,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres"
|
name = "postgres"
|
||||||
version = "0.19.4"
|
version = "0.19.4"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
@@ -4142,7 +4133,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-protocol"
|
name = "postgres-protocol"
|
||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.20.0",
|
"base64 0.20.0",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -4161,7 +4152,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-types"
|
name = "postgres-types"
|
||||||
version = "0.2.4"
|
version = "0.2.4"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
@@ -4273,9 +4264,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "prettyplease"
|
name = "prettyplease"
|
||||||
version = "0.2.6"
|
version = "0.2.17"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
|
checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"syn 2.0.52",
|
"syn 2.0.52",
|
||||||
@@ -4820,7 +4811,7 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"utils",
|
"utils",
|
||||||
]
|
]
|
||||||
@@ -5330,7 +5321,7 @@ dependencies = [
|
|||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-tar",
|
"tokio-tar",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"url",
|
"url",
|
||||||
@@ -5455,6 +5446,12 @@ version = "1.0.17"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "send-future"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sentry"
|
name = "sentry"
|
||||||
version = "0.32.3"
|
version = "0.32.3"
|
||||||
@@ -5590,11 +5587,12 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_json"
|
name = "serde_json"
|
||||||
version = "1.0.96"
|
version = "1.0.125"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
|
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"itoa",
|
"itoa",
|
||||||
|
"memchr",
|
||||||
"ryu",
|
"ryu",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
@@ -5732,17 +5730,6 @@ dependencies = [
|
|||||||
"signal-hook-registry",
|
"signal-hook-registry",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "signal-hook-mio"
|
|
||||||
version = "0.2.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
|
|
||||||
dependencies = [
|
|
||||||
"libc",
|
|
||||||
"mio",
|
|
||||||
"signal-hook",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "signal-hook-registry"
|
name = "signal-hook-registry"
|
||||||
version = "1.4.1"
|
version = "1.4.1"
|
||||||
@@ -5949,7 +5936,6 @@ name = "storage_controller_client"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures",
|
"futures",
|
||||||
"pageserver_api",
|
"pageserver_api",
|
||||||
@@ -6056,21 +6042,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strum"
|
name = "strum"
|
||||||
version = "0.24.1"
|
version = "0.26.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
|
checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strum_macros"
|
name = "strum_macros"
|
||||||
version = "0.24.3"
|
version = "0.26.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
|
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"heck 0.4.1",
|
"heck 0.5.0",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"rustversion",
|
"rustversion",
|
||||||
"syn 1.0.109",
|
"syn 2.0.52",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6081,8 +6067,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "svg_fmt"
|
name = "svg_fmt"
|
||||||
version = "0.4.2"
|
version = "0.4.3"
|
||||||
source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
@@ -6410,7 +6397,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-postgres"
|
name = "tokio-postgres"
|
||||||
version = "0.7.7"
|
version = "0.7.7"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -6521,18 +6508,6 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "toml"
|
|
||||||
version = "0.7.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
|
|
||||||
dependencies = [
|
|
||||||
"serde",
|
|
||||||
"serde_spanned",
|
|
||||||
"toml_datetime",
|
|
||||||
"toml_edit 0.19.10",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "toml"
|
name = "toml"
|
||||||
version = "0.8.14"
|
version = "0.8.14"
|
||||||
@@ -6542,7 +6517,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_spanned",
|
"serde_spanned",
|
||||||
"toml_datetime",
|
"toml_datetime",
|
||||||
"toml_edit 0.22.14",
|
"toml_edit",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6554,19 +6529,6 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "toml_edit"
|
|
||||||
version = "0.19.10"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
|
|
||||||
dependencies = [
|
|
||||||
"indexmap 1.9.3",
|
|
||||||
"serde",
|
|
||||||
"serde_spanned",
|
|
||||||
"toml_datetime",
|
|
||||||
"winnow 0.4.6",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "toml_edit"
|
name = "toml_edit"
|
||||||
version = "0.22.14"
|
version = "0.22.14"
|
||||||
@@ -6577,7 +6539,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_spanned",
|
"serde_spanned",
|
||||||
"toml_datetime",
|
"toml_datetime",
|
||||||
"winnow 0.6.13",
|
"winnow",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6952,7 +6914,6 @@ dependencies = [
|
|||||||
"anyhow",
|
"anyhow",
|
||||||
"arc-swap",
|
"arc-swap",
|
||||||
"async-compression",
|
"async-compression",
|
||||||
"async-trait",
|
|
||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -6968,7 +6929,6 @@ dependencies = [
|
|||||||
"humantime",
|
"humantime",
|
||||||
"hyper 0.14.26",
|
"hyper 0.14.26",
|
||||||
"jsonwebtoken",
|
"jsonwebtoken",
|
||||||
"leaky-bucket",
|
|
||||||
"metrics",
|
"metrics",
|
||||||
"nix 0.27.1",
|
"nix 0.27.1",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@@ -6992,7 +6952,7 @@ dependencies = [
|
|||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-tar",
|
"tokio-tar",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit 0.19.10",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-error",
|
"tracing-error",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
@@ -7538,15 +7498,6 @@ version = "0.52.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "winnow"
|
|
||||||
version = "0.4.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
|
|
||||||
dependencies = [
|
|
||||||
"memchr",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winnow"
|
name = "winnow"
|
||||||
version = "0.6.13"
|
version = "0.6.13"
|
||||||
@@ -7616,6 +7567,7 @@ dependencies = [
|
|||||||
"hyper 0.14.26",
|
"hyper 0.14.26",
|
||||||
"indexmap 1.9.3",
|
"indexmap 1.9.3",
|
||||||
"itertools 0.10.5",
|
"itertools 0.10.5",
|
||||||
|
"itertools 0.12.1",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"libc",
|
"libc",
|
||||||
"log",
|
"log",
|
||||||
@@ -7653,6 +7605,7 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls 0.24.0",
|
"tokio-rustls 0.24.0",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
|
"toml_edit",
|
||||||
"tonic",
|
"tonic",
|
||||||
"tower",
|
"tower",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
|||||||
50
Cargo.toml
50
Cargo.toml
@@ -64,7 +64,8 @@ aws-types = "1.2.0"
|
|||||||
axum = { version = "0.6.20", features = ["ws"] }
|
axum = { version = "0.6.20", features = ["ws"] }
|
||||||
base64 = "0.13.0"
|
base64 = "0.13.0"
|
||||||
bincode = "1.3"
|
bincode = "1.3"
|
||||||
bindgen = "0.65"
|
bindgen = "0.70"
|
||||||
|
bit_field = "0.10.2"
|
||||||
bstr = "1.0"
|
bstr = "1.0"
|
||||||
byteorder = "1.4"
|
byteorder = "1.4"
|
||||||
bytes = "1.0"
|
bytes = "1.0"
|
||||||
@@ -72,7 +73,7 @@ camino = "1.1.6"
|
|||||||
cfg-if = "1.0.0"
|
cfg-if = "1.0.0"
|
||||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||||
clap = { version = "4.0", features = ["derive"] }
|
clap = { version = "4.0", features = ["derive"] }
|
||||||
comfy-table = "6.1"
|
comfy-table = "7.1"
|
||||||
const_format = "0.2"
|
const_format = "0.2"
|
||||||
crc32c = "0.6"
|
crc32c = "0.6"
|
||||||
crossbeam-deque = "0.8.5"
|
crossbeam-deque = "0.8.5"
|
||||||
@@ -102,18 +103,18 @@ humantime-serde = "1.1.1"
|
|||||||
hyper = "0.14"
|
hyper = "0.14"
|
||||||
tokio-tungstenite = "0.20.0"
|
tokio-tungstenite = "0.20.0"
|
||||||
indexmap = "2"
|
indexmap = "2"
|
||||||
|
indoc = "2"
|
||||||
inotify = "0.10.2"
|
inotify = "0.10.2"
|
||||||
ipnet = "2.9.0"
|
ipnet = "2.9.0"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
jsonwebtoken = "9"
|
jsonwebtoken = "9"
|
||||||
lasso = "0.7"
|
lasso = "0.7"
|
||||||
leaky-bucket = "1.0.1"
|
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
measured = { version = "0.0.22", features=["lasso"] }
|
measured = { version = "0.0.22", features=["lasso"] }
|
||||||
measured-process = { version = "0.0.22" }
|
measured-process = { version = "0.0.22" }
|
||||||
memoffset = "0.8"
|
memoffset = "0.8"
|
||||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||||
notify = "6.0.0"
|
notify = "6.0.0"
|
||||||
num_cpus = "1.15"
|
num_cpus = "1.15"
|
||||||
num-traits = "0.2.15"
|
num-traits = "0.2.15"
|
||||||
@@ -122,8 +123,8 @@ opentelemetry = "0.20.0"
|
|||||||
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||||
opentelemetry-semantic-conventions = "0.12.0"
|
opentelemetry-semantic-conventions = "0.12.0"
|
||||||
parking_lot = "0.12"
|
parking_lot = "0.12"
|
||||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||||
parquet_derive = "51.0.0"
|
parquet_derive = "53"
|
||||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||||
pin-project-lite = "0.2"
|
pin-project-lite = "0.2"
|
||||||
procfs = "0.16"
|
procfs = "0.16"
|
||||||
@@ -145,6 +146,7 @@ rustls-split = "0.3"
|
|||||||
scopeguard = "1.1"
|
scopeguard = "1.1"
|
||||||
sysinfo = "0.29.2"
|
sysinfo = "0.29.2"
|
||||||
sd-notify = "0.4.1"
|
sd-notify = "0.4.1"
|
||||||
|
send-future = "0.1.0"
|
||||||
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
@@ -156,11 +158,10 @@ signal-hook = "0.3"
|
|||||||
smallvec = "1.11"
|
smallvec = "1.11"
|
||||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||||
socket2 = "0.5"
|
socket2 = "0.5"
|
||||||
strum = "0.24"
|
strum = "0.26"
|
||||||
strum_macros = "0.24"
|
strum_macros = "0.26"
|
||||||
"subtle" = "2.5.0"
|
"subtle" = "2.5.0"
|
||||||
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
|
svg_fmt = "0.4.3"
|
||||||
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
|
|
||||||
sync_wrapper = "0.1.2"
|
sync_wrapper = "0.1.2"
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
task-local-extensions = "0.1.4"
|
task-local-extensions = "0.1.4"
|
||||||
@@ -176,8 +177,8 @@ tokio-rustls = "0.25"
|
|||||||
tokio-stream = "0.1"
|
tokio-stream = "0.1"
|
||||||
tokio-tar = "0.3"
|
tokio-tar = "0.3"
|
||||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||||
toml = "0.7"
|
toml = "0.8"
|
||||||
toml_edit = "0.19"
|
toml_edit = "0.22"
|
||||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||||
tower-service = "0.3.2"
|
tower-service = "0.3.2"
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
@@ -200,10 +201,21 @@ env_logger = "0.10"
|
|||||||
log = "0.4"
|
log = "0.4"
|
||||||
|
|
||||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
|
||||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
# We want to use the 'neon' branch for these, but there's currently one
|
||||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
# incompatible change on the branch. See:
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
#
|
||||||
|
# - PR #8076 which contained changes that depended on the new changes in
|
||||||
|
# the rust-postgres crate, and
|
||||||
|
# - PR #8654 which reverted those changes and made the code in proxy incompatible
|
||||||
|
# with the tip of the 'neon' branch again.
|
||||||
|
#
|
||||||
|
# When those proxy changes are re-applied (see PR #8747), we can switch using
|
||||||
|
# the tip of the 'neon' branch again.
|
||||||
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||||
|
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||||
|
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||||
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||||
|
|
||||||
## Local libraries
|
## Local libraries
|
||||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||||
@@ -240,11 +252,7 @@ tonic-build = "0.9"
|
|||||||
[patch.crates-io]
|
[patch.crates-io]
|
||||||
|
|
||||||
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||||
|
|
||||||
# bug fixes for UUID
|
|
||||||
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
|
||||||
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
|
||||||
|
|
||||||
################# Binary contents sections
|
################# Binary contents sections
|
||||||
|
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \
|
|||||||
"pg_distrib_dir='/usr/local/'\n" \
|
"pg_distrib_dir='/usr/local/'\n" \
|
||||||
"listen_pg_addr='0.0.0.0:6400'\n" \
|
"listen_pg_addr='0.0.0.0:6400'\n" \
|
||||||
"listen_http_addr='0.0.0.0:9898'\n" \
|
"listen_http_addr='0.0.0.0:9898'\n" \
|
||||||
|
"availability_zone='local'\n" \
|
||||||
> /data/.neon/pageserver.toml && \
|
> /data/.neon/pageserver.toml && \
|
||||||
chown -R neon:neon /data/.neon
|
chown -R neon:neon /data/.neon
|
||||||
|
|
||||||
|
|||||||
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
|
|||||||
|
|
||||||
# Rust
|
# Rust
|
||||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||||
ENV RUSTC_VERSION=1.80.1
|
ENV RUSTC_VERSION=1.81.0
|
||||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||||
ARG RUSTFILT_VERSION=0.2.1
|
ARG RUSTFILT_VERSION=0.2.1
|
||||||
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
|||||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||||
. "$HOME/.cargo/env" && \
|
. "$HOME/.cargo/env" && \
|
||||||
cargo --version && rustup --version && \
|
cargo --version && rustup --version && \
|
||||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
rustup component add llvm-tools rustfmt clippy && \
|
||||||
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
||||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
||||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||||
|
|||||||
@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
|||||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||||
COPY patches/pg_hintplan.patch /ext-src
|
COPY patches/pg_hint_plan.patch /ext-src
|
||||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||||
COPY patches/pg_cron.patch /ext-src
|
COPY patches/pg_cron.patch /ext-src
|
||||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||||
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
|||||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||||
# cmake is required for the h3 test
|
# cmake is required for the h3 test
|
||||||
RUN apt-get update && apt-get install -y cmake
|
RUN apt-get update && apt-get install -y cmake
|
||||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
|
||||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||||
RUN patch -p1 </ext-src/pg_anon.patch
|
RUN patch -p1 </ext-src/pg_anon.patch
|
||||||
RUN patch -p1 </ext-src/pg_cron.patch
|
RUN patch -p1 </ext-src/pg_cron.patch
|
||||||
|
|||||||
@@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config
|
|||||||
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
|
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you get errors about missing `m4` you may have to install it manually:
|
||||||
|
```
|
||||||
|
brew install m4
|
||||||
|
brew link --force m4
|
||||||
|
```
|
||||||
|
|
||||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||||
```
|
```
|
||||||
# recommended approach from https://www.rust-lang.org/tools/install
|
# recommended approach from https://www.rust-lang.org/tools/install
|
||||||
@@ -126,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
|||||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||||
|
|
||||||
To run the integration tests or Python scripts (not required to use the code), install
|
To run the integration tests or Python scripts (not required to use the code), install
|
||||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
|
||||||
|
|
||||||
|
|
||||||
#### Running neon database
|
#### Running neon database
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use clap::Arg;
|
use clap::Arg;
|
||||||
|
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
@@ -366,6 +367,8 @@ fn wait_spec(
|
|||||||
state.start_time = now;
|
state.start_time = now;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
launch_lsn_lease_bg_task_for_static(&compute);
|
||||||
|
|
||||||
Ok(WaitSpecResult {
|
Ok(WaitSpecResult {
|
||||||
compute,
|
compute,
|
||||||
http_port,
|
http_port,
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ pub mod logger;
|
|||||||
pub mod catalog;
|
pub mod catalog;
|
||||||
pub mod compute;
|
pub mod compute;
|
||||||
pub mod extension_server;
|
pub mod extension_server;
|
||||||
|
pub mod lsn_lease;
|
||||||
mod migration;
|
mod migration;
|
||||||
pub mod monitor;
|
pub mod monitor;
|
||||||
pub mod params;
|
pub mod params;
|
||||||
|
|||||||
186
compute_tools/src/lsn_lease.rs
Normal file
186
compute_tools/src/lsn_lease.rs
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
use anyhow::bail;
|
||||||
|
use anyhow::Result;
|
||||||
|
use postgres::{NoTls, SimpleQueryMessage};
|
||||||
|
use std::time::SystemTime;
|
||||||
|
use std::{str::FromStr, sync::Arc, thread, time::Duration};
|
||||||
|
use utils::id::TenantId;
|
||||||
|
use utils::id::TimelineId;
|
||||||
|
|
||||||
|
use compute_api::spec::ComputeMode;
|
||||||
|
use tracing::{info, warn};
|
||||||
|
use utils::{
|
||||||
|
lsn::Lsn,
|
||||||
|
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::compute::ComputeNode;
|
||||||
|
|
||||||
|
/// Spawns a background thread to periodically renew LSN leases for static compute.
|
||||||
|
/// Do nothing if the compute is not in static mode.
|
||||||
|
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
|
||||||
|
let (tenant_id, timeline_id, lsn) = {
|
||||||
|
let state = compute.state.lock().unwrap();
|
||||||
|
let spec = state.pspec.as_ref().expect("Spec must be set");
|
||||||
|
match spec.spec.mode {
|
||||||
|
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
|
||||||
|
_ => return,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let compute = compute.clone();
|
||||||
|
|
||||||
|
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
||||||
|
thread::spawn(move || {
|
||||||
|
let _entered = span.entered();
|
||||||
|
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
||||||
|
// TODO: might need stronger error feedback than logging an warning.
|
||||||
|
warn!("Exited with error: {e}");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Renews lsn lease periodically so static compute are not affected by GC.
|
||||||
|
fn lsn_lease_bg_task(
|
||||||
|
compute: Arc<ComputeNode>,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<()> {
|
||||||
|
loop {
|
||||||
|
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
|
||||||
|
let valid_duration = valid_until
|
||||||
|
.duration_since(SystemTime::now())
|
||||||
|
.unwrap_or(Duration::ZERO);
|
||||||
|
|
||||||
|
// Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
|
||||||
|
let sleep_duration = valid_duration
|
||||||
|
.saturating_sub(Duration::from_secs(60))
|
||||||
|
.max(valid_duration / 2);
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Succeeded, sleeping for {} seconds",
|
||||||
|
sleep_duration.as_secs()
|
||||||
|
);
|
||||||
|
thread::sleep(sleep_duration);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
|
||||||
|
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
|
||||||
|
fn acquire_lsn_lease_with_retry(
|
||||||
|
compute: &Arc<ComputeNode>,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<SystemTime> {
|
||||||
|
let mut attempts = 0usize;
|
||||||
|
let mut retry_period_ms: f64 = 500.0;
|
||||||
|
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||||
|
let configs = {
|
||||||
|
let state = compute.state.lock().unwrap();
|
||||||
|
|
||||||
|
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||||
|
|
||||||
|
let conn_strings = spec.pageserver_connstr.split(',');
|
||||||
|
|
||||||
|
conn_strings
|
||||||
|
.map(|connstr| {
|
||||||
|
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||||
|
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||||
|
info!("Got storage auth token from spec file");
|
||||||
|
config.password(storage_auth_token.clone());
|
||||||
|
} else {
|
||||||
|
info!("Storage auth token not set");
|
||||||
|
}
|
||||||
|
config
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
};
|
||||||
|
|
||||||
|
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
|
||||||
|
match result {
|
||||||
|
Ok(Some(res)) => {
|
||||||
|
return Ok(res);
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
||||||
|
|
||||||
|
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
||||||
|
retry_period_ms *= 1.5;
|
||||||
|
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
attempts += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tries to acquire an LSN lease through PS page_service API.
|
||||||
|
fn try_acquire_lsn_lease(
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
lsn: Lsn,
|
||||||
|
configs: &[postgres::Config],
|
||||||
|
) -> Result<Option<SystemTime>> {
|
||||||
|
fn get_valid_until(
|
||||||
|
config: &postgres::Config,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<Option<SystemTime>> {
|
||||||
|
let mut client = config.connect(NoTls)?;
|
||||||
|
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
|
||||||
|
let res = client.simple_query(&cmd)?;
|
||||||
|
let msg = match res.first() {
|
||||||
|
Some(msg) => msg,
|
||||||
|
None => bail!("empty response"),
|
||||||
|
};
|
||||||
|
let row = match msg {
|
||||||
|
SimpleQueryMessage::Row(row) => row,
|
||||||
|
_ => bail!("error parsing lsn lease response"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Note: this will be None if a lease is explicitly not granted.
|
||||||
|
let valid_until_str = row.get("valid_until");
|
||||||
|
|
||||||
|
let valid_until = valid_until_str.map(|s| {
|
||||||
|
SystemTime::UNIX_EPOCH
|
||||||
|
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
||||||
|
.expect("Time larger than max SystemTime could handle")
|
||||||
|
});
|
||||||
|
Ok(valid_until)
|
||||||
|
}
|
||||||
|
|
||||||
|
let shard_count = configs.len();
|
||||||
|
|
||||||
|
let valid_until = if shard_count > 1 {
|
||||||
|
configs
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(shard_number, config)| {
|
||||||
|
let tenant_shard_id = TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount::new(shard_count as u8),
|
||||||
|
shard_number: ShardNumber(shard_number as u8),
|
||||||
|
};
|
||||||
|
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
|
||||||
|
})
|
||||||
|
.collect::<Result<Vec<Option<SystemTime>>>>()?
|
||||||
|
.into_iter()
|
||||||
|
.min()
|
||||||
|
.unwrap()
|
||||||
|
} else {
|
||||||
|
get_valid_until(
|
||||||
|
&configs[0],
|
||||||
|
TenantShardId::unsharded(tenant_id),
|
||||||
|
timeline_id,
|
||||||
|
lsn,
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(valid_until)
|
||||||
|
}
|
||||||
@@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
|||||||
|
|
||||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||||
|
|
||||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
/// Escape a string for including it in a SQL literal.
|
||||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
///
|
||||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
/// Wrapping the result with `E'{}'` or `'{}'` is not required,
|
||||||
|
/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||||
/// for the original implementation.
|
/// for the original implementation.
|
||||||
pub fn escape_literal(s: &str) -> String {
|
pub fn escape_literal(s: &str) -> String {
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
async-trait.workspace = true
|
|
||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
comfy-table.workspace = true
|
comfy-table.workspace = true
|
||||||
|
|||||||
@@ -640,6 +640,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
}
|
}
|
||||||
Some(("branch", branch_match)) => {
|
Some(("branch", branch_match)) => {
|
||||||
let tenant_id = get_tenant_id(branch_match, env)?;
|
let tenant_id = get_tenant_id(branch_match, env)?;
|
||||||
|
let new_timeline_id =
|
||||||
|
parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
|
||||||
let new_branch_name = branch_match
|
let new_branch_name = branch_match
|
||||||
.get_one::<String>("branch-name")
|
.get_one::<String>("branch-name")
|
||||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||||
@@ -658,7 +660,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||||
let new_timeline_id = TimelineId::generate();
|
|
||||||
let storage_controller = StorageController::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let create_req = TimelineCreateRequest {
|
let create_req = TimelineCreateRequest {
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
@@ -1570,7 +1571,6 @@ fn cli() -> Command {
|
|||||||
.value_parser(value_parser!(PathBuf))
|
.value_parser(value_parser!(PathBuf))
|
||||||
.value_name("config")
|
.value_name("config")
|
||||||
)
|
)
|
||||||
.arg(pg_version_arg.clone())
|
|
||||||
.arg(force_arg)
|
.arg(force_arg)
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
@@ -1583,6 +1583,7 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("branch")
|
.subcommand(Command::new("branch")
|
||||||
.about("Create a new timeline, using another timeline as a base, copying its data")
|
.about("Create a new timeline, using another timeline as a base, copying its data")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(timeline_id_arg.clone())
|
||||||
.arg(branch_name_arg.clone())
|
.arg(branch_name_arg.clone())
|
||||||
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
|
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
|
||||||
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
|
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
|
||||||
|
|||||||
@@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf {
|
|||||||
pub split_threshold: Option<u64>,
|
pub split_threshold: Option<u64>,
|
||||||
|
|
||||||
pub max_secondary_lag_bytes: Option<u64>,
|
pub max_secondary_lag_bytes: Option<u64>,
|
||||||
|
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub heartbeat_interval: Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl NeonStorageControllerConf {
|
impl NeonStorageControllerConf {
|
||||||
@@ -172,6 +175,9 @@ impl NeonStorageControllerConf {
|
|||||||
const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||||
|
|
||||||
const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
|
const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
|
||||||
|
|
||||||
|
// Very tight heartbeat interval to speed up tests
|
||||||
|
const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for NeonStorageControllerConf {
|
impl Default for NeonStorageControllerConf {
|
||||||
@@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf {
|
|||||||
database_url: None,
|
database_url: None,
|
||||||
split_threshold: None,
|
split_threshold: None,
|
||||||
max_secondary_lag_bytes: None,
|
max_secondary_lag_bytes: None,
|
||||||
|
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -75,14 +75,14 @@ impl PageServerNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
|
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
|
||||||
toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
|
toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pageserver_init_make_toml(
|
fn pageserver_init_make_toml(
|
||||||
&self,
|
&self,
|
||||||
conf: NeonLocalInitPageserverConf,
|
conf: NeonLocalInitPageserverConf,
|
||||||
) -> anyhow::Result<toml_edit::Document> {
|
) -> anyhow::Result<toml_edit::DocumentMut> {
|
||||||
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
|
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
|
||||||
|
|
||||||
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
|
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
|
||||||
@@ -137,9 +137,9 @@ impl PageServerNode {
|
|||||||
|
|
||||||
// Turn `overrides` into a toml document.
|
// Turn `overrides` into a toml document.
|
||||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||||
let mut config_toml = toml_edit::Document::new();
|
let mut config_toml = toml_edit::DocumentMut::new();
|
||||||
for fragment_str in overrides {
|
for fragment_str in overrides {
|
||||||
let fragment = toml_edit::Document::from_str(&fragment_str)
|
let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
|
||||||
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
||||||
for (key, item) in fragment.iter() {
|
for (key, item) in fragment.iter() {
|
||||||
config_toml.insert(key, item.clone());
|
config_toml.insert(key, item.clone());
|
||||||
@@ -181,6 +181,23 @@ impl PageServerNode {
|
|||||||
);
|
);
|
||||||
io::stdout().flush()?;
|
io::stdout().flush()?;
|
||||||
|
|
||||||
|
// If the config file we got as a CLI argument includes the `availability_zone`
|
||||||
|
// config, then use that to populate the `metadata.json` file for the pageserver.
|
||||||
|
// In production the deployment orchestrator does this for us.
|
||||||
|
let az_id = conf
|
||||||
|
.other
|
||||||
|
.get("availability_zone")
|
||||||
|
.map(|toml| {
|
||||||
|
let az_str = toml.to_string();
|
||||||
|
// Trim the (") chars from the toml representation
|
||||||
|
if az_str.starts_with('"') && az_str.ends_with('"') {
|
||||||
|
az_str[1..az_str.len() - 1].to_string()
|
||||||
|
} else {
|
||||||
|
az_str
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.unwrap_or("local".to_string());
|
||||||
|
|
||||||
let config = self
|
let config = self
|
||||||
.pageserver_init_make_toml(conf)
|
.pageserver_init_make_toml(conf)
|
||||||
.context("make pageserver toml")?;
|
.context("make pageserver toml")?;
|
||||||
@@ -216,6 +233,7 @@ impl PageServerNode {
|
|||||||
let (_http_host, http_port) =
|
let (_http_host, http_port) =
|
||||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||||
let http_port = http_port.unwrap_or(9898);
|
let http_port = http_port.unwrap_or(9898);
|
||||||
|
|
||||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||||
// in case the pageserver-side structure is edited, and reflects the real life
|
// in case the pageserver-side structure is edited, and reflects the real life
|
||||||
// situation: the metadata is written by some other script.
|
// situation: the metadata is written by some other script.
|
||||||
@@ -226,7 +244,10 @@ impl PageServerNode {
|
|||||||
postgres_port: self.pg_connection_config.port(),
|
postgres_port: self.pg_connection_config.port(),
|
||||||
http_host: "localhost".to_string(),
|
http_host: "localhost".to_string(),
|
||||||
http_port,
|
http_port,
|
||||||
other: HashMap::new(),
|
other: HashMap::from([(
|
||||||
|
"availability_zone_id".to_string(),
|
||||||
|
serde_json::json!(az_id),
|
||||||
|
)]),
|
||||||
})
|
})
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
//! ```text
|
//! ```text
|
||||||
//! .neon/safekeepers/<safekeeper id>
|
//! .neon/safekeepers/<safekeeper id>
|
||||||
//! ```
|
//! ```
|
||||||
|
use std::future::Future;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@@ -34,12 +35,10 @@ pub enum SafekeeperHttpError {
|
|||||||
|
|
||||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
pub(crate) trait ResponseErrorMessageExt: Sized {
|
||||||
pub trait ResponseErrorMessageExt: Sized {
|
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
|
||||||
async fn error_from_body(self) -> Result<Self>;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl ResponseErrorMessageExt for reqwest::Response {
|
impl ResponseErrorMessageExt for reqwest::Response {
|
||||||
async fn error_from_body(self) -> Result<Self> {
|
async fn error_from_body(self) -> Result<Self> {
|
||||||
let status = self.status();
|
let status = self.status();
|
||||||
|
|||||||
@@ -437,6 +437,8 @@ impl StorageController {
|
|||||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||||
"--max-warming-up-interval",
|
"--max-warming-up-interval",
|
||||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||||
|
"--heartbeat-interval",
|
||||||
|
&humantime::Duration::from(self.config.heartbeat_interval).to_string(),
|
||||||
"--address-for-peers",
|
"--address-for-peers",
|
||||||
&address_for_peers.to_string(),
|
&address_for_peers.to_string(),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
|
|||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
controller_api::{
|
controller_api::{
|
||||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
|
||||||
TenantDescribeResponse, TenantPolicyRequest,
|
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||||
},
|
},
|
||||||
models::{
|
models::{
|
||||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||||
@@ -41,6 +41,8 @@ enum Command {
|
|||||||
listen_http_addr: String,
|
listen_http_addr: String,
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
listen_http_port: u16,
|
listen_http_port: u16,
|
||||||
|
#[arg(long)]
|
||||||
|
availability_zone_id: String,
|
||||||
},
|
},
|
||||||
|
|
||||||
/// Modify a node's configuration in the storage controller
|
/// Modify a node's configuration in the storage controller
|
||||||
@@ -78,7 +80,10 @@ enum Command {
|
|||||||
/// List nodes known to the storage controller
|
/// List nodes known to the storage controller
|
||||||
Nodes {},
|
Nodes {},
|
||||||
/// List tenants known to the storage controller
|
/// List tenants known to the storage controller
|
||||||
Tenants {},
|
Tenants {
|
||||||
|
/// If this field is set, it will list the tenants on a specific node
|
||||||
|
node_id: Option<NodeId>,
|
||||||
|
},
|
||||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
||||||
TenantCreate {
|
TenantCreate {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
@@ -147,9 +152,9 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
threshold: humantime::Duration,
|
threshold: humantime::Duration,
|
||||||
},
|
},
|
||||||
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
|
||||||
// outside of the specified set.
|
// outside of the specified set.
|
||||||
Drain {
|
BulkMigrate {
|
||||||
// Set of pageserver node ids to drain.
|
// Set of pageserver node ids to drain.
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
nodes: Vec<NodeId>,
|
nodes: Vec<NodeId>,
|
||||||
@@ -163,6 +168,34 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
dry_run: Option<bool>,
|
dry_run: Option<bool>,
|
||||||
},
|
},
|
||||||
|
/// Start draining the specified pageserver.
|
||||||
|
/// The drain is complete when the schedulling policy returns to active.
|
||||||
|
StartDrain {
|
||||||
|
#[arg(long)]
|
||||||
|
node_id: NodeId,
|
||||||
|
},
|
||||||
|
/// Cancel draining the specified pageserver and wait for `timeout`
|
||||||
|
/// for the operation to be canceled. May be retried.
|
||||||
|
CancelDrain {
|
||||||
|
#[arg(long)]
|
||||||
|
node_id: NodeId,
|
||||||
|
#[arg(long)]
|
||||||
|
timeout: humantime::Duration,
|
||||||
|
},
|
||||||
|
/// Start filling the specified pageserver.
|
||||||
|
/// The drain is complete when the schedulling policy returns to active.
|
||||||
|
StartFill {
|
||||||
|
#[arg(long)]
|
||||||
|
node_id: NodeId,
|
||||||
|
},
|
||||||
|
/// Cancel filling the specified pageserver and wait for `timeout`
|
||||||
|
/// for the operation to be canceled. May be retried.
|
||||||
|
CancelFill {
|
||||||
|
#[arg(long)]
|
||||||
|
node_id: NodeId,
|
||||||
|
#[arg(long)]
|
||||||
|
timeout: humantime::Duration,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
@@ -249,6 +282,34 @@ impl FromStr for NodeAvailabilityArg {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn wait_for_scheduling_policy<F>(
|
||||||
|
client: Client,
|
||||||
|
node_id: NodeId,
|
||||||
|
timeout: Duration,
|
||||||
|
f: F,
|
||||||
|
) -> anyhow::Result<NodeSchedulingPolicy>
|
||||||
|
where
|
||||||
|
F: Fn(NodeSchedulingPolicy) -> bool,
|
||||||
|
{
|
||||||
|
let waiter = tokio::time::timeout(timeout, async move {
|
||||||
|
loop {
|
||||||
|
let node = client
|
||||||
|
.dispatch::<(), NodeDescribeResponse>(
|
||||||
|
Method::GET,
|
||||||
|
format!("control/v1/node/{node_id}"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if f(node.scheduling) {
|
||||||
|
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(waiter.await??)
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
@@ -266,6 +327,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
listen_pg_port,
|
listen_pg_port,
|
||||||
listen_http_addr,
|
listen_http_addr,
|
||||||
listen_http_port,
|
listen_http_port,
|
||||||
|
availability_zone_id,
|
||||||
} => {
|
} => {
|
||||||
storcon_client
|
storcon_client
|
||||||
.dispatch::<_, ()>(
|
.dispatch::<_, ()>(
|
||||||
@@ -277,6 +339,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
listen_pg_port,
|
listen_pg_port,
|
||||||
listen_http_addr,
|
listen_http_addr,
|
||||||
listen_http_port,
|
listen_http_port,
|
||||||
|
availability_zone_id,
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -343,7 +406,41 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
Command::Tenants {} => {
|
Command::Tenants {
|
||||||
|
node_id: Some(node_id),
|
||||||
|
} => {
|
||||||
|
let describe_response = storcon_client
|
||||||
|
.dispatch::<(), NodeShardResponse>(
|
||||||
|
Method::GET,
|
||||||
|
format!("control/v1/node/{node_id}/shards"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let shards = describe_response.shards;
|
||||||
|
let mut table = comfy_table::Table::new();
|
||||||
|
table.set_header([
|
||||||
|
"Shard",
|
||||||
|
"Intended Primary/Secondary",
|
||||||
|
"Observed Primary/Secondary",
|
||||||
|
]);
|
||||||
|
for shard in shards {
|
||||||
|
table.add_row([
|
||||||
|
format!("{}", shard.tenant_shard_id),
|
||||||
|
match shard.is_intended_secondary {
|
||||||
|
None => "".to_string(),
|
||||||
|
Some(true) => "Secondary".to_string(),
|
||||||
|
Some(false) => "Primary".to_string(),
|
||||||
|
},
|
||||||
|
match shard.is_observed_secondary {
|
||||||
|
None => "".to_string(),
|
||||||
|
Some(true) => "Secondary".to_string(),
|
||||||
|
Some(false) => "Primary".to_string(),
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
println!("{table}");
|
||||||
|
}
|
||||||
|
Command::Tenants { node_id: None } => {
|
||||||
let mut resp = storcon_client
|
let mut resp = storcon_client
|
||||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
@@ -628,7 +725,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
})
|
})
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
Command::Drain {
|
Command::BulkMigrate {
|
||||||
nodes,
|
nodes,
|
||||||
concurrency,
|
concurrency,
|
||||||
max_shards,
|
max_shards,
|
||||||
@@ -657,7 +754,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if nodes.len() != node_to_drain_descs.len() {
|
if nodes.len() != node_to_drain_descs.len() {
|
||||||
anyhow::bail!("Drain requested for node which doesn't exist.")
|
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
|
||||||
}
|
}
|
||||||
|
|
||||||
node_to_fill_descs.retain(|desc| {
|
node_to_fill_descs.retain(|desc| {
|
||||||
@@ -669,7 +766,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if node_to_fill_descs.is_empty() {
|
if node_to_fill_descs.is_empty() {
|
||||||
anyhow::bail!("There are no nodes to drain to")
|
anyhow::bail!("There are no nodes to migrate to")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the node scheduling policy to draining for the nodes which
|
// Set the node scheduling policy to draining for the nodes which
|
||||||
@@ -690,7 +787,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Perform the drain: move each tenant shard scheduled on a node to
|
// Perform the migration: move each tenant shard scheduled on a node to
|
||||||
// be drained to a node which is being filled. A simple round robin
|
// be drained to a node which is being filled. A simple round robin
|
||||||
// strategy is used to pick the new node.
|
// strategy is used to pick the new node.
|
||||||
let tenants = storcon_client
|
let tenants = storcon_client
|
||||||
@@ -703,13 +800,13 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
let mut selected_node_idx = 0;
|
let mut selected_node_idx = 0;
|
||||||
|
|
||||||
struct DrainMove {
|
struct MigrationMove {
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
from: NodeId,
|
from: NodeId,
|
||||||
to: NodeId,
|
to: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut moves: Vec<DrainMove> = Vec::new();
|
let mut moves: Vec<MigrationMove> = Vec::new();
|
||||||
|
|
||||||
let shards = tenants
|
let shards = tenants
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -739,7 +836,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
moves.push(DrainMove {
|
moves.push(MigrationMove {
|
||||||
tenant_shard_id: shard.tenant_shard_id,
|
tenant_shard_id: shard.tenant_shard_id,
|
||||||
from: shard
|
from: shard
|
||||||
.node_attached
|
.node_attached
|
||||||
@@ -816,6 +913,67 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
failure
|
failure
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Command::StartDrain { node_id } => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<(), ()>(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/node/{node_id}/drain"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
println!("Drain started for {node_id}");
|
||||||
|
}
|
||||||
|
Command::CancelDrain { node_id, timeout } => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<(), ()>(
|
||||||
|
Method::DELETE,
|
||||||
|
format!("control/v1/node/{node_id}/drain"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||||
|
|
||||||
|
let final_policy =
|
||||||
|
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||||
|
use NodeSchedulingPolicy::*;
|
||||||
|
matches!(sched, Active | PauseForRestart)
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Command::StartFill { node_id } => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("Fill started for {node_id}");
|
||||||
|
}
|
||||||
|
Command::CancelFill { node_id, timeout } => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<(), ()>(
|
||||||
|
Method::DELETE,
|
||||||
|
format!("control/v1/node/{node_id}/fill"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||||
|
|
||||||
|
let final_policy =
|
||||||
|
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||||
|
use NodeSchedulingPolicy::*;
|
||||||
|
matches!(sched, Active)
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ set -x
|
|||||||
|
|
||||||
cd /ext-src || exit 2
|
cd /ext-src || exit 2
|
||||||
FAILED=
|
FAILED=
|
||||||
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||||
for d in ${LIST}
|
for d in ${LIST}
|
||||||
do
|
do
|
||||||
[ -d "${d}" ] || continue
|
[ -d "${d}" ] || continue
|
||||||
|
|||||||
259
docs/rfcs/037-storage-controller-restarts.md
Normal file
259
docs/rfcs/037-storage-controller-restarts.md
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
# Rolling Storage Controller Restarts
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This RFC describes the issues around the current storage controller restart procedure
|
||||||
|
and describes an implementation which reduces downtime to a few milliseconds on the happy path.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
|
||||||
|
While the storage controller does not sit on the main data path, it's generally not acceptable
|
||||||
|
to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
|
||||||
|
|
||||||
|
### Current Implementation
|
||||||
|
|
||||||
|
The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
|
||||||
|
In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
|
||||||
|
a new instance is created.
|
||||||
|
|
||||||
|
At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
|
||||||
|
latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
|
||||||
|
under unfavourable circumstances: pageservers are heavily loaded or unavailable.
|
||||||
|
|
||||||
|
## Prior Art
|
||||||
|
|
||||||
|
There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
|
||||||
|
* Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them.
|
||||||
|
For fail-over, traffic is routed to one of the standbys (which becomes active).
|
||||||
|
* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
|
||||||
|
and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
* Reduce storage controller unavailability during upgrades to milliseconds
|
||||||
|
* Minimize the interval in which it's possible for more than one storage controller
|
||||||
|
to issue reconciles.
|
||||||
|
* Have one uniform implementation for restarts and upgrades
|
||||||
|
* Fit in with the current Kubernetes deployment scheme
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
* Implement our own consensus algorithm from scratch
|
||||||
|
* Completely eliminate storage controller downtime. Instead we aim to reduce it to the point where it looks
|
||||||
|
like a transient error to the control plane
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
* storage controller
|
||||||
|
* deployment orchestration (i.e. Ansible)
|
||||||
|
* helm charts
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
|
||||||
|
at start-up by querying pageservers
|
||||||
|
* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
|
||||||
|
a set of replicas
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### High Level Flow
|
||||||
|
|
||||||
|
At a very high level the proposed idea is to start a new storage controller instance while
|
||||||
|
the previous one is still running and cut-over to it when it becomes ready. The new instance,
|
||||||
|
should coordinate with the existing one and transition responsibility gracefully. While the controller
|
||||||
|
has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
|
||||||
|
scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
|
||||||
|
were operating at the same time and require operator intervention to remedy.
|
||||||
|
|
||||||
|
### Kubernetes Deployment Configuration
|
||||||
|
|
||||||
|
On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
|
||||||
|
to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
|
||||||
|
Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
|
||||||
|
scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
|
||||||
|
|
||||||
|
The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
|
||||||
|
|
||||||
|
### Storage Controller Start-Up
|
||||||
|
|
||||||
|
This section describes the primitives required on the storage controller side and the flow of the happy path.
|
||||||
|
|
||||||
|
#### Database Table For Leader Synchronization
|
||||||
|
|
||||||
|
A new table should be added to the storage controller database for leader synchronization during startup.
|
||||||
|
This table will always contain at most one row. The proposed name for the table is `leader` and the schema
|
||||||
|
contains two elements:
|
||||||
|
* `hostname`: represents the hostname for the current storage controller leader - should be addressable
|
||||||
|
from other pods in the deployment
|
||||||
|
* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
|
||||||
|
for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
|
||||||
|
|
||||||
|
Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
|
||||||
|
at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
|
||||||
|
situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
|
||||||
|
level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
|
||||||
|
READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
|
||||||
|
the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
|
||||||
|
our needs here.
|
||||||
|
|
||||||
|
```
|
||||||
|
START TRANSACTION ISOLATION LEVEL REPEATABLE READ
|
||||||
|
UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
|
||||||
|
WHERE hostname=<old_hostname> AND start_timestamp=<old_start_ts>;
|
||||||
|
```
|
||||||
|
|
||||||
|
If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
|
||||||
|
|
||||||
|
#### Step Down API
|
||||||
|
|
||||||
|
A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
|
||||||
|
request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
|
||||||
|
and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
|
||||||
|
snapshot of the observed state.
|
||||||
|
|
||||||
|
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
|
||||||
|
for failure scenario handling - see [Handling Failures](#handling-failures)).
|
||||||
|
|
||||||
|
#### Graceful Restart Happy Path
|
||||||
|
|
||||||
|
At start-up, the first thing the storage controller does is retrieve the sole row from the new
|
||||||
|
`leader` table. If such an entry exists, send a `/step_down` POST API call to the current leader.
|
||||||
|
This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
|
||||||
|
observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
|
||||||
|
pageservers in order to build up the observed state.
|
||||||
|
|
||||||
|
Before doing any reconciliations or persistence changes, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization)
|
||||||
|
section. If this step fails, the storage controller process exits.
|
||||||
|
|
||||||
|
Note that no row will exist in the `leader` table for the first graceful restart. In that case, force update the `leader` table
|
||||||
|
(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
|
||||||
|
|
||||||
|
Summary of proposed new start-up sequence:
|
||||||
|
1. Call `/step_down`
|
||||||
|
2. Perform any pending database migrations
|
||||||
|
3. Load state from database
|
||||||
|
4. Load observed state returned in step (1) into memory
|
||||||
|
5. Do initial heartbeat round (may be moved after 5)
|
||||||
|
7. Mark self as leader by updating the database
|
||||||
|
8. Reschedule and reconcile everything
|
||||||
|
|
||||||
|
Some things to note from the steps above:
|
||||||
|
* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
|
||||||
|
calls to the pageserver and no compute notifications)
|
||||||
|
* Ask the current leader to step down before loading state from database so we don't get a lost update
|
||||||
|
if the transactions overlap.
|
||||||
|
* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
|
||||||
|
fall back to asking the pageservers about their current locations.
|
||||||
|
* Database migrations should only run **after** the previous instance steps down (or the step down times out).
|
||||||
|
|
||||||
|
|
||||||
|
[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
|
||||||
|
so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
|
||||||
|
|
||||||
|
### Handling Failures
|
||||||
|
|
||||||
|
#### Storage Controller Crash Or Restart
|
||||||
|
|
||||||
|
The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
|
||||||
|
`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
|
||||||
|
start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
|
||||||
|
exits and consistency is maintained.
|
||||||
|
|
||||||
|
#### Previous Leader Crashes Before New Leader Readiness
|
||||||
|
|
||||||
|
When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will
|
||||||
|
reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
|
||||||
|
(see [2]).
|
||||||
|
|
||||||
|
Now we have two cases to consider:
|
||||||
|
* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
|
||||||
|
by Kubernetes depending on timings.
|
||||||
|
* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
|
||||||
|
The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
|
||||||
|
create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent.
|
||||||
|
|
||||||
|
[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
|
||||||
|
should avoid this self reference and fail the API call at the client if the persisted hostname matches
|
||||||
|
the current one.
|
||||||
|
|
||||||
|
#### Previous Leader Crashes After New Leader Readiness
|
||||||
|
|
||||||
|
The deployment's replica sets already satisfy the deployment's replica count requirements and the
|
||||||
|
Kubernetes deployment rollout will just clean up the dead pod.
|
||||||
|
|
||||||
|
#### New Leader Crashes Before Passing Readiness Check
|
||||||
|
|
||||||
|
The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated
|
||||||
|
with the new pod.
|
||||||
|
|
||||||
|
#### Network Partition Between New Pod and Previous Leader
|
||||||
|
|
||||||
|
This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
|
||||||
|
API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
|
||||||
|
Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles.
|
||||||
|
|
||||||
|
### Dealing With Split Brain Scenarios
|
||||||
|
|
||||||
|
As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain
|
||||||
|
duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
|
||||||
|
scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
|
||||||
|
The rest of this section sketches some safety measures. It's likely overkill to implement all of them, however.
|
||||||
|
|
||||||
|
### Ensure Leadership Before Producing Side Effects
|
||||||
|
|
||||||
|
The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
|
||||||
|
Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
|
||||||
|
applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases.
|
||||||
|
|
||||||
|
### Leadership Lease
|
||||||
|
|
||||||
|
Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
|
||||||
|
to be renewed periodically. Two new columns would be added to the `leader` table:
|
||||||
|
1. `last_renewed` - timestamp indicating when the lease was last renewed
|
||||||
|
2. `lease_duration` - duration indicating the amount of time after which the lease expires
|
||||||
|
|
||||||
|
The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
|
||||||
|
same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
|
||||||
|
to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request.
|
||||||
|
|
||||||
|
### Notify Pageserver Of Storage Controller Term
|
||||||
|
|
||||||
|
Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
|
||||||
|
Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
|
||||||
|
anything which contains a stale term (i.e. smaller than the current one).
|
||||||
|
|
||||||
|
### Observability
|
||||||
|
|
||||||
|
* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`).
|
||||||
|
Per-region alerts should be added on this metric which trigger when:
|
||||||
|
+ no storage controller has been in the `Active` state for an extended period of time
|
||||||
|
+ more than one storage controllers are in the `Active` state
|
||||||
|
|
||||||
|
* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
|
||||||
|
We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
|
||||||
|
|
||||||
|
## Alternatives
|
||||||
|
|
||||||
|
### Kubernetes Leases
|
||||||
|
|
||||||
|
Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
|
||||||
|
Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
|
||||||
|
|
||||||
|
In our case, it would work something like this:
|
||||||
|
* `/step_down` deletes the lease or stops it from renewing
|
||||||
|
* lease acquisition becomes part of the start-up procedure
|
||||||
|
|
||||||
|
The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
|
||||||
|
not exactly trivial to implement.
|
||||||
|
|
||||||
|
This approach has the benefit of baked in observability (`kubectl describe lease`), but:
|
||||||
|
* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
|
||||||
|
* More code surface than the simple "row in database" approach. Also, most of this code would be in
|
||||||
|
a dependency not subject to code review, etc.
|
||||||
|
* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it to do
|
||||||
|
so is not simple and complicates the test set-up.
|
||||||
|
|
||||||
|
To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
|
||||||
|
to something external.
|
||||||
@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
|
|||||||
1. Create a new branch based on the stable branch you are updating.
|
1. Create a new branch based on the stable branch you are updating.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git checkout -b my-branch REL_15_STABLE_neon
|
git checkout -b my-branch-15 REL_15_STABLE_neon
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Tag the last commit on the stable branch you are updating.
|
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
|
||||||
|
|
||||||
```shell
|
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
|
||||||
git tag REL_15_3_neon
|
|
||||||
```
|
|
||||||
|
|
||||||
1. Push the new tag to the Neon Postgres repository.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
git push origin REL_15_3_neon
|
|
||||||
```
|
|
||||||
|
|
||||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
|
||||||
|
|
||||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git fetch upstream REL_15_4
|
git fetch upstream REL_15_4
|
||||||
git rebase REL_15_4
|
git merge REL_15_4
|
||||||
```
|
```
|
||||||
|
|
||||||
|
In the commit message of the merge commit, mention if there were
|
||||||
|
any non-trivial conflicts or other issues.
|
||||||
|
|
||||||
1. Run the Postgres test suite to make sure our commits have not affected
|
1. Run the Postgres test suite to make sure our commits have not affected
|
||||||
Postgres in a negative way.
|
Postgres in a negative way.
|
||||||
|
|
||||||
@@ -57,7 +48,7 @@ Postgres in a negative way.
|
|||||||
1. Push your branch to the Neon Postgres repository.
|
1. Push your branch to the Neon Postgres repository.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git push origin my-branch
|
git push origin my-branch-15
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Clone the Neon repository if you have not done so already.
|
1. Clone the Neon repository if you have not done so already.
|
||||||
@@ -74,7 +65,7 @@ branch.
|
|||||||
1. Update the Git submodule.
|
1. Update the Git submodule.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
|
||||||
git submodule update --remote vendor/postgres-v15
|
git submodule update --remote vendor/postgres-v15
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -89,14 +80,12 @@ minor Postgres release.
|
|||||||
|
|
||||||
1. Create a pull request, and wait for CI to go green.
|
1. Create a pull request, and wait for CI to go green.
|
||||||
|
|
||||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git push --force origin my-branch:REL_15_STABLE_neon
|
git push origin my-branch-15:REL_15_STABLE_neon
|
||||||
```
|
```
|
||||||
|
|
||||||
It may require disabling various branch protections.
|
|
||||||
|
|
||||||
1. Update your Neon PR to point at the branches.
|
1. Update your Neon PR to point at the branches.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ macro_rules! register_uint_gauge {
|
|||||||
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
||||||
|
|
||||||
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
||||||
|
///
|
||||||
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
||||||
/// while holding the lock.
|
/// while holding the lock.
|
||||||
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
|
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
|
||||||
|
|||||||
@@ -4,6 +4,10 @@ version = "0.1.0"
|
|||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|
||||||
|
[features]
|
||||||
|
# See pageserver/Cargo.toml
|
||||||
|
testing = ["dep:nix"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_with.workspace = true
|
serde_with.workspace = true
|
||||||
@@ -23,6 +27,12 @@ thiserror.workspace = true
|
|||||||
humantime-serde.workspace = true
|
humantime-serde.workspace = true
|
||||||
chrono = { workspace = true, features = ["serde"] }
|
chrono = { workspace = true, features = ["serde"] }
|
||||||
itertools.workspace = true
|
itertools.workspace = true
|
||||||
|
storage_broker.workspace = true
|
||||||
|
camino = {workspace = true, features = ["serde1"]}
|
||||||
|
remote_storage.workspace = true
|
||||||
|
postgres_backend.workspace = true
|
||||||
|
nix = {workspace = true, optional = true}
|
||||||
|
reqwest.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
|
|||||||
@@ -1,15 +1,28 @@
|
|||||||
use std::collections::HashMap;
|
use camino::Utf8PathBuf;
|
||||||
|
|
||||||
use const_format::formatcp;
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
||||||
|
use const_format::formatcp;
|
||||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||||
|
|
||||||
|
use postgres_backend::AuthType;
|
||||||
|
use remote_storage::RemoteStorageConfig;
|
||||||
|
use serde_with::serde_as;
|
||||||
|
use std::{
|
||||||
|
collections::HashMap,
|
||||||
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
|
str::FromStr,
|
||||||
|
time::Duration,
|
||||||
|
};
|
||||||
|
use utils::logging::LogFormat;
|
||||||
|
|
||||||
|
use crate::models::ImageCompressionAlgorithm;
|
||||||
|
use crate::models::LsnLease;
|
||||||
|
|
||||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||||
// as a separate structure. This information is not neeed by the pageserver
|
// as a separate structure. This information is not neeed by the pageserver
|
||||||
// itself, it is only used for registering the pageserver with the control
|
// itself, it is only used for registering the pageserver with the control
|
||||||
@@ -29,3 +42,476 @@ pub struct NodeMetadata {
|
|||||||
#[serde(flatten)]
|
#[serde(flatten)]
|
||||||
pub other: HashMap<String, serde_json::Value>,
|
pub other: HashMap<String, serde_json::Value>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `pageserver.toml`
|
||||||
|
///
|
||||||
|
/// We use serde derive with `#[serde(default)]` to generate a deserializer
|
||||||
|
/// that fills in the default values for each config field.
|
||||||
|
///
|
||||||
|
/// If there cannot be a static default value because we need to make runtime
|
||||||
|
/// checks to determine the default, make it an `Option` (which defaults to None).
|
||||||
|
/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
|
||||||
|
#[serde_as]
|
||||||
|
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
|
||||||
|
#[serde(default, deny_unknown_fields)]
|
||||||
|
pub struct ConfigToml {
|
||||||
|
// types mapped 1:1 into the runtime PageServerConfig type
|
||||||
|
pub listen_pg_addr: String,
|
||||||
|
pub listen_http_addr: String,
|
||||||
|
pub availability_zone: Option<String>,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub wait_lsn_timeout: Duration,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub wal_redo_timeout: Duration,
|
||||||
|
pub superuser: String,
|
||||||
|
pub page_cache_size: usize,
|
||||||
|
pub max_file_descriptors: usize,
|
||||||
|
pub pg_distrib_dir: Option<Utf8PathBuf>,
|
||||||
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
|
pub http_auth_type: AuthType,
|
||||||
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
|
pub pg_auth_type: AuthType,
|
||||||
|
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
|
||||||
|
pub remote_storage: Option<RemoteStorageConfig>,
|
||||||
|
pub tenant_config: TenantConfigToml,
|
||||||
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
|
pub broker_endpoint: storage_broker::Uri,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub broker_keepalive_interval: Duration,
|
||||||
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
|
pub log_format: LogFormat,
|
||||||
|
pub concurrent_tenant_warmup: NonZeroUsize,
|
||||||
|
pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub metric_collection_interval: Duration,
|
||||||
|
pub metric_collection_endpoint: Option<reqwest::Url>,
|
||||||
|
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub synthetic_size_calculation_interval: Duration,
|
||||||
|
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||||
|
pub test_remote_failures: u64,
|
||||||
|
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub background_task_maximum_delay: Duration,
|
||||||
|
pub control_plane_api: Option<reqwest::Url>,
|
||||||
|
pub control_plane_api_token: Option<String>,
|
||||||
|
pub control_plane_emergency_mode: bool,
|
||||||
|
pub heatmap_upload_concurrency: usize,
|
||||||
|
pub secondary_download_concurrency: usize,
|
||||||
|
pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
|
||||||
|
pub ingest_batch_size: u64,
|
||||||
|
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||||
|
pub image_compression: ImageCompressionAlgorithm,
|
||||||
|
pub ephemeral_bytes_per_memory_kb: usize,
|
||||||
|
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||||
|
#[serde(skip_serializing)]
|
||||||
|
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
|
||||||
|
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
|
||||||
|
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||||
|
pub io_buffer_alignment: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
#[serde(deny_unknown_fields)]
|
||||||
|
pub struct DiskUsageEvictionTaskConfig {
|
||||||
|
pub max_usage_pct: utils::serde_percent::Percent,
|
||||||
|
pub min_avail_bytes: u64,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub period: Duration,
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
pub mock_statvfs: Option<statvfs::mock::Behavior>,
|
||||||
|
/// Select sorting for evicted layers
|
||||||
|
#[serde(default)]
|
||||||
|
pub eviction_order: EvictionOrder,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod statvfs {
|
||||||
|
pub mod mock {
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
#[serde(tag = "type")]
|
||||||
|
pub enum Behavior {
|
||||||
|
Success {
|
||||||
|
blocksize: u64,
|
||||||
|
total_blocks: u64,
|
||||||
|
name_filter: Option<utils::serde_regex::Regex>,
|
||||||
|
},
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
Failure { mocked_error: MockedError },
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
#[allow(clippy::upper_case_acronyms)]
|
||||||
|
pub enum MockedError {
|
||||||
|
EIO,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
impl From<MockedError> for nix::Error {
|
||||||
|
fn from(e: MockedError) -> Self {
|
||||||
|
match e {
|
||||||
|
MockedError::EIO => nix::Error::EIO,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
#[serde(tag = "type", content = "args")]
|
||||||
|
pub enum EvictionOrder {
|
||||||
|
RelativeAccessed {
|
||||||
|
highest_layer_count_loses_first: bool,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for EvictionOrder {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::RelativeAccessed {
|
||||||
|
highest_layer_count_loses_first: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(
|
||||||
|
Eq,
|
||||||
|
PartialEq,
|
||||||
|
Debug,
|
||||||
|
Copy,
|
||||||
|
Clone,
|
||||||
|
strum_macros::EnumString,
|
||||||
|
strum_macros::Display,
|
||||||
|
serde_with::DeserializeFromStr,
|
||||||
|
serde_with::SerializeDisplay,
|
||||||
|
)]
|
||||||
|
#[strum(serialize_all = "kebab-case")]
|
||||||
|
pub enum GetVectoredImpl {
|
||||||
|
Sequential,
|
||||||
|
Vectored,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(
|
||||||
|
Eq,
|
||||||
|
PartialEq,
|
||||||
|
Debug,
|
||||||
|
Copy,
|
||||||
|
Clone,
|
||||||
|
strum_macros::EnumString,
|
||||||
|
strum_macros::Display,
|
||||||
|
serde_with::DeserializeFromStr,
|
||||||
|
serde_with::SerializeDisplay,
|
||||||
|
)]
|
||||||
|
#[strum(serialize_all = "kebab-case")]
|
||||||
|
pub enum GetImpl {
|
||||||
|
Legacy,
|
||||||
|
Vectored,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
#[serde(transparent)]
|
||||||
|
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||||
|
|
||||||
|
/// A tenant's calcuated configuration, which is the result of merging a
|
||||||
|
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
||||||
|
///
|
||||||
|
/// For storing and transmitting individual tenant's configuration, see
|
||||||
|
/// TenantConfOpt.
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
#[serde(deny_unknown_fields, default)]
|
||||||
|
pub struct TenantConfigToml {
|
||||||
|
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||||
|
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||||
|
// page server crashes.
|
||||||
|
// This parameter actually determines L0 layer file size.
|
||||||
|
pub checkpoint_distance: u64,
|
||||||
|
// Inmemory layer is also flushed at least once in checkpoint_timeout to
|
||||||
|
// eventually upload WAL after activity is stopped.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub checkpoint_timeout: Duration,
|
||||||
|
// Target file size, when creating image and delta layers.
|
||||||
|
// This parameter determines L1 layer file size.
|
||||||
|
pub compaction_target_size: u64,
|
||||||
|
// How often to check if there's compaction work to be done.
|
||||||
|
// Duration::ZERO means automatic compaction is disabled.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub compaction_period: Duration,
|
||||||
|
// Level0 delta layer threshold for compaction.
|
||||||
|
pub compaction_threshold: usize,
|
||||||
|
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
|
||||||
|
// Determines how much history is retained, to allow
|
||||||
|
// branching and read replicas at an older point in time.
|
||||||
|
// The unit is #of bytes of WAL.
|
||||||
|
// Page versions older than this are garbage collected away.
|
||||||
|
pub gc_horizon: u64,
|
||||||
|
// Interval at which garbage collection is triggered.
|
||||||
|
// Duration::ZERO means automatic GC is disabled
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub gc_period: Duration,
|
||||||
|
// Delta layer churn threshold to create L1 image layers.
|
||||||
|
pub image_creation_threshold: usize,
|
||||||
|
// Determines how much history is retained, to allow
|
||||||
|
// branching and read replicas at an older point in time.
|
||||||
|
// The unit is time.
|
||||||
|
// Page versions older than this are garbage collected away.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub pitr_interval: Duration,
|
||||||
|
/// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub walreceiver_connect_timeout: Duration,
|
||||||
|
/// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
|
||||||
|
/// A stalled safekeeper will be changed to a newer one when it appears.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub lagging_wal_timeout: Duration,
|
||||||
|
/// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
|
||||||
|
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
|
||||||
|
/// to avoid eager reconnects.
|
||||||
|
pub max_lsn_wal_lag: NonZeroU64,
|
||||||
|
pub eviction_policy: crate::models::EvictionPolicy,
|
||||||
|
pub min_resident_size_override: Option<u64>,
|
||||||
|
// See the corresponding metric's help string.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||||
|
|
||||||
|
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||||
|
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||||
|
/// locations will use the heatmap uploaded by attached locations.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub heatmap_period: Duration,
|
||||||
|
|
||||||
|
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||||
|
pub lazy_slru_download: bool,
|
||||||
|
|
||||||
|
pub timeline_get_throttle: crate::models::ThrottleConfig,
|
||||||
|
|
||||||
|
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||||
|
// Expresed in multiples of checkpoint distance.
|
||||||
|
pub image_layer_creation_check_threshold: u8,
|
||||||
|
|
||||||
|
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||||
|
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||||
|
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||||
|
/// file is written.
|
||||||
|
pub switch_aux_file_policy: crate::models::AuxFilePolicy,
|
||||||
|
|
||||||
|
/// The length for an explicit LSN lease request.
|
||||||
|
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub lsn_lease_length: Duration,
|
||||||
|
|
||||||
|
/// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
|
||||||
|
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub lsn_lease_length_for_ts: Duration,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod defaults {
|
||||||
|
use crate::models::ImageCompressionAlgorithm;
|
||||||
|
|
||||||
|
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||||
|
|
||||||
|
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||||
|
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||||
|
|
||||||
|
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||||
|
|
||||||
|
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||||
|
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||||
|
|
||||||
|
pub const DEFAULT_LOG_FORMAT: &str = "plain";
|
||||||
|
|
||||||
|
pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
|
||||||
|
|
||||||
|
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;
|
||||||
|
|
||||||
|
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||||
|
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||||
|
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||||
|
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||||
|
|
||||||
|
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
|
||||||
|
pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
|
||||||
|
|
||||||
|
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||||
|
|
||||||
|
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||||
|
|
||||||
|
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||||
|
ImageCompressionAlgorithm::Zstd { level: Some(1) };
|
||||||
|
|
||||||
|
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||||
|
|
||||||
|
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||||
|
|
||||||
|
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ConfigToml {
|
||||||
|
fn default() -> Self {
|
||||||
|
use defaults::*;
|
||||||
|
|
||||||
|
Self {
|
||||||
|
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||||
|
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||||
|
availability_zone: (None),
|
||||||
|
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||||
|
.expect("cannot parse default wait lsn timeout")),
|
||||||
|
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||||
|
.expect("cannot parse default wal redo timeout")),
|
||||||
|
superuser: (DEFAULT_SUPERUSER.to_string()),
|
||||||
|
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
|
||||||
|
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
|
||||||
|
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
|
||||||
|
http_auth_type: (AuthType::Trust),
|
||||||
|
pg_auth_type: (AuthType::Trust),
|
||||||
|
auth_validation_public_key_path: (None),
|
||||||
|
remote_storage: None,
|
||||||
|
broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
|
||||||
|
.parse()
|
||||||
|
.expect("failed to parse default broker endpoint")),
|
||||||
|
broker_keepalive_interval: (humantime::parse_duration(
|
||||||
|
storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
|
||||||
|
)
|
||||||
|
.expect("cannot parse default keepalive interval")),
|
||||||
|
log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||||
|
|
||||||
|
concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
|
||||||
|
.expect("Invalid default constant")),
|
||||||
|
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
|
||||||
|
metric_collection_interval: (humantime::parse_duration(
|
||||||
|
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||||
|
)
|
||||||
|
.expect("cannot parse default metric collection interval")),
|
||||||
|
synthetic_size_calculation_interval: (humantime::parse_duration(
|
||||||
|
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
|
||||||
|
)
|
||||||
|
.expect("cannot parse default synthetic size calculation interval")),
|
||||||
|
metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||||
|
|
||||||
|
metric_collection_bucket: (None),
|
||||||
|
|
||||||
|
disk_usage_based_eviction: (None),
|
||||||
|
|
||||||
|
test_remote_failures: (0),
|
||||||
|
|
||||||
|
ondemand_download_behavior_treat_error_as_warn: (false),
|
||||||
|
|
||||||
|
background_task_maximum_delay: (humantime::parse_duration(
|
||||||
|
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
|
||||||
|
)
|
||||||
|
.unwrap()),
|
||||||
|
|
||||||
|
control_plane_api: (None),
|
||||||
|
control_plane_api_token: (None),
|
||||||
|
control_plane_emergency_mode: (false),
|
||||||
|
|
||||||
|
heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
|
||||||
|
secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||||
|
|
||||||
|
ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
|
||||||
|
|
||||||
|
virtual_file_io_engine: None,
|
||||||
|
|
||||||
|
max_vectored_read_bytes: (MaxVectoredReadBytes(
|
||||||
|
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||||
|
)),
|
||||||
|
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||||
|
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||||
|
l0_flush: None,
|
||||||
|
compact_level0_phase1_value_access: Default::default(),
|
||||||
|
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||||
|
|
||||||
|
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||||
|
|
||||||
|
tenant_config: TenantConfigToml::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod tenant_conf_defaults {
|
||||||
|
|
||||||
|
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||||
|
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||||
|
// which is good for now to trigger bugs.
|
||||||
|
// This parameter actually determines L0 layer file size.
|
||||||
|
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||||
|
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
|
||||||
|
|
||||||
|
// FIXME the below configs are only used by legacy algorithm. The new algorithm
|
||||||
|
// has different parameters.
|
||||||
|
|
||||||
|
// Target file size, when creating image and delta layers.
|
||||||
|
// This parameter determines L1 layer file size.
|
||||||
|
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||||
|
|
||||||
|
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||||
|
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||||
|
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
|
||||||
|
crate::models::CompactionAlgorithm::Legacy;
|
||||||
|
|
||||||
|
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||||
|
|
||||||
|
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
|
||||||
|
// If there's a need to decrease this value, first make sure that GC
|
||||||
|
// doesn't hold a layer map write lock for non-trivial operations.
|
||||||
|
// Relevant: https://github.com/neondatabase/neon/issues/3394
|
||||||
|
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
|
||||||
|
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||||
|
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
||||||
|
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
|
||||||
|
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||||
|
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
|
||||||
|
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
|
||||||
|
// throughputs up to 1GiB/s per timeline.
|
||||||
|
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||||
|
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||||
|
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||||
|
// image layers should be created.
|
||||||
|
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||||
|
|
||||||
|
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for TenantConfigToml {
|
||||||
|
fn default() -> Self {
|
||||||
|
use tenant_conf_defaults::*;
|
||||||
|
Self {
|
||||||
|
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
|
||||||
|
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
|
||||||
|
.expect("cannot parse default checkpoint timeout"),
|
||||||
|
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
|
||||||
|
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||||
|
.expect("cannot parse default compaction period"),
|
||||||
|
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||||
|
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
|
||||||
|
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||||
|
},
|
||||||
|
gc_horizon: DEFAULT_GC_HORIZON,
|
||||||
|
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||||
|
.expect("cannot parse default gc period"),
|
||||||
|
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||||
|
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||||
|
.expect("cannot parse default PITR interval"),
|
||||||
|
walreceiver_connect_timeout: humantime::parse_duration(
|
||||||
|
DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
|
||||||
|
)
|
||||||
|
.expect("cannot parse default walreceiver connect timeout"),
|
||||||
|
lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
|
||||||
|
.expect("cannot parse default walreceiver lagging wal timeout"),
|
||||||
|
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||||
|
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||||
|
eviction_policy: crate::models::EvictionPolicy::NoEviction,
|
||||||
|
min_resident_size_override: None,
|
||||||
|
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||||
|
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||||
|
)
|
||||||
|
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
||||||
|
heatmap_period: Duration::ZERO,
|
||||||
|
lazy_slru_download: false,
|
||||||
|
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
|
||||||
|
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||||
|
switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
|
||||||
|
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||||
|
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::{NodeId, TenantId};
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
|
use crate::models::PageserverUtilization;
|
||||||
use crate::{
|
use crate::{
|
||||||
models::{ShardParameters, TenantConfig},
|
models::{ShardParameters, TenantConfig},
|
||||||
shard::{ShardStripeSize, TenantShardId},
|
shard::{ShardStripeSize, TenantShardId},
|
||||||
@@ -55,6 +56,8 @@ pub struct NodeRegisterRequest {
|
|||||||
|
|
||||||
pub listen_http_addr: String,
|
pub listen_http_addr: String,
|
||||||
pub listen_http_port: u16,
|
pub listen_http_port: u16,
|
||||||
|
|
||||||
|
pub availability_zone_id: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
@@ -71,6 +74,17 @@ pub struct TenantPolicyRequest {
|
|||||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct ShardsPreferredAzsRequest {
|
||||||
|
#[serde(flatten)]
|
||||||
|
pub preferred_az_ids: HashMap<TenantShardId, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct ShardsPreferredAzsResponse {
|
||||||
|
pub updated: Vec<TenantShardId>,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct TenantLocateResponseShard {
|
pub struct TenantLocateResponseShard {
|
||||||
pub shard_id: TenantShardId,
|
pub shard_id: TenantShardId,
|
||||||
@@ -98,6 +112,21 @@ pub struct TenantDescribeResponse {
|
|||||||
pub config: TenantConfig,
|
pub config: TenantConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub struct NodeShardResponse {
|
||||||
|
pub node_id: NodeId,
|
||||||
|
pub shards: Vec<NodeShard>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub struct NodeShard {
|
||||||
|
pub tenant_shard_id: TenantShardId,
|
||||||
|
/// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
|
||||||
|
pub is_observed_secondary: Option<bool>,
|
||||||
|
/// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
|
||||||
|
pub is_intended_secondary: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct NodeDescribeResponse {
|
pub struct NodeDescribeResponse {
|
||||||
pub id: NodeId,
|
pub id: NodeId,
|
||||||
@@ -129,8 +158,12 @@ pub struct TenantDescribeResponseShard {
|
|||||||
pub is_splitting: bool,
|
pub is_splitting: bool,
|
||||||
|
|
||||||
pub scheduling_policy: ShardSchedulingPolicy,
|
pub scheduling_policy: ShardSchedulingPolicy,
|
||||||
|
|
||||||
|
pub preferred_az_id: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Migration request for a given tenant shard to a given node.
|
||||||
|
///
|
||||||
/// Explicitly migrating a particular shard is a low level operation
|
/// Explicitly migrating a particular shard is a low level operation
|
||||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||||
@@ -140,23 +173,11 @@ pub struct TenantShardMigrateRequest {
|
|||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Utilisation score indicating how good a candidate a pageserver
|
#[derive(Serialize, Clone, Debug)]
|
||||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
|
||||||
/// Lower values are better.
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
|
||||||
pub struct UtilizationScore(pub u64);
|
|
||||||
|
|
||||||
impl UtilizationScore {
|
|
||||||
pub fn worst() -> Self {
|
|
||||||
UtilizationScore(u64::MAX)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Clone, Copy, Debug)]
|
|
||||||
#[serde(into = "NodeAvailabilityWrapper")]
|
#[serde(into = "NodeAvailabilityWrapper")]
|
||||||
pub enum NodeAvailability {
|
pub enum NodeAvailability {
|
||||||
// Normal, happy state
|
// Normal, happy state
|
||||||
Active(UtilizationScore),
|
Active(PageserverUtilization),
|
||||||
// Node is warming up, but we expect it to become available soon. Covers
|
// Node is warming up, but we expect it to become available soon. Covers
|
||||||
// the time span between the re-attach response being composed on the storage controller
|
// the time span between the re-attach response being composed on the storage controller
|
||||||
// and the first successful heartbeat after the processing of the re-attach response
|
// and the first successful heartbeat after the processing of the re-attach response
|
||||||
@@ -195,7 +216,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
|||||||
match val {
|
match val {
|
||||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||||
// the heartbeats.
|
// the heartbeats.
|
||||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
NodeAvailabilityWrapper::Active => {
|
||||||
|
NodeAvailability::Active(PageserverUtilization::full())
|
||||||
|
}
|
||||||
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
||||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -108,14 +108,41 @@ impl Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This function checks more extensively what keys we can take on the write path.
|
||||||
|
/// If a key beginning with 00 does not have a global/default tablespace OID, it
|
||||||
|
/// will be rejected on the write path.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
|
||||||
|
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||||
|
if !self.is_i128_representable() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if self.field1 == 0
|
||||||
|
&& !(self.field2 == GLOBALTABLESPACE_OID
|
||||||
|
|| self.field2 == DEFAULTTABLESPACE_OID
|
||||||
|
|| self.field2 == 0)
|
||||||
|
{
|
||||||
|
return false; // User defined tablespaces are not supported
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
|
||||||
|
/// checks if the key is i128 representable. Note that some keys can be successfully
|
||||||
|
/// ingested into the pageserver, but will cause errors on generating basebackup.
|
||||||
|
pub fn is_valid_key_on_write_path(&self) -> bool {
|
||||||
|
self.is_i128_representable()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_i128_representable(&self) -> bool {
|
||||||
|
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
|
||||||
|
}
|
||||||
|
|
||||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||||
pub fn to_i128(&self) -> i128 {
|
pub fn to_i128(&self) -> i128 {
|
||||||
assert!(
|
assert!(self.is_i128_representable(), "invalid key: {self}");
|
||||||
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
|
|
||||||
"invalid key: {self}",
|
|
||||||
);
|
|
||||||
(((self.field1 & 0x7F) as i128) << 120)
|
(((self.field1 & 0x7F) as i128) << 120)
|
||||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||||
| ((self.field3 as i128) << 72)
|
| ((self.field3 as i128) << 72)
|
||||||
|
|||||||
@@ -6,8 +6,9 @@ pub use utilization::PageserverUtilization;
|
|||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
|
fmt::Display,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU64, NonZeroUsize},
|
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
||||||
str::FromStr,
|
str::FromStr,
|
||||||
sync::atomic::AtomicUsize,
|
sync::atomic::AtomicUsize,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime},
|
||||||
@@ -61,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
|||||||
serde::Serialize,
|
serde::Serialize,
|
||||||
serde::Deserialize,
|
serde::Deserialize,
|
||||||
strum_macros::Display,
|
strum_macros::Display,
|
||||||
strum_macros::EnumVariantNames,
|
strum_macros::VariantNames,
|
||||||
strum_macros::AsRefStr,
|
strum_macros::AsRefStr,
|
||||||
strum_macros::IntoStaticStr,
|
strum_macros::IntoStaticStr,
|
||||||
)]
|
)]
|
||||||
@@ -304,8 +305,10 @@ pub struct TenantConfig {
|
|||||||
pub lsn_lease_length_for_ts: Option<String>,
|
pub lsn_lease_length_for_ts: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
/// The policy for the aux file storage.
|
||||||
/// tenant config. When the first aux file written, the policy will be persisted in the
|
///
|
||||||
|
/// It can be switched through `switch_aux_file_policy` tenant config.
|
||||||
|
/// When the first aux file written, the policy will be persisted in the
|
||||||
/// `index_part.json` file and has a limited migration path.
|
/// `index_part.json` file and has a limited migration path.
|
||||||
///
|
///
|
||||||
/// Currently, we only allow the following migration path:
|
/// Currently, we only allow the following migration path:
|
||||||
@@ -348,7 +351,7 @@ impl AuxFilePolicy {
|
|||||||
|
|
||||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||||
pub fn default_tenant_config() -> Self {
|
pub fn default_tenant_config() -> Self {
|
||||||
Self::V1
|
Self::V2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -435,7 +438,9 @@ pub enum CompactionAlgorithm {
|
|||||||
Tiered,
|
Tiered,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(
|
||||||
|
Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
|
||||||
|
)]
|
||||||
pub enum ImageCompressionAlgorithm {
|
pub enum ImageCompressionAlgorithm {
|
||||||
// Disabled for writes, support decompressing during read path
|
// Disabled for writes, support decompressing during read path
|
||||||
Disabled,
|
Disabled,
|
||||||
@@ -470,11 +475,33 @@ impl FromStr for ImageCompressionAlgorithm {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Display for ImageCompressionAlgorithm {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
|
||||||
|
ImageCompressionAlgorithm::Zstd { level } => {
|
||||||
|
if let Some(level) = level {
|
||||||
|
write!(f, "zstd({})", level)
|
||||||
|
} else {
|
||||||
|
write!(f, "zstd")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct CompactionAlgorithmSettings {
|
pub struct CompactionAlgorithmSettings {
|
||||||
pub kind: CompactionAlgorithm,
|
pub kind: CompactionAlgorithm,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||||
|
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||||
|
pub enum L0FlushConfig {
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
Direct { max_concurrency: NonZeroUsize },
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct EvictionPolicyLayerAccessThreshold {
|
pub struct EvictionPolicyLayerAccessThreshold {
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
@@ -486,12 +513,11 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
|||||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||||
pub struct ThrottleConfig {
|
pub struct ThrottleConfig {
|
||||||
pub task_kinds: Vec<String>, // TaskKind
|
pub task_kinds: Vec<String>, // TaskKind
|
||||||
pub initial: usize,
|
pub initial: u32,
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
pub refill_interval: Duration,
|
pub refill_interval: Duration,
|
||||||
pub refill_amount: NonZeroUsize,
|
pub refill_amount: NonZeroU32,
|
||||||
pub max: usize,
|
pub max: u32,
|
||||||
pub fair: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ThrottleConfig {
|
impl ThrottleConfig {
|
||||||
@@ -501,9 +527,8 @@ impl ThrottleConfig {
|
|||||||
// other values don't matter with emtpy `task_kinds`.
|
// other values don't matter with emtpy `task_kinds`.
|
||||||
initial: 0,
|
initial: 0,
|
||||||
refill_interval: Duration::from_millis(1),
|
refill_interval: Duration::from_millis(1),
|
||||||
refill_amount: NonZeroUsize::new(1).unwrap(),
|
refill_amount: NonZeroU32::new(1).unwrap(),
|
||||||
max: 1,
|
max: 1,
|
||||||
fair: true,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// The requests per second allowed by the given config.
|
/// The requests per second allowed by the given config.
|
||||||
@@ -721,8 +746,14 @@ pub struct TimelineInfo {
|
|||||||
|
|
||||||
pub walreceiver_status: String,
|
pub walreceiver_status: String,
|
||||||
|
|
||||||
|
// ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
|
||||||
|
// Backward compatibility: you will get a JSON not containing the newly-added field.
|
||||||
|
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
|
||||||
|
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
|
||||||
|
// read.
|
||||||
/// The last aux file policy being used on this timeline
|
/// The last aux file policy being used on this timeline
|
||||||
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
||||||
|
pub is_archived: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@@ -867,7 +898,9 @@ pub struct WalRedoManagerStatus {
|
|||||||
pub process: Option<WalRedoManagerProcessStatus>,
|
pub process: Option<WalRedoManagerProcessStatus>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
/// The progress of a secondary tenant.
|
||||||
|
///
|
||||||
|
/// It is mostly useful when doing a long running download: e.g. initiating
|
||||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||||
/// what's happening.
|
/// what's happening.
|
||||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||||
@@ -1062,7 +1095,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// In the V2 protocol version, a GetPage request contains two LSN values:
|
// A GetPage request contains two LSN values:
|
||||||
//
|
//
|
||||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||||
@@ -1075,7 +1108,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||||
// request without waiting for 'request_lsn' to arrive.
|
// request without waiting for 'request_lsn' to arrive.
|
||||||
//
|
//
|
||||||
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||||
// standby to request a page at a particular non-latest LSN, and also include the
|
// standby to request a page at a particular non-latest LSN, and also include the
|
||||||
@@ -1083,15 +1116,11 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
|
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||||
// difference in the responses between V1 and V2.
|
// difference in the responses between V1 and V2.
|
||||||
//
|
//
|
||||||
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
|
|
||||||
// maps the old format requests to the new format.
|
|
||||||
//
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
pub enum PagestreamProtocolVersion {
|
pub enum PagestreamProtocolVersion {
|
||||||
V1,
|
|
||||||
V2,
|
V2,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1230,36 +1259,17 @@ impl PagestreamFeMessage {
|
|||||||
bytes.into()
|
bytes.into()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse<R: std::io::Read>(
|
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||||
body: &mut R,
|
|
||||||
protocol_version: PagestreamProtocolVersion,
|
|
||||||
) -> anyhow::Result<PagestreamFeMessage> {
|
|
||||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||||
//
|
//
|
||||||
// TODO: consider using protobuf or serde bincode for less error prone
|
// TODO: consider using protobuf or serde bincode for less error prone
|
||||||
// serialization.
|
// serialization.
|
||||||
let msg_tag = body.read_u8()?;
|
let msg_tag = body.read_u8()?;
|
||||||
|
|
||||||
let (request_lsn, not_modified_since) = match protocol_version {
|
// these two fields are the same for every request type
|
||||||
PagestreamProtocolVersion::V2 => (
|
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
|
||||||
),
|
|
||||||
PagestreamProtocolVersion::V1 => {
|
|
||||||
// In the old protocol, each message starts with a boolean 'latest' flag,
|
|
||||||
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
|
|
||||||
// 'not_modified_since', used in the new protocol version.
|
|
||||||
let latest = body.read_u8()? != 0;
|
|
||||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
|
||||||
if latest {
|
|
||||||
(Lsn::MAX, request_lsn) // get latest version
|
|
||||||
} else {
|
|
||||||
(request_lsn, request_lsn) // get version at specified LSN
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// The rest of the messages are the same between V1 and V2
|
|
||||||
match msg_tag {
|
match msg_tag {
|
||||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||||
request_lsn,
|
request_lsn,
|
||||||
@@ -1467,9 +1477,7 @@ mod tests {
|
|||||||
];
|
];
|
||||||
for msg in messages {
|
for msg in messages {
|
||||||
let bytes = msg.serialize();
|
let bytes = msg.serialize();
|
||||||
let reconstructed =
|
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
|
|
||||||
.unwrap();
|
|
||||||
assert!(msg == reconstructed);
|
assert!(msg == reconstructed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1677,21 +1685,33 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_image_compression_algorithm_parsing() {
|
fn test_image_compression_algorithm_parsing() {
|
||||||
use ImageCompressionAlgorithm::*;
|
use ImageCompressionAlgorithm::*;
|
||||||
assert_eq!(
|
let cases = [
|
||||||
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
|
("disabled", Disabled),
|
||||||
Disabled
|
("zstd", Zstd { level: None }),
|
||||||
);
|
("zstd(18)", Zstd { level: Some(18) }),
|
||||||
assert_eq!(
|
("zstd(-3)", Zstd { level: Some(-3) }),
|
||||||
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
|
];
|
||||||
Zstd { level: None }
|
|
||||||
);
|
for (display, expected) in cases {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
|
ImageCompressionAlgorithm::from_str(display).unwrap(),
|
||||||
Zstd { level: Some(18) }
|
expected,
|
||||||
);
|
"parsing works"
|
||||||
assert_eq!(
|
);
|
||||||
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
|
assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");
|
||||||
Zstd { level: Some(-3) }
|
|
||||||
);
|
let ser = serde_json::to_string(&expected).expect("serialization");
|
||||||
|
assert_eq!(
|
||||||
|
serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
|
||||||
|
expected,
|
||||||
|
"serde roundtrip"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
serde_json::Value::String(display.to_string()),
|
||||||
|
serde_json::to_value(expected).unwrap(),
|
||||||
|
"Display is the serde serialization"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
|
|||||||
pub max_shard_count: u32,
|
pub max_shard_count: u32,
|
||||||
|
|
||||||
/// Cached result of [`Self::score`]
|
/// Cached result of [`Self::score`]
|
||||||
pub utilization_score: u64,
|
pub utilization_score: Option<u64>,
|
||||||
|
|
||||||
/// When was this snapshot captured, pageserver local time.
|
/// When was this snapshot captured, pageserver local time.
|
||||||
///
|
///
|
||||||
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
|
|||||||
Percent::new(0).unwrap()
|
Percent::new(0).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub type RawScore = u64;
|
||||||
|
|
||||||
impl PageserverUtilization {
|
impl PageserverUtilization {
|
||||||
const UTILIZATION_FULL: u64 = 1000000;
|
const UTILIZATION_FULL: u64 = 1000000;
|
||||||
|
|
||||||
@@ -62,7 +64,7 @@ impl PageserverUtilization {
|
|||||||
/// - Negative values are forbidden
|
/// - Negative values are forbidden
|
||||||
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
|
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
|
||||||
/// layer eviction.
|
/// layer eviction.
|
||||||
pub fn score(&self) -> u64 {
|
pub fn score(&self) -> RawScore {
|
||||||
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
|
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
|
||||||
* self.disk_usable_pct.get() as u64)
|
* self.disk_usable_pct.get() as u64)
|
||||||
/ 100;
|
/ 100;
|
||||||
@@ -74,8 +76,41 @@ impl PageserverUtilization {
|
|||||||
std::cmp::max(disk_utilization_score, shard_utilization_score)
|
std::cmp::max(disk_utilization_score, shard_utilization_score)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn refresh_score(&mut self) {
|
pub fn cached_score(&mut self) -> RawScore {
|
||||||
self.utilization_score = self.score();
|
match self.utilization_score {
|
||||||
|
None => {
|
||||||
|
let s = self.score();
|
||||||
|
self.utilization_score = Some(s);
|
||||||
|
s
|
||||||
|
}
|
||||||
|
Some(s) => s,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
|
||||||
|
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
|
||||||
|
///
|
||||||
|
/// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
|
||||||
|
/// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
|
||||||
|
pub fn is_overloaded(score: RawScore) -> bool {
|
||||||
|
// Why the factor of two? This is unscientific but reflects behavior of real systems:
|
||||||
|
// - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
|
||||||
|
// startup and housekeeping jobs nice and responsive. We can go to double this limit if needed
|
||||||
|
// until some more nodes are deployed.
|
||||||
|
// - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
|
||||||
|
// hold its biggest timeline fully on disk, which is tends to be an over estimate when
|
||||||
|
// some tenants are very idle and have dropped layers from disk. In practice going up to
|
||||||
|
// double is generally better than giving up and scheduling in a sub-optimal AZ.
|
||||||
|
score >= 2 * Self::UTILIZATION_FULL
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
|
||||||
|
if self.shard_count < shard_count {
|
||||||
|
self.shard_count = shard_count;
|
||||||
|
|
||||||
|
// Dirty cache: this will be calculated next time someone retrives the score
|
||||||
|
self.utilization_score = None;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A utilization structure that has a full utilization score: use this as a placeholder when
|
/// A utilization structure that has a full utilization score: use this as a placeholder when
|
||||||
@@ -88,7 +123,38 @@ impl PageserverUtilization {
|
|||||||
disk_usable_pct: Percent::new(100).unwrap(),
|
disk_usable_pct: Percent::new(100).unwrap(),
|
||||||
shard_count: 1,
|
shard_count: 1,
|
||||||
max_shard_count: 1,
|
max_shard_count: 1,
|
||||||
utilization_score: Self::UTILIZATION_FULL,
|
utilization_score: Some(Self::UTILIZATION_FULL),
|
||||||
|
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test helper
|
||||||
|
pub mod test_utilization {
|
||||||
|
use super::PageserverUtilization;
|
||||||
|
use std::time::SystemTime;
|
||||||
|
use utils::{
|
||||||
|
serde_percent::Percent,
|
||||||
|
serde_system_time::{self},
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parameters of the imaginary node used for test utilization instances
|
||||||
|
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
|
||||||
|
const TEST_SHARDS_MAX: u32 = 1000;
|
||||||
|
|
||||||
|
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
|
||||||
|
/// not abuse this function from non-test code.
|
||||||
|
///
|
||||||
|
/// Emulates a node with a 1000 shard limit and a 1TB disk.
|
||||||
|
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
|
||||||
|
PageserverUtilization {
|
||||||
|
disk_usage_bytes: disk_wanted_bytes,
|
||||||
|
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
|
||||||
|
disk_wanted_bytes,
|
||||||
|
disk_usable_pct: Percent::new(100).unwrap(),
|
||||||
|
shard_count,
|
||||||
|
max_shard_count: TEST_SHARDS_MAX,
|
||||||
|
utilization_score: None,
|
||||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -120,7 +186,7 @@ mod tests {
|
|||||||
disk_usage_bytes: u64::MAX,
|
disk_usage_bytes: u64::MAX,
|
||||||
free_space_bytes: 0,
|
free_space_bytes: 0,
|
||||||
disk_wanted_bytes: u64::MAX,
|
disk_wanted_bytes: u64::MAX,
|
||||||
utilization_score: 13,
|
utilization_score: Some(13),
|
||||||
disk_usable_pct: Percent::new(90).unwrap(),
|
disk_usable_pct: Percent::new(90).unwrap(),
|
||||||
shard_count: 100,
|
shard_count: 100,
|
||||||
max_shard_count: 200,
|
max_shard_count: 200,
|
||||||
|
|||||||
@@ -69,8 +69,10 @@ impl QueryError {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true if the given error is a normal consequence of a network issue,
|
/// Returns true if the given error is a normal consequence of a network issue,
|
||||||
/// or the client closing the connection. These errors can happen during normal
|
/// or the client closing the connection.
|
||||||
/// operations, and don't indicate a bug in our code.
|
///
|
||||||
|
/// These errors can happen during normal operations,
|
||||||
|
/// and don't indicate a bug in our code.
|
||||||
pub fn is_expected_io_error(e: &io::Error) -> bool {
|
pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||||
use io::ErrorKind::*;
|
use io::ErrorKind::*;
|
||||||
matches!(
|
matches!(
|
||||||
@@ -79,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
pub trait Handler<IO> {
|
pub trait Handler<IO> {
|
||||||
/// Handle single query.
|
/// Handle single query.
|
||||||
/// postgres_backend will issue ReadyForQuery after calling this (this
|
/// postgres_backend will issue ReadyForQuery after calling this (this
|
||||||
/// might be not what we want after CopyData streaming, but currently we don't
|
/// might be not what we want after CopyData streaming, but currently we don't
|
||||||
/// care). It will also flush out the output buffer.
|
/// care). It will also flush out the output buffer.
|
||||||
async fn process_query(
|
fn process_query(
|
||||||
&mut self,
|
&mut self,
|
||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
query_string: &str,
|
query_string: &str,
|
||||||
) -> Result<(), QueryError>;
|
) -> impl Future<Output = Result<(), QueryError>>;
|
||||||
|
|
||||||
/// Called on startup packet receival, allows to process params.
|
/// Called on startup packet receival, allows to process params.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
|||||||
|
|
||||||
struct TestHandler {}
|
struct TestHandler {}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
|
impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
|
||||||
// return single col 'hey' for any query
|
// return single col 'hey' for any query
|
||||||
async fn process_query(
|
async fn process_query(
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ use std::fmt;
|
|||||||
use url::Host;
|
use url::Host;
|
||||||
|
|
||||||
/// Parses a string of format either `host:port` or `host` into a corresponding pair.
|
/// Parses a string of format either `host:port` or `host` into a corresponding pair.
|
||||||
|
///
|
||||||
/// The `host` part should be a correct `url::Host`, while `port` (if present) should be
|
/// The `host` part should be a correct `url::Host`, while `port` (if present) should be
|
||||||
/// a valid decimal u16 of digits only.
|
/// a valid decimal u16 of digits only.
|
||||||
pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
|
pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
|
|||||||
fn include_file(&self, filename: &str) {
|
fn include_file(&self, filename: &str) {
|
||||||
// This does the equivalent of passing bindgen::CargoCallbacks
|
// This does the equivalent of passing bindgen::CargoCallbacks
|
||||||
// to the builder .parse_callbacks() method.
|
// to the builder .parse_callbacks() method.
|
||||||
let cargo_callbacks = bindgen::CargoCallbacks;
|
let cargo_callbacks = bindgen::CargoCallbacks::new();
|
||||||
cargo_callbacks.include_file(filename)
|
cargo_callbacks.include_file(filename)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.allowlist_type("XLogPageHeaderData")
|
.allowlist_type("XLogPageHeaderData")
|
||||||
.allowlist_type("XLogLongPageHeaderData")
|
.allowlist_type("XLogLongPageHeaderData")
|
||||||
.allowlist_var("XLOG_PAGE_MAGIC")
|
.allowlist_var("XLOG_PAGE_MAGIC")
|
||||||
|
.allowlist_var("PG_MAJORVERSION_NUM")
|
||||||
.allowlist_var("PG_CONTROL_FILE_SIZE")
|
.allowlist_var("PG_CONTROL_FILE_SIZE")
|
||||||
.allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
.allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||||
.allowlist_type("PageHeaderData")
|
.allowlist_type("PageHeaderData")
|
||||||
|
|||||||
@@ -44,6 +44,9 @@ macro_rules! postgres_ffi {
|
|||||||
// Re-export some symbols from bindings
|
// Re-export some symbols from bindings
|
||||||
pub use bindings::DBState_DB_SHUTDOWNED;
|
pub use bindings::DBState_DB_SHUTDOWNED;
|
||||||
pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
|
pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
|
||||||
|
|
||||||
|
pub const ZERO_CHECKPOINT: bytes::Bytes =
|
||||||
|
bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -106,6 +109,107 @@ macro_rules! dispatch_pgversion {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[macro_export]
|
||||||
|
macro_rules! enum_pgversion_dispatch {
|
||||||
|
($name:expr, $typ:ident, $bind:ident, $code:block) => {
|
||||||
|
enum_pgversion_dispatch!(
|
||||||
|
name = $name,
|
||||||
|
bind = $bind,
|
||||||
|
typ = $typ,
|
||||||
|
code = $code,
|
||||||
|
pgversions = [
|
||||||
|
V14 : v14,
|
||||||
|
V15 : v15,
|
||||||
|
V16 : v16,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
};
|
||||||
|
(name = $name:expr,
|
||||||
|
bind = $bind:ident,
|
||||||
|
typ = $typ:ident,
|
||||||
|
code = $code:block,
|
||||||
|
pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
|
||||||
|
match $name {
|
||||||
|
$(
|
||||||
|
self::$typ::$variant($bind) => {
|
||||||
|
use $crate::$md as pgv;
|
||||||
|
$code
|
||||||
|
}
|
||||||
|
),+,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
#[macro_export]
|
||||||
|
macro_rules! enum_pgversion {
|
||||||
|
{$name:ident, pgv :: $t:ident} => {
|
||||||
|
enum_pgversion!{
|
||||||
|
name = $name,
|
||||||
|
typ = $t,
|
||||||
|
pgversions = [
|
||||||
|
V14 : v14,
|
||||||
|
V15 : v15,
|
||||||
|
V16 : v16,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
{$name:ident, pgv :: $p:ident :: $t:ident} => {
|
||||||
|
enum_pgversion!{
|
||||||
|
name = $name,
|
||||||
|
path = $p,
|
||||||
|
typ = $t,
|
||||||
|
pgversions = [
|
||||||
|
V14 : v14,
|
||||||
|
V15 : v15,
|
||||||
|
V16 : v16,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
{name = $name:ident,
|
||||||
|
typ = $t:ident,
|
||||||
|
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
|
||||||
|
pub enum $name {
|
||||||
|
$($variant ( $crate::$md::$t )),+
|
||||||
|
}
|
||||||
|
impl self::$name {
|
||||||
|
pub fn pg_version(&self) -> u32 {
|
||||||
|
enum_pgversion_dispatch!(self, $name, _ign, {
|
||||||
|
pgv::bindings::PG_MAJORVERSION_NUM
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$(
|
||||||
|
impl Into<self::$name> for $crate::$md::$t {
|
||||||
|
fn into(self) -> self::$name {
|
||||||
|
self::$name::$variant (self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)+
|
||||||
|
};
|
||||||
|
{name = $name:ident,
|
||||||
|
path = $p:ident,
|
||||||
|
typ = $t:ident,
|
||||||
|
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
|
||||||
|
pub enum $name {
|
||||||
|
$($variant ($crate::$md::$p::$t)),+
|
||||||
|
}
|
||||||
|
impl $name {
|
||||||
|
pub fn pg_version(&self) -> u32 {
|
||||||
|
enum_pgversion_dispatch!(self, $name, _ign, {
|
||||||
|
pgv::bindings::PG_MAJORVERSION_NUM
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$(
|
||||||
|
impl Into<$name> for $crate::$md::$p::$t {
|
||||||
|
fn into(self) -> $name {
|
||||||
|
$name::$variant (self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)+
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
pub mod pg_constants;
|
pub mod pg_constants;
|
||||||
pub mod relfile_utils;
|
pub mod relfile_utils;
|
||||||
|
|
||||||
@@ -136,9 +240,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
|||||||
|
|
||||||
// Export some version independent functions that are used outside of this mod
|
// Export some version independent functions that are used outside of this mod
|
||||||
pub use v14::xlog_utils::encode_logical_message;
|
pub use v14::xlog_utils::encode_logical_message;
|
||||||
pub use v14::xlog_utils::from_pg_timestamp;
|
|
||||||
pub use v14::xlog_utils::get_current_timestamp;
|
pub use v14::xlog_utils::get_current_timestamp;
|
||||||
pub use v14::xlog_utils::to_pg_timestamp;
|
pub use v14::xlog_utils::to_pg_timestamp;
|
||||||
|
pub use v14::xlog_utils::try_from_pg_timestamp;
|
||||||
pub use v14::xlog_utils::XLogFileName;
|
pub use v14::xlog_utils::XLogFileName;
|
||||||
|
|
||||||
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||||
|
|||||||
@@ -135,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz {
|
|||||||
mod timestamp_conversions {
|
mod timestamp_conversions {
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
||||||
@@ -154,18 +156,18 @@ mod timestamp_conversions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
|
pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
|
||||||
let time: u64 = time
|
let time: u64 = time
|
||||||
.try_into()
|
.try_into()
|
||||||
.expect("timestamp before millenium (postgres epoch)");
|
.context("timestamp before millenium (postgres epoch)")?;
|
||||||
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
||||||
SystemTime::UNIX_EPOCH
|
SystemTime::UNIX_EPOCH
|
||||||
.checked_add(Duration::from_micros(since_unix_epoch))
|
.checked_add(Duration::from_micros(since_unix_epoch))
|
||||||
.expect("SystemTime overflow")
|
.context("SystemTime overflow")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
|
pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
|
||||||
|
|
||||||
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
||||||
// start_lsn must point to some previously known record boundary (beginning of
|
// start_lsn must point to some previously known record boundary (beginning of
|
||||||
@@ -545,14 +547,14 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_ts_conversion() {
|
fn test_ts_conversion() {
|
||||||
let now = SystemTime::now();
|
let now = SystemTime::now();
|
||||||
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
|
let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
|
||||||
|
|
||||||
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||||
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||||
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
||||||
|
|
||||||
let now_pg = get_current_timestamp();
|
let now_pg = get_current_timestamp();
|
||||||
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
|
let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
|
||||||
|
|
||||||
assert_eq!(now_pg, round_trip_pg);
|
assert_eq!(now_pg, round_trip_pg);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -185,7 +185,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
|
fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
|
||||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
|
||||||
RemoteStorageConfig::from_toml(toml.as_item())
|
RemoteStorageConfig::from_toml(toml.as_item())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -235,6 +235,31 @@ timeout = '5s'";
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_storage_class_serde_roundtrip() {
|
||||||
|
let classes = [
|
||||||
|
None,
|
||||||
|
Some(StorageClass::Standard),
|
||||||
|
Some(StorageClass::IntelligentTiering),
|
||||||
|
];
|
||||||
|
for class in classes {
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
struct Wrapper {
|
||||||
|
#[serde(
|
||||||
|
deserialize_with = "deserialize_storage_class",
|
||||||
|
serialize_with = "serialize_storage_class"
|
||||||
|
)]
|
||||||
|
class: Option<StorageClass>,
|
||||||
|
}
|
||||||
|
let wrapped = Wrapper {
|
||||||
|
class: class.clone(),
|
||||||
|
};
|
||||||
|
let serialized = serde_json::to_string(&wrapped).unwrap();
|
||||||
|
let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap();
|
||||||
|
assert_eq!(class, deserialized.class);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_azure_parsing() {
|
fn test_azure_parsing() {
|
||||||
let toml = "\
|
let toml = "\
|
||||||
|
|||||||
@@ -45,6 +45,8 @@ pub use azure_core::Etag;
|
|||||||
|
|
||||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||||
|
|
||||||
|
/// Default concurrency limit for S3 operations
|
||||||
|
///
|
||||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||||
/// ~200 RPS for IAM services
|
/// ~200 RPS for IAM services
|
||||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||||
@@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
|||||||
) -> Result<(), TimeTravelError>;
|
) -> Result<(), TimeTravelError>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
/// Data part of an ongoing [`Download`].
|
||||||
|
///
|
||||||
|
/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
|
||||||
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
|
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
|
||||||
/// with `tokio::io::copy_buf`.
|
/// with `tokio::io::copy_buf`.
|
||||||
// This has 'static because safekeepers do not use cancellation tokens (yet)
|
// This has 'static because safekeepers do not use cancellation tokens (yet)
|
||||||
|
|||||||
@@ -60,3 +60,16 @@ pub struct TimelineCopyRequest {
|
|||||||
pub target_timeline_id: TimelineId,
|
pub target_timeline_id: TimelineId,
|
||||||
pub until_lsn: Lsn,
|
pub until_lsn: Lsn,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
|
pub struct TimelineTermBumpRequest {
|
||||||
|
/// bump to
|
||||||
|
pub term: Option<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
|
pub struct TimelineTermBumpResponse {
|
||||||
|
// before the request
|
||||||
|
pub previous_term: u64,
|
||||||
|
pub current_term: u64,
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,9 +5,10 @@
|
|||||||
mod calculation;
|
mod calculation;
|
||||||
pub mod svg;
|
pub mod svg;
|
||||||
|
|
||||||
/// StorageModel is the input to the synthetic size calculation. It represents
|
/// StorageModel is the input to the synthetic size calculation.
|
||||||
/// a tree of timelines, with just the information that's needed for the
|
///
|
||||||
/// calculation. This doesn't track timeline names or where each timeline
|
/// It represents a tree of timelines, with just the information that's needed
|
||||||
|
/// for the calculation. This doesn't track timeline names or where each timeline
|
||||||
/// begins and ends, for example. Instead, it consists of "points of interest"
|
/// begins and ends, for example. Instead, it consists of "points of interest"
|
||||||
/// on the timelines. A point of interest could be the timeline start or end point,
|
/// on the timelines. A point of interest could be the timeline start or end point,
|
||||||
/// the oldest point on a timeline that needs to be retained because of PITR
|
/// the oldest point on a timeline that needs to be retained because of PITR
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ testing = ["fail/failpoints"]
|
|||||||
arc-swap.workspace = true
|
arc-swap.workspace = true
|
||||||
sentry.workspace = true
|
sentry.workspace = true
|
||||||
async-compression.workspace = true
|
async-compression.workspace = true
|
||||||
async-trait.workspace = true
|
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
@@ -26,7 +25,6 @@ hyper = { workspace = true, features = ["full"] }
|
|||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
futures = { workspace = true}
|
futures = { workspace = true}
|
||||||
jsonwebtoken.workspace = true
|
jsonwebtoken.workspace = true
|
||||||
leaky-bucket.workspace = true
|
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
pin-project-lite.workspace = true
|
pin-project-lite.workspace = true
|
||||||
|
|||||||
@@ -5,8 +5,10 @@ use std::{
|
|||||||
|
|
||||||
use metrics::IntCounter;
|
use metrics::IntCounter;
|
||||||
|
|
||||||
/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
|
/// Circuit breakers are for operations that are expensive and fallible.
|
||||||
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
|
///
|
||||||
|
/// If a circuit breaker fails repeatedly, we will stop attempting it for some
|
||||||
|
/// period of time, to avoid denial-of-service from retries, and
|
||||||
/// to mitigate the log spam from repeated failures.
|
/// to mitigate the log spam from repeated failures.
|
||||||
pub struct CircuitBreaker {
|
pub struct CircuitBreaker {
|
||||||
/// An identifier that enables us to log useful errors when a circuit is broken
|
/// An identifier that enables us to log useful errors when a circuit is broken
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use std::os::fd::AsRawFd;
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
borrow::Cow,
|
||||||
fs::{self, File},
|
fs::{self, File},
|
||||||
@@ -203,6 +204,27 @@ pub fn overwrite(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Syncs the filesystem for the given file descriptor.
|
||||||
|
#[cfg_attr(target_os = "macos", allow(unused_variables))]
|
||||||
|
pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
|
||||||
|
// Linux guarantees durability for syncfs.
|
||||||
|
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
{
|
||||||
|
use anyhow::Context;
|
||||||
|
nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
|
||||||
|
}
|
||||||
|
#[cfg(target_os = "macos")]
|
||||||
|
{
|
||||||
|
// macOS is not a production platform for Neon, don't even bother.
|
||||||
|
}
|
||||||
|
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||||
|
{
|
||||||
|
compile_error!("Unsupported OS");
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
|
|||||||
@@ -249,8 +249,10 @@ macro_rules! id_newtype {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Neon timeline IDs are different from PostgreSQL timeline
|
/// Neon timeline ID.
|
||||||
/// IDs. They serve a similar purpose though: they differentiate
|
///
|
||||||
|
/// They are different from PostgreSQL timeline
|
||||||
|
/// IDs, but serve a similar purpose: they differentiate
|
||||||
/// between different "histories" of the same cluster. However,
|
/// between different "histories" of the same cluster. However,
|
||||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||||
/// 32-bits wide, and they must be in ascending order in any given
|
/// 32-bits wide, and they must be in ascending order in any given
|
||||||
|
|||||||
280
libs/utils/src/leaky_bucket.rs
Normal file
280
libs/utils/src/leaky_bucket.rs
Normal file
@@ -0,0 +1,280 @@
|
|||||||
|
//! This module implements the Generic Cell Rate Algorithm for a simplified
|
||||||
|
//! version of the Leaky Bucket rate limiting system.
|
||||||
|
//!
|
||||||
|
//! # Leaky Bucket
|
||||||
|
//!
|
||||||
|
//! If the bucket is full, no new requests are allowed and are throttled/errored.
|
||||||
|
//! If the bucket is partially full/empty, new requests are added to the bucket in
|
||||||
|
//! terms of "tokens".
|
||||||
|
//!
|
||||||
|
//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
|
||||||
|
//!
|
||||||
|
//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
|
||||||
|
//!
|
||||||
|
//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
|
||||||
|
//!
|
||||||
|
//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
|
||||||
|
//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
|
||||||
|
//!
|
||||||
|
//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
|
||||||
|
//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
|
||||||
|
//!
|
||||||
|
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
|
||||||
|
|
||||||
|
use std::{sync::Mutex, time::Duration};
|
||||||
|
|
||||||
|
use tokio::{sync::Notify, time::Instant};
|
||||||
|
|
||||||
|
pub struct LeakyBucketConfig {
|
||||||
|
/// This is the "time cost" of a single request unit.
|
||||||
|
/// Should loosely represent how long it takes to handle a request unit in active resource time.
|
||||||
|
/// Loosely speaking this is the inverse of the steady-rate requests-per-second
|
||||||
|
pub cost: Duration,
|
||||||
|
|
||||||
|
/// total size of the bucket
|
||||||
|
pub bucket_width: Duration,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LeakyBucketConfig {
|
||||||
|
pub fn new(rps: f64, bucket_size: f64) -> Self {
|
||||||
|
let cost = Duration::from_secs_f64(rps.recip());
|
||||||
|
let bucket_width = cost.mul_f64(bucket_size);
|
||||||
|
Self { cost, bucket_width }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct LeakyBucketState {
|
||||||
|
/// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
|
||||||
|
///
|
||||||
|
/// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
|
||||||
|
/// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
|
||||||
|
/// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
|
||||||
|
/// Draining the bucket will happen naturally as `now` moves forward.
|
||||||
|
///
|
||||||
|
/// Let `n` be some "time cost" for the request,
|
||||||
|
/// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
|
||||||
|
/// If now is within the `bucket window + n`, we are within time budget.
|
||||||
|
/// If now is before the `bucket window + n`, we have run out of budget.
|
||||||
|
///
|
||||||
|
/// This is inspired by the generic cell rate algorithm (GCRA) and works
|
||||||
|
/// exactly the same as a leaky-bucket.
|
||||||
|
pub empty_at: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LeakyBucketState {
|
||||||
|
pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
|
||||||
|
LeakyBucketState {
|
||||||
|
empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn bucket_is_empty(&self, now: Instant) -> bool {
|
||||||
|
// if self.end is after now, the bucket is not empty
|
||||||
|
self.empty_at <= now
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Immediately adds tokens to the bucket, if there is space.
|
||||||
|
///
|
||||||
|
/// In a scenario where you are waiting for available rate,
|
||||||
|
/// rather than just erroring immediately, `started` corresponds to when this waiting started.
|
||||||
|
///
|
||||||
|
/// `n` is the number of tokens that will be filled in the bucket.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
///
|
||||||
|
/// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
|
||||||
|
/// there will be space again.
|
||||||
|
pub fn add_tokens(
|
||||||
|
&mut self,
|
||||||
|
config: &LeakyBucketConfig,
|
||||||
|
started: Instant,
|
||||||
|
n: f64,
|
||||||
|
) -> Result<(), Instant> {
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
// invariant: started <= now
|
||||||
|
debug_assert!(started <= now);
|
||||||
|
|
||||||
|
// If the bucket was empty when we started our search,
|
||||||
|
// we should update the `empty_at` value accordingly.
|
||||||
|
// this prevents us from having negative tokens in the bucket.
|
||||||
|
let mut empty_at = self.empty_at;
|
||||||
|
if empty_at < started {
|
||||||
|
empty_at = started;
|
||||||
|
}
|
||||||
|
|
||||||
|
let n = config.cost.mul_f64(n);
|
||||||
|
let new_empty_at = empty_at + n;
|
||||||
|
let allow_at = new_empty_at.checked_sub(config.bucket_width);
|
||||||
|
|
||||||
|
// empty_at
|
||||||
|
// allow_at | new_empty_at
|
||||||
|
// / | /
|
||||||
|
// -------o-[---------o-|--]---------
|
||||||
|
// now1 ^ now2 ^
|
||||||
|
//
|
||||||
|
// at now1, the bucket would be completely filled if we add n tokens.
|
||||||
|
// at now2, the bucket would be partially filled if we add n tokens.
|
||||||
|
|
||||||
|
match allow_at {
|
||||||
|
Some(allow_at) if now < allow_at => Err(allow_at),
|
||||||
|
_ => {
|
||||||
|
self.empty_at = new_empty_at;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RateLimiter {
|
||||||
|
pub config: LeakyBucketConfig,
|
||||||
|
pub state: Mutex<LeakyBucketState>,
|
||||||
|
/// a queue to provide this fair ordering.
|
||||||
|
pub queue: Notify,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Requeue<'a>(&'a Notify);
|
||||||
|
|
||||||
|
impl Drop for Requeue<'_> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.0.notify_one();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RateLimiter {
|
||||||
|
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
|
||||||
|
RateLimiter {
|
||||||
|
state: Mutex::new(LeakyBucketState::with_initial_tokens(
|
||||||
|
&config,
|
||||||
|
initial_tokens,
|
||||||
|
)),
|
||||||
|
config,
|
||||||
|
queue: {
|
||||||
|
let queue = Notify::new();
|
||||||
|
queue.notify_one();
|
||||||
|
queue
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn steady_rps(&self) -> f64 {
|
||||||
|
self.config.cost.as_secs_f64().recip()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// returns true if we did throttle
|
||||||
|
pub async fn acquire(&self, count: usize) -> bool {
|
||||||
|
let mut throttled = false;
|
||||||
|
|
||||||
|
let start = tokio::time::Instant::now();
|
||||||
|
|
||||||
|
// wait until we are the first in the queue
|
||||||
|
let mut notified = std::pin::pin!(self.queue.notified());
|
||||||
|
if !notified.as_mut().enable() {
|
||||||
|
throttled = true;
|
||||||
|
notified.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// notify the next waiter in the queue when we are done.
|
||||||
|
let _guard = Requeue(&self.queue);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let res = self
|
||||||
|
.state
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.add_tokens(&self.config, start, count as f64);
|
||||||
|
match res {
|
||||||
|
Ok(()) => return throttled,
|
||||||
|
Err(ready_at) => {
|
||||||
|
throttled = true;
|
||||||
|
tokio::time::sleep_until(ready_at).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use tokio::time::Instant;
|
||||||
|
|
||||||
|
use super::{LeakyBucketConfig, LeakyBucketState};
|
||||||
|
|
||||||
|
#[tokio::test(start_paused = true)]
|
||||||
|
async fn check() {
|
||||||
|
let config = LeakyBucketConfig {
|
||||||
|
// average 100rps
|
||||||
|
cost: Duration::from_millis(10),
|
||||||
|
// burst up to 100 requests
|
||||||
|
bucket_width: Duration::from_millis(1000),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut state = LeakyBucketState {
|
||||||
|
empty_at: Instant::now(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// supports burst
|
||||||
|
{
|
||||||
|
// should work for 100 requests this instant
|
||||||
|
for _ in 0..100 {
|
||||||
|
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
||||||
|
}
|
||||||
|
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
||||||
|
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
||||||
|
}
|
||||||
|
|
||||||
|
// doesn't overfill
|
||||||
|
{
|
||||||
|
// after 1s we should have an empty bucket again.
|
||||||
|
tokio::time::advance(Duration::from_secs(1)).await;
|
||||||
|
assert!(state.bucket_is_empty(Instant::now()));
|
||||||
|
|
||||||
|
// after 1s more, we should not over count the tokens and allow more than 200 requests.
|
||||||
|
tokio::time::advance(Duration::from_secs(1)).await;
|
||||||
|
for _ in 0..100 {
|
||||||
|
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
||||||
|
}
|
||||||
|
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
||||||
|
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
||||||
|
}
|
||||||
|
|
||||||
|
// supports sustained rate over a long period
|
||||||
|
{
|
||||||
|
tokio::time::advance(Duration::from_secs(1)).await;
|
||||||
|
|
||||||
|
// should sustain 100rps
|
||||||
|
for _ in 0..2000 {
|
||||||
|
tokio::time::advance(Duration::from_millis(10)).await;
|
||||||
|
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// supports requesting more tokens than can be stored in the bucket
|
||||||
|
// we just wait a little bit longer upfront.
|
||||||
|
{
|
||||||
|
// start the bucket completely empty
|
||||||
|
tokio::time::advance(Duration::from_secs(5)).await;
|
||||||
|
assert!(state.bucket_is_empty(Instant::now()));
|
||||||
|
|
||||||
|
// requesting 200 tokens of space should take 200*cost = 2s
|
||||||
|
// but we already have 1s available, so we wait 1s from start.
|
||||||
|
let start = Instant::now();
|
||||||
|
|
||||||
|
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
|
||||||
|
assert_eq!(ready - Instant::now(), Duration::from_secs(1));
|
||||||
|
|
||||||
|
tokio::time::advance(Duration::from_millis(500)).await;
|
||||||
|
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
|
||||||
|
assert_eq!(ready - Instant::now(), Duration::from_millis(500));
|
||||||
|
|
||||||
|
tokio::time::advance(Duration::from_millis(500)).await;
|
||||||
|
state.add_tokens(&config, start, 200.0).unwrap();
|
||||||
|
|
||||||
|
// bucket should be completely full now
|
||||||
|
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
||||||
|
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -71,6 +71,7 @@ pub mod postgres_client;
|
|||||||
|
|
||||||
pub mod tracing_span_assert;
|
pub mod tracing_span_assert;
|
||||||
|
|
||||||
|
pub mod leaky_bucket;
|
||||||
pub mod rate_limit;
|
pub mod rate_limit;
|
||||||
|
|
||||||
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
||||||
|
|||||||
@@ -100,7 +100,9 @@ pub enum LockFileRead {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
|
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
|
||||||
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
|
/// inspect its content.
|
||||||
|
///
|
||||||
|
/// It is not an `Err(...)` if the file does not exist or is already locked.
|
||||||
/// Check the [`LockFileRead`] variants for details.
|
/// Check the [`LockFileRead`] variants for details.
|
||||||
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
|
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
|
||||||
let res = fs::OpenOptions::new().read(true).open(path);
|
let res = fs::OpenOptions::new().read(true).open(path);
|
||||||
|
|||||||
@@ -3,9 +3,9 @@ use std::str::FromStr;
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use metrics::{IntCounter, IntCounterVec};
|
use metrics::{IntCounter, IntCounterVec};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use strum_macros::{EnumString, EnumVariantNames};
|
use strum_macros::{EnumString, VariantNames};
|
||||||
|
|
||||||
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||||
#[strum(serialize_all = "snake_case")]
|
#[strum(serialize_all = "snake_case")]
|
||||||
pub enum LogFormat {
|
pub enum LogFormat {
|
||||||
Plain,
|
Plain,
|
||||||
@@ -188,7 +188,7 @@ impl Drop for TracingPanicHookGuard {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Named symbol for our panic hook, which logs the panic.
|
/// Named symbol for our panic hook, which logs the panic.
|
||||||
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
|
fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
|
||||||
// following rust 1.66.1 std implementation:
|
// following rust 1.66.1 std implementation:
|
||||||
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
|
||||||
let location = info.location();
|
let location = info.location();
|
||||||
@@ -274,6 +274,14 @@ impl From<String> for SecretString {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl FromStr for SecretString {
|
||||||
|
type Err = std::convert::Infallible;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
Ok(Self(s.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for SecretString {
|
impl std::fmt::Debug for SecretString {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
write!(f, "[SECRET]")
|
write!(f, "[SECRET]")
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ use tracing::{trace, warn};
|
|||||||
use crate::lsn::Lsn;
|
use crate::lsn::Lsn;
|
||||||
|
|
||||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||||
|
///
|
||||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||||
/// Standby status update / Hot standby feedback messages.
|
/// Standby status update / Hot standby feedback messages.
|
||||||
|
|||||||
@@ -65,6 +65,8 @@ impl<T> Poison<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Armed pointer to a [`Poison`].
|
||||||
|
///
|
||||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
||||||
/// Once modifications are done, use [`Self::disarm`].
|
/// Once modifications are done, use [`Self::disarm`].
|
||||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
||||||
|
|||||||
@@ -5,6 +5,15 @@ use std::time::{Duration, Instant};
|
|||||||
pub struct RateLimit {
|
pub struct RateLimit {
|
||||||
last: Option<Instant>,
|
last: Option<Instant>,
|
||||||
interval: Duration,
|
interval: Duration,
|
||||||
|
dropped: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RateLimitStats(u64);
|
||||||
|
|
||||||
|
impl std::fmt::Display for RateLimitStats {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
write!(f, "{} dropped calls", self.0)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RateLimit {
|
impl RateLimit {
|
||||||
@@ -12,20 +21,27 @@ impl RateLimit {
|
|||||||
Self {
|
Self {
|
||||||
last: None,
|
last: None,
|
||||||
interval,
|
interval,
|
||||||
|
dropped: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Call `f` if the rate limit allows.
|
/// Call `f` if the rate limit allows.
|
||||||
/// Don't call it otherwise.
|
/// Don't call it otherwise.
|
||||||
pub fn call<F: FnOnce()>(&mut self, f: F) {
|
pub fn call<F: FnOnce()>(&mut self, f: F) {
|
||||||
|
self.call2(|_| f())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
match self.last {
|
match self.last {
|
||||||
Some(last) if now - last <= self.interval => {
|
Some(last) if now - last <= self.interval => {
|
||||||
// ratelimit
|
// ratelimit
|
||||||
|
self.dropped += 1;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
self.last = Some(now);
|
self.last = Some(now);
|
||||||
f();
|
f(RateLimitStats(self.dropped));
|
||||||
|
self.dropped = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8);
|
|||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardCount(pub u8);
|
pub struct ShardCount(pub u8);
|
||||||
|
|
||||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
/// Combination of ShardNumber and ShardCount.
|
||||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
///
|
||||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
/// For use within the context of a particular tenant, when we need to know which shard we're
|
||||||
/// the fully qualified TenantShardId.
|
/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
|
||||||
|
/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
pub struct ShardIndex {
|
pub struct ShardIndex {
|
||||||
pub shard_number: ShardNumber,
|
pub shard_number: ShardNumber,
|
||||||
|
|||||||
@@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard};
|
|||||||
|
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
|
|
||||||
///
|
|
||||||
/// Rcu allows multiple readers to read and hold onto a value without blocking
|
/// Rcu allows multiple readers to read and hold onto a value without blocking
|
||||||
/// (for very long). Storing to the Rcu updates the value, making new readers
|
/// (for very long).
|
||||||
/// immediately see the new value, but it also waits for all current readers to
|
|
||||||
/// finish.
|
|
||||||
///
|
///
|
||||||
|
/// Storing to the Rcu updates the value, making new readers immediately see
|
||||||
|
/// the new value, but it also waits for all current readers to finish.
|
||||||
pub struct Rcu<V> {
|
pub struct Rcu<V> {
|
||||||
inner: RwLock<RcuInner<V>>,
|
inner: RwLock<RcuInner<V>>,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,9 @@ use std::sync::{
|
|||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
|
||||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||||
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
|
/// `SemaphorePermit`.
|
||||||
|
///
|
||||||
|
/// Allows use of `take` which does not require holding an outer mutex guard
|
||||||
/// for the duration of initialization.
|
/// for the duration of initialization.
|
||||||
///
|
///
|
||||||
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
|
|||||||
where
|
where
|
||||||
T: serde::de::DeserializeOwned,
|
T: serde::de::DeserializeOwned,
|
||||||
{
|
{
|
||||||
let document: toml_edit::Document = match item {
|
let document: toml_edit::DocumentMut = match item {
|
||||||
toml_edit::Item::Table(toml) => toml.clone().into(),
|
toml_edit::Item::Table(toml) => toml.clone().into(),
|
||||||
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
|
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
|
||||||
toml.clone().into_table().into()
|
toml.clone().into_table().into()
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ pub enum VecMapOrdering {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Ordered map datastructure implemented in a Vec.
|
/// Ordered map datastructure implemented in a Vec.
|
||||||
|
///
|
||||||
/// Append only - can only add keys that are larger than the
|
/// Append only - can only add keys that are larger than the
|
||||||
/// current max key.
|
/// current max key.
|
||||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||||
|
|||||||
@@ -6,9 +6,10 @@ pub enum YieldingLoopError {
|
|||||||
Cancelled,
|
Cancelled,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
|
/// Helper for long synchronous loops, e.g. over all tenants in the system.
|
||||||
/// yields to avoid blocking the executor, and after resuming checks the provided
|
///
|
||||||
/// cancellation token to drop out promptly on shutdown.
|
/// Periodically yields to avoid blocking the executor, and after resuming
|
||||||
|
/// checks the provided cancellation token to drop out promptly on shutdown.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub async fn yielding_loop<I, T, F>(
|
pub async fn yielding_loop<I, T, F>(
|
||||||
interval: usize,
|
interval: usize,
|
||||||
@@ -23,7 +24,7 @@ where
|
|||||||
for (i, item) in iter.enumerate() {
|
for (i, item) in iter.enumerate() {
|
||||||
visitor(item);
|
visitor(item);
|
||||||
|
|
||||||
if i + 1 % interval == 0 {
|
if (i + 1) % interval == 0 {
|
||||||
tokio::task::yield_now().await;
|
tokio::task::yield_now().await;
|
||||||
if cancel.is_cancelled() {
|
if cancel.is_cancelled() {
|
||||||
return Err(YieldingLoopError::Cancelled);
|
return Err(YieldingLoopError::Cancelled);
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
use std::{env, path::PathBuf, process::Command};
|
use std::{env, path::PathBuf, process::Command};
|
||||||
|
|
||||||
use anyhow::{anyhow, Context};
|
use anyhow::{anyhow, Context};
|
||||||
use bindgen::CargoCallbacks;
|
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
fn main() -> anyhow::Result<()> {
|
||||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||||
@@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let unwind_abi_functions = [
|
||||||
|
"log_internal",
|
||||||
|
"recovery_download",
|
||||||
|
"start_streaming",
|
||||||
|
"finish_sync_safekeepers",
|
||||||
|
"wait_event_set",
|
||||||
|
"WalProposerStart",
|
||||||
|
];
|
||||||
|
|
||||||
// The bindgen::Builder is the main entry point
|
// The bindgen::Builder is the main entry point
|
||||||
// to bindgen, and lets you build up options for
|
// to bindgen, and lets you build up options for
|
||||||
// the resulting bindings.
|
// the resulting bindings.
|
||||||
let bindings = bindgen::Builder::default()
|
let mut builder = bindgen::Builder::default()
|
||||||
// The input header we would like to generate
|
// The input header we would like to generate
|
||||||
// bindings for.
|
// bindings for.
|
||||||
.header("bindgen_deps.h")
|
.header("bindgen_deps.h")
|
||||||
// Tell cargo to invalidate the built crate whenever any of the
|
// Tell cargo to invalidate the built crate whenever any of the
|
||||||
// included header files changed.
|
// included header files changed.
|
||||||
.parse_callbacks(Box::new(CargoCallbacks))
|
.parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
|
||||||
.allowlist_type("WalProposer")
|
.allowlist_type("WalProposer")
|
||||||
.allowlist_type("WalProposerConfig")
|
.allowlist_type("WalProposerConfig")
|
||||||
.allowlist_type("walproposer_api")
|
.allowlist_type("walproposer_api")
|
||||||
@@ -95,6 +103,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.allowlist_var("ERROR")
|
.allowlist_var("ERROR")
|
||||||
.allowlist_var("FATAL")
|
.allowlist_var("FATAL")
|
||||||
.allowlist_var("PANIC")
|
.allowlist_var("PANIC")
|
||||||
|
.allowlist_var("PG_VERSION_NUM")
|
||||||
.allowlist_var("WPEVENT")
|
.allowlist_var("WPEVENT")
|
||||||
.allowlist_var("WL_LATCH_SET")
|
.allowlist_var("WL_LATCH_SET")
|
||||||
.allowlist_var("WL_SOCKET_READABLE")
|
.allowlist_var("WL_SOCKET_READABLE")
|
||||||
@@ -104,7 +113,12 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.allowlist_var("WL_SOCKET_MASK")
|
.allowlist_var("WL_SOCKET_MASK")
|
||||||
.clang_arg("-DWALPROPOSER_LIB")
|
.clang_arg("-DWALPROPOSER_LIB")
|
||||||
.clang_arg(format!("-I{pgxn_neon}"))
|
.clang_arg(format!("-I{pgxn_neon}"))
|
||||||
.clang_arg(format!("-I{inc_server_path}"))
|
.clang_arg(format!("-I{inc_server_path}"));
|
||||||
|
|
||||||
|
for name in unwind_abi_functions {
|
||||||
|
builder = builder.override_abi(bindgen::Abi::CUnwind, name);
|
||||||
|
}
|
||||||
|
let bindings = builder
|
||||||
// Finish the builder and generate the bindings.
|
// Finish the builder and generate the bindings.
|
||||||
.generate()
|
.generate()
|
||||||
// Unwrap the Result and panic on failure.
|
// Unwrap the Result and panic on failure.
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*wp).config).callback_data;
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
@@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
@@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" fn wait_event_set(
|
extern "C-unwind" fn wait_event_set(
|
||||||
wp: *mut WalProposer,
|
wp: *mut WalProposer,
|
||||||
timeout: ::std::os::raw::c_long,
|
timeout: ::std::os::raw::c_long,
|
||||||
event_sk: *mut *mut Safekeeper,
|
event_sk: *mut *mut Safekeeper,
|
||||||
@@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*wp).config).callback_data;
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
@@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" fn log_internal(
|
extern "C-unwind" fn log_internal(
|
||||||
wp: *mut WalProposer,
|
wp: *mut WalProposer,
|
||||||
level: ::std::os::raw::c_int,
|
level: ::std::os::raw::c_int,
|
||||||
line: *const ::std::os::raw::c_char,
|
line: *const ::std::os::raw::c_char,
|
||||||
|
|||||||
@@ -282,7 +282,11 @@ mod tests {
|
|||||||
use std::cell::UnsafeCell;
|
use std::cell::UnsafeCell;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
|
|
||||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
use crate::{
|
||||||
|
api_bindings::Level,
|
||||||
|
bindings::{NeonWALReadResult, PG_VERSION_NUM},
|
||||||
|
walproposer::Wrapper,
|
||||||
|
};
|
||||||
|
|
||||||
use super::ApiImpl;
|
use super::ApiImpl;
|
||||||
|
|
||||||
@@ -489,41 +493,79 @@ mod tests {
|
|||||||
|
|
||||||
let (sender, receiver) = sync_channel(1);
|
let (sender, receiver) = sync_channel(1);
|
||||||
|
|
||||||
|
// Messages definitions are at walproposer.h
|
||||||
|
// xxx: it would be better to extract them from safekeeper crate and
|
||||||
|
// use serialization/deserialization here.
|
||||||
|
let greeting_tag = (b'g' as u64).to_ne_bytes();
|
||||||
|
let proto_version = 2_u32.to_ne_bytes();
|
||||||
|
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
|
||||||
|
let proposer_id = [0; 16];
|
||||||
|
let system_id = 0_u64.to_ne_bytes();
|
||||||
|
let tenant_id = ttid.tenant_id.as_arr();
|
||||||
|
let timeline_id = ttid.timeline_id.as_arr();
|
||||||
|
let pg_tli = 1_u32.to_ne_bytes();
|
||||||
|
let wal_seg_size = 16777216_u32.to_ne_bytes();
|
||||||
|
let proposer_greeting = [
|
||||||
|
greeting_tag.as_slice(),
|
||||||
|
proto_version.as_slice(),
|
||||||
|
pg_version.as_slice(),
|
||||||
|
proposer_id.as_slice(),
|
||||||
|
system_id.as_slice(),
|
||||||
|
tenant_id.as_slice(),
|
||||||
|
timeline_id.as_slice(),
|
||||||
|
pg_tli.as_slice(),
|
||||||
|
wal_seg_size.as_slice(),
|
||||||
|
]
|
||||||
|
.concat();
|
||||||
|
|
||||||
|
let voting_tag = (b'v' as u64).to_ne_bytes();
|
||||||
|
let vote_request_term = 3_u64.to_ne_bytes();
|
||||||
|
let proposer_id = [0; 16];
|
||||||
|
let vote_request = [
|
||||||
|
voting_tag.as_slice(),
|
||||||
|
vote_request_term.as_slice(),
|
||||||
|
proposer_id.as_slice(),
|
||||||
|
]
|
||||||
|
.concat();
|
||||||
|
|
||||||
|
let acceptor_greeting_term = 2_u64.to_ne_bytes();
|
||||||
|
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
|
||||||
|
let acceptor_greeting = [
|
||||||
|
greeting_tag.as_slice(),
|
||||||
|
acceptor_greeting_term.as_slice(),
|
||||||
|
acceptor_greeting_node_id.as_slice(),
|
||||||
|
]
|
||||||
|
.concat();
|
||||||
|
|
||||||
|
let vote_response_term = 3_u64.to_ne_bytes();
|
||||||
|
let vote_given = 1_u64.to_ne_bytes();
|
||||||
|
let flush_lsn = 0x539_u64.to_ne_bytes();
|
||||||
|
let truncate_lsn = 0x539_u64.to_ne_bytes();
|
||||||
|
let th_len = 1_u32.to_ne_bytes();
|
||||||
|
let th_term = 2_u64.to_ne_bytes();
|
||||||
|
let th_lsn = 0x539_u64.to_ne_bytes();
|
||||||
|
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
|
||||||
|
let vote_response = [
|
||||||
|
voting_tag.as_slice(),
|
||||||
|
vote_response_term.as_slice(),
|
||||||
|
vote_given.as_slice(),
|
||||||
|
flush_lsn.as_slice(),
|
||||||
|
truncate_lsn.as_slice(),
|
||||||
|
th_len.as_slice(),
|
||||||
|
th_term.as_slice(),
|
||||||
|
th_lsn.as_slice(),
|
||||||
|
timeline_start_lsn.as_slice(),
|
||||||
|
]
|
||||||
|
.concat();
|
||||||
|
|
||||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||||
wait_events: Cell::new(WaitEventsData {
|
wait_events: Cell::new(WaitEventsData {
|
||||||
sk: std::ptr::null_mut(),
|
sk: std::ptr::null_mut(),
|
||||||
event_mask: 0,
|
event_mask: 0,
|
||||||
}),
|
}),
|
||||||
expected_messages: vec![
|
expected_messages: vec![proposer_greeting, vote_request],
|
||||||
// TODO: When updating Postgres versions, this test will cause
|
|
||||||
// problems. Postgres version in message needs updating.
|
|
||||||
//
|
|
||||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
|
||||||
vec![
|
|
||||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
|
||||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
|
||||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
|
||||||
],
|
|
||||||
// VoteRequest(VoteRequest { term: 3 })
|
|
||||||
vec![
|
|
||||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
||||||
0, 0, 0, 0, 0, 0,
|
|
||||||
],
|
|
||||||
],
|
|
||||||
expected_ptr: AtomicUsize::new(0),
|
expected_ptr: AtomicUsize::new(0),
|
||||||
safekeeper_replies: vec![
|
safekeeper_replies: vec![acceptor_greeting, vote_response],
|
||||||
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
|
||||||
vec![
|
|
||||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
|
||||||
],
|
|
||||||
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
|
||||||
vec![
|
|
||||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
|
||||||
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
|
||||||
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
|
||||||
],
|
|
||||||
],
|
|
||||||
replies_ptr: AtomicUsize::new(0),
|
replies_ptr: AtomicUsize::new(0),
|
||||||
sync_channel: sender,
|
sync_channel: sender,
|
||||||
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ license.workspace = true
|
|||||||
default = []
|
default = []
|
||||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||||
# which adds some runtime cost to run tests on outage conditions
|
# which adds some runtime cost to run tests on outage conditions
|
||||||
testing = ["fail/failpoints"]
|
testing = ["fail/failpoints", "pageserver_api/testing" ]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
@@ -16,6 +16,7 @@ arc-swap.workspace = true
|
|||||||
async-compression.workspace = true
|
async-compression.workspace = true
|
||||||
async-stream.workspace = true
|
async-stream.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
|
bit_field.workspace = true
|
||||||
byteorder.workspace = true
|
byteorder.workspace = true
|
||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
@@ -36,7 +37,6 @@ humantime.workspace = true
|
|||||||
humantime-serde.workspace = true
|
humantime-serde.workspace = true
|
||||||
hyper.workspace = true
|
hyper.workspace = true
|
||||||
itertools.workspace = true
|
itertools.workspace = true
|
||||||
leaky-bucket.workspace = true
|
|
||||||
md5.workspace = true
|
md5.workspace = true
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
# hack to get the number of worker threads tokio uses
|
# hack to get the number of worker threads tokio uses
|
||||||
@@ -52,6 +52,7 @@ rand.workspace = true
|
|||||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
scopeguard.workspace = true
|
scopeguard.workspace = true
|
||||||
|
send-future.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json = { workspace = true, features = ["raw_value"] }
|
serde_json = { workspace = true, features = ["raw_value"] }
|
||||||
serde_path_to_error.workspace = true
|
serde_path_to_error.workspace = true
|
||||||
@@ -100,6 +101,7 @@ procfs.workspace = true
|
|||||||
criterion.workspace = true
|
criterion.workspace = true
|
||||||
hex-literal.workspace = true
|
hex-literal.workspace = true
|
||||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||||
|
indoc.workspace = true
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "bench_layer_map"
|
name = "bench_layer_map"
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use pageserver::{
|
|||||||
page_cache,
|
page_cache,
|
||||||
repository::Value,
|
repository::Value,
|
||||||
task_mgr::TaskKind,
|
task_mgr::TaskKind,
|
||||||
|
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
||||||
tenant::storage_layer::InMemoryLayer,
|
tenant::storage_layer::InMemoryLayer,
|
||||||
virtual_file,
|
virtual_file,
|
||||||
};
|
};
|
||||||
@@ -67,12 +68,16 @@ async fn ingest(
|
|||||||
let layer =
|
let layer =
|
||||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||||
|
|
||||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
||||||
|
let data_ser_size = data.serialized_size().unwrap() as usize;
|
||||||
let ctx = RequestContext::new(
|
let ctx = RequestContext::new(
|
||||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||||
pageserver::context::DownloadBehavior::Download,
|
pageserver::context::DownloadBehavior::Download,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const BATCH_SIZE: usize = 16;
|
||||||
|
let mut batch = Vec::new();
|
||||||
|
|
||||||
for i in 0..put_count {
|
for i in 0..put_count {
|
||||||
lsn += put_size as u64;
|
lsn += put_size as u64;
|
||||||
|
|
||||||
@@ -95,7 +100,17 @@ async fn ingest(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
|
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
||||||
|
if batch.len() >= BATCH_SIZE {
|
||||||
|
let this_batch = std::mem::take(&mut batch);
|
||||||
|
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
||||||
|
layer.put_batch(serialized, &ctx).await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !batch.is_empty() {
|
||||||
|
let this_batch = std::mem::take(&mut batch);
|
||||||
|
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
||||||
|
layer.put_batch(serialized, &ctx).await?;
|
||||||
}
|
}
|
||||||
layer.freeze(lsn + 1).await;
|
layer.freeze(lsn + 1).await;
|
||||||
|
|
||||||
@@ -149,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) {
|
|||||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||||
));
|
));
|
||||||
virtual_file::init(16384, virtual_file::io_engine_for_bench());
|
virtual_file::init(
|
||||||
|
16384,
|
||||||
|
virtual_file::io_engine_for_bench(),
|
||||||
|
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||||
|
);
|
||||||
page_cache::init(conf.page_cache_size);
|
page_cache::init(conf.page_cache_size);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ license.workspace = true
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
pageserver_api.workspace = true
|
pageserver_api.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
async-trait.workspace = true
|
|
||||||
reqwest = { workspace = true, features = [ "stream" ] }
|
reqwest = { workspace = true, features = [ "stream" ] }
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
|
|||||||
@@ -1,2 +1,20 @@
|
|||||||
pub mod mgmt_api;
|
pub mod mgmt_api;
|
||||||
pub mod page_service;
|
pub mod page_service;
|
||||||
|
|
||||||
|
/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
|
||||||
|
// If file structure is per-kind not per-feature then where to put this?
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum BlockUnblock {
|
||||||
|
Block,
|
||||||
|
Unblock,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for BlockUnblock {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
let s = match self {
|
||||||
|
BlockUnblock::Block => "block",
|
||||||
|
BlockUnblock::Unblock => "unblock",
|
||||||
|
};
|
||||||
|
f.write_str(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ use utils::{
|
|||||||
|
|
||||||
pub use reqwest::Body as ReqwestBody;
|
pub use reqwest::Body as ReqwestBody;
|
||||||
|
|
||||||
|
use crate::BlockUnblock;
|
||||||
|
|
||||||
pub mod util;
|
pub mod util;
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@@ -419,6 +421,24 @@ impl Client {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn timeline_archival_config(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
req: &TimelineArchivalConfigRequest,
|
||||||
|
) -> Result<()> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
|
||||||
|
self.mgmt_api_endpoint
|
||||||
|
);
|
||||||
|
|
||||||
|
self.request(Method::POST, &uri, req)
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn timeline_detach_ancestor(
|
pub async fn timeline_detach_ancestor(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
@@ -436,6 +456,20 @@ impl Client {
|
|||||||
.map_err(Error::ReceiveBody)
|
.map_err(Error::ReceiveBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn timeline_block_unblock_gc(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
dir: BlockUnblock,
|
||||||
|
) -> Result<()> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
|
||||||
|
self.mgmt_api_endpoint,
|
||||||
|
);
|
||||||
|
|
||||||
|
self.request(Method::POST, &uri, ()).await.map(|_| ())
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
||||||
let uri = format!(
|
let uri = format!(
|
||||||
"{}/v1/tenant/{}/reset",
|
"{}/v1/tenant/{}/reset",
|
||||||
@@ -506,6 +540,16 @@ impl Client {
|
|||||||
.map_err(Error::ReceiveBody)
|
.map_err(Error::ReceiveBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Configs io buffer alignment at runtime.
|
||||||
|
pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
|
||||||
|
let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
|
||||||
|
self.request(Method::PUT, uri, align)
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||||
self.get(uri)
|
self.get(uri)
|
||||||
|
|||||||
@@ -144,7 +144,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
|||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||||
|
|
||||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
pageserver::virtual_file::init(
|
||||||
|
10,
|
||||||
|
virtual_file::api::IoEngineKind::StdFs,
|
||||||
|
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||||
|
);
|
||||||
pageserver::page_cache::init(100);
|
pageserver::page_cache::init(100);
|
||||||
|
|
||||||
let mut total_delta_layers = 0usize;
|
let mut total_delta_layers = 0usize;
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
|
|||||||
|
|
||||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
|
||||||
page_cache::init(100);
|
page_cache::init(100);
|
||||||
let file = VirtualFile::open(path, ctx).await?;
|
let file = VirtualFile::open(path, ctx).await?;
|
||||||
let file_id = page_cache::next_file_id();
|
let file_id = page_cache::next_file_id();
|
||||||
@@ -89,6 +89,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
|
|||||||
for (k, v) in all {
|
for (k, v) in all {
|
||||||
let value = cursor.read_blob(v.pos(), ctx).await?;
|
let value = cursor.read_blob(v.pos(), ctx).await?;
|
||||||
println!("key:{} value_len:{}", k, value.len());
|
println!("key:{} value_len:{}", k, value.len());
|
||||||
|
assert!(k.is_i128_representable(), "invalid key: ");
|
||||||
}
|
}
|
||||||
// TODO(chi): special handling for last key?
|
// TODO(chi): special handling for last key?
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -189,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
|||||||
new_tenant_id,
|
new_tenant_id,
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
} => {
|
} => {
|
||||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
pageserver::virtual_file::init(
|
||||||
|
10,
|
||||||
|
virtual_file::api::IoEngineKind::StdFs,
|
||||||
|
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||||
|
);
|
||||||
pageserver::page_cache::init(100);
|
pageserver::page_cache::init(100);
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ use pageserver::{
|
|||||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||||
virtual_file,
|
virtual_file,
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
|
||||||
use postgres_ffi::ControlFileData;
|
use postgres_ffi::ControlFileData;
|
||||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
println!("specified prefix '{}' failed validation", cmd.prefix);
|
println!("specified prefix '{}' failed validation", cmd.prefix);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
};
|
};
|
||||||
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
|
let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
|
||||||
let toml_item = toml_document
|
let toml_item = toml_document
|
||||||
.get("remote_storage")
|
.get("remote_storage")
|
||||||
.expect("need remote_storage");
|
.expect("need remote_storage");
|
||||||
@@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
|||||||
|
|
||||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||||
// Basic initialization of things that don't change after startup
|
// Basic initialization of things that don't change after startup
|
||||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
virtual_file::init(
|
||||||
|
10,
|
||||||
|
virtual_file::api::IoEngineKind::StdFs,
|
||||||
|
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||||
|
);
|
||||||
page_cache::init(100);
|
page_cache::init(100);
|
||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||||
dump_layerfile_from_path(path, true, &ctx).await
|
dump_layerfile_from_path(path, true, &ctx).await
|
||||||
|
|||||||
@@ -58,6 +58,11 @@ pub(crate) struct Args {
|
|||||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||||
|
|
||||||
|
/// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
|
||||||
|
#[clap(long)]
|
||||||
|
set_io_alignment: Option<usize>,
|
||||||
|
|
||||||
targets: Option<Vec<TenantTimelineId>>,
|
targets: Option<Vec<TenantTimelineId>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -124,6 +129,10 @@ async fn main_impl(
|
|||||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(align) = args.set_io_alignment {
|
||||||
|
mgmt_api_client.put_io_alignment(align).await?;
|
||||||
|
}
|
||||||
|
|
||||||
// discover targets
|
// discover targets
|
||||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||||
&mgmt_api_client,
|
&mgmt_api_client,
|
||||||
|
|||||||
39
pageserver/src/assert_u64_eq_usize.rs
Normal file
39
pageserver/src/assert_u64_eq_usize.rs
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
|
||||||
|
|
||||||
|
pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
|
||||||
|
if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
|
||||||
|
panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
pub(crate) trait U64IsUsize {
|
||||||
|
fn into_usize(self) -> usize;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl U64IsUsize for u64 {
|
||||||
|
#[inline(always)]
|
||||||
|
fn into_usize(self) -> usize {
|
||||||
|
#[allow(clippy::let_unit_value)]
|
||||||
|
let _ = _ASSERT_U64_EQ_USIZE;
|
||||||
|
self as usize
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) trait UsizeIsU64 {
|
||||||
|
fn into_u64(self) -> u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl UsizeIsU64 for usize {
|
||||||
|
#[inline(always)]
|
||||||
|
fn into_u64(self) -> u64 {
|
||||||
|
#[allow(clippy::let_unit_value)]
|
||||||
|
let _ = _ASSERT_U64_EQ_USIZE;
|
||||||
|
self as u64
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn u64_to_usize(x: u64) -> usize {
|
||||||
|
#[allow(clippy::let_unit_value)]
|
||||||
|
let _ = _ASSERT_U64_EQ_USIZE;
|
||||||
|
x as usize
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@
|
|||||||
use std::env;
|
use std::env;
|
||||||
use std::env::{var, VarError};
|
use std::env::{var, VarError};
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
@@ -36,6 +37,7 @@ use pageserver::{
|
|||||||
virtual_file,
|
virtual_file,
|
||||||
};
|
};
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
|
use utils::crashsafe::syncfs;
|
||||||
use utils::failpoint_support;
|
use utils::failpoint_support;
|
||||||
use utils::logging::TracingErrorLayerEnablement;
|
use utils::logging::TracingErrorLayerEnablement;
|
||||||
use utils::{
|
use utils::{
|
||||||
@@ -124,19 +126,53 @@ fn main() -> anyhow::Result<()> {
|
|||||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
|
||||||
|
|
||||||
|
// The tenants directory contains all the pageserver local disk state.
|
||||||
|
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||||
|
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
|
||||||
|
// After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
|
||||||
|
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
|
||||||
let tenants_path = conf.tenants_path();
|
let tenants_path = conf.tenants_path();
|
||||||
if !tenants_path.exists() {
|
{
|
||||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
let open = || {
|
||||||
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
|
nix::dir::Dir::open(
|
||||||
|
tenants_path.as_std_path(),
|
||||||
|
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
|
||||||
|
nix::sys::stat::Mode::empty(),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
let dirfd = match open() {
|
||||||
|
Ok(dirfd) => dirfd,
|
||||||
|
Err(e) => match e {
|
||||||
|
nix::errno::Errno::ENOENT => {
|
||||||
|
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
|
||||||
|
format!("Failed to create tenants root dir at '{tenants_path}'")
|
||||||
|
})?;
|
||||||
|
open().context("open tenants dir after creating it")?
|
||||||
|
}
|
||||||
|
e => anyhow::bail!(e),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let started = Instant::now();
|
||||||
|
syncfs(dirfd)?;
|
||||||
|
let elapsed = started.elapsed();
|
||||||
|
info!(
|
||||||
|
elapsed_ms = elapsed.as_millis(),
|
||||||
|
"made tenant directory contents durable"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize up failpoints support
|
// Initialize up failpoints support
|
||||||
let scenario = failpoint_support::init();
|
let scenario = failpoint_support::init();
|
||||||
|
|
||||||
// Basic initialization of things that don't change after startup
|
// Basic initialization of things that don't change after startup
|
||||||
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
|
virtual_file::init(
|
||||||
|
conf.max_file_descriptors,
|
||||||
|
conf.virtual_file_io_engine,
|
||||||
|
conf.io_buffer_alignment,
|
||||||
|
);
|
||||||
page_cache::init(conf.page_cache_size);
|
page_cache::init(conf.page_cache_size);
|
||||||
|
|
||||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||||
@@ -172,27 +208,15 @@ fn initialize_config(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
|
let config_file_contents =
|
||||||
Ok(mut f) => {
|
std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
|
||||||
let md = f.metadata().context("stat config file")?;
|
let config_toml = serde_path_to_error::deserialize(
|
||||||
if md.is_file() {
|
toml_edit::de::Deserializer::from_str(&config_file_contents)
|
||||||
let mut s = String::new();
|
.context("build toml deserializer")?,
|
||||||
f.read_to_string(&mut s).context("read config file")?;
|
)
|
||||||
s.parse().context("parse config file toml")?
|
.context("deserialize config toml")?;
|
||||||
} else {
|
let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
|
||||||
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
|
.context("runtime-validation of config toml")?;
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
debug!("Using pageserver toml: {config}");
|
|
||||||
|
|
||||||
// Construct the runtime representation
|
|
||||||
let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
|
|
||||||
.context("Failed to parse pageserver configuration")?;
|
|
||||||
|
|
||||||
Ok(Box::leak(Box::new(conf)))
|
Ok(Box::leak(Box::new(conf)))
|
||||||
}
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,8 @@
|
|||||||
//! Periodically collect consumption metrics for all active tenants
|
//! Periodically collect consumption metrics for all active tenants
|
||||||
//! and push them to a HTTP endpoint.
|
//! and push them to a HTTP endpoint.
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
|
use crate::consumption_metrics::metrics::MetricsKey;
|
||||||
|
use crate::consumption_metrics::upload::KeyGen as _;
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||||
@@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind;
|
|||||||
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use consumption_metrics::EventType;
|
use consumption_metrics::EventType;
|
||||||
|
use itertools::Itertools as _;
|
||||||
use pageserver_api::models::TenantState;
|
use pageserver_api::models::TenantState;
|
||||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
@@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
|
|
||||||
mod metrics;
|
|
||||||
use crate::consumption_metrics::metrics::MetricsKey;
|
|
||||||
mod disk_cache;
|
mod disk_cache;
|
||||||
|
mod metrics;
|
||||||
mod upload;
|
mod upload;
|
||||||
|
|
||||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||||
@@ -143,6 +145,12 @@ async fn collect_metrics(
|
|||||||
// these are point in time, with variable "now"
|
// these are point in time, with variable "now"
|
||||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||||
|
|
||||||
|
// Pre-generate event idempotency keys, to reuse them across the bucket
|
||||||
|
// and HTTP sinks.
|
||||||
|
let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
|
||||||
|
.take(metrics.len())
|
||||||
|
.collect_vec();
|
||||||
|
|
||||||
let metrics = Arc::new(metrics);
|
let metrics = Arc::new(metrics);
|
||||||
|
|
||||||
// why not race cancellation here? because we are one of the last tasks, and if we are
|
// why not race cancellation here? because we are one of the last tasks, and if we are
|
||||||
@@ -161,8 +169,14 @@ async fn collect_metrics(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(bucket_client) = &bucket_client {
|
if let Some(bucket_client) = &bucket_client {
|
||||||
let res =
|
let res = upload::upload_metrics_bucket(
|
||||||
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
bucket_client,
|
||||||
|
&cancel,
|
||||||
|
&node_id,
|
||||||
|
&metrics,
|
||||||
|
&idempotency_keys,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
tracing::error!("failed to upload to S3: {e:#}");
|
tracing::error!("failed to upload to S3: {e:#}");
|
||||||
}
|
}
|
||||||
@@ -174,9 +188,9 @@ async fn collect_metrics(
|
|||||||
&client,
|
&client,
|
||||||
metric_collection_endpoint,
|
metric_collection_endpoint,
|
||||||
&cancel,
|
&cancel,
|
||||||
&node_id,
|
|
||||||
&metrics,
|
&metrics,
|
||||||
&mut cached_metrics,
|
&mut cached_metrics,
|
||||||
|
&idempotency_keys,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
|
|||||||
@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
|
|||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
metric_collection_endpoint: &reqwest::Url,
|
metric_collection_endpoint: &reqwest::Url,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
node_id: &str,
|
|
||||||
metrics: &[RawMetric],
|
metrics: &[RawMetric],
|
||||||
cached_metrics: &mut Cache,
|
cached_metrics: &mut Cache,
|
||||||
|
idempotency_keys: &[IdempotencyKey<'_>],
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut uploaded = 0;
|
let mut uploaded = 0;
|
||||||
let mut failed = 0;
|
let mut failed = 0;
|
||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
|
|
||||||
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
|
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
|
||||||
|
|
||||||
while let Some(res) = iter.next() {
|
while let Some(res) = iter.next() {
|
||||||
let (chunk, body) = res?;
|
let (chunk, body) = res?;
|
||||||
@@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket(
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
node_id: &str,
|
node_id: &str,
|
||||||
metrics: &[RawMetric],
|
metrics: &[RawMetric],
|
||||||
|
idempotency_keys: &[IdempotencyKey<'_>],
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
if metrics.is_empty() {
|
if metrics.is_empty() {
|
||||||
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
||||||
@@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket(
|
|||||||
|
|
||||||
// Serialize and write into compressed buffer
|
// Serialize and write into compressed buffer
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
|
for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
|
||||||
let (_chunk, body) = res?;
|
let (_chunk, body) = res?;
|
||||||
gzip_writer.write_all(&body).await?;
|
gzip_writer.write_all(&body).await?;
|
||||||
}
|
}
|
||||||
@@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// The return type is quite ugly, but we gain testability in isolation
|
/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
|
||||||
fn serialize_in_chunks<'a, F>(
|
/// idempotency keys are injected into the corresponding metric events (reused
|
||||||
|
/// across different metrics sinks), and must have the same length as input.
|
||||||
|
fn serialize_in_chunks<'a>(
|
||||||
chunk_size: usize,
|
chunk_size: usize,
|
||||||
input: &'a [RawMetric],
|
input: &'a [RawMetric],
|
||||||
factory: F,
|
idempotency_keys: &'a [IdempotencyKey<'a>],
|
||||||
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||||
where
|
|
||||||
F: KeyGen<'a> + 'a,
|
|
||||||
{
|
{
|
||||||
use bytes::BufMut;
|
use bytes::BufMut;
|
||||||
|
|
||||||
struct Iter<'a, F> {
|
assert_eq!(input.len(), idempotency_keys.len());
|
||||||
|
|
||||||
|
struct Iter<'a> {
|
||||||
inner: std::slice::Chunks<'a, RawMetric>,
|
inner: std::slice::Chunks<'a, RawMetric>,
|
||||||
|
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
|
||||||
chunk_size: usize,
|
chunk_size: usize,
|
||||||
|
|
||||||
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
|
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
|
||||||
buffer: bytes::BytesMut,
|
buffer: bytes::BytesMut,
|
||||||
// chunk amount of events are reused to produce the serialized document
|
// chunk amount of events are reused to produce the serialized document
|
||||||
scratch: Vec<Event<Ids, Name>>,
|
scratch: Vec<Event<Ids, Name>>,
|
||||||
factory: F,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
|
impl<'a> Iterator for Iter<'a> {
|
||||||
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
@@ -167,17 +170,14 @@ where
|
|||||||
self.scratch.extend(
|
self.scratch.extend(
|
||||||
chunk
|
chunk
|
||||||
.iter()
|
.iter()
|
||||||
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
|
.zip(&mut self.idempotency_keys)
|
||||||
|
.map(|(raw_metric, key)| raw_metric.as_event(key)),
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
// next rounds: update_in_place to reuse allocations
|
// next rounds: update_in_place to reuse allocations
|
||||||
assert_eq!(self.scratch.len(), self.chunk_size);
|
assert_eq!(self.scratch.len(), self.chunk_size);
|
||||||
self.scratch
|
itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
|
||||||
.iter_mut()
|
.for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
|
||||||
.zip(chunk.iter())
|
|
||||||
.for_each(|(slot, raw_metric)| {
|
|
||||||
raw_metric.update_in_place(slot, &self.factory.generate())
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let res = serde_json::to_writer(
|
let res = serde_json::to_writer(
|
||||||
@@ -198,18 +198,19 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
|
impl<'a> ExactSizeIterator for Iter<'a> {}
|
||||||
|
|
||||||
let buffer = bytes::BytesMut::new();
|
let buffer = bytes::BytesMut::new();
|
||||||
let inner = input.chunks(chunk_size);
|
let inner = input.chunks(chunk_size);
|
||||||
|
let idempotency_keys = idempotency_keys.iter();
|
||||||
let scratch = Vec::new();
|
let scratch = Vec::new();
|
||||||
|
|
||||||
Iter {
|
Iter {
|
||||||
inner,
|
inner,
|
||||||
|
idempotency_keys,
|
||||||
chunk_size,
|
chunk_size,
|
||||||
buffer,
|
buffer,
|
||||||
scratch,
|
scratch,
|
||||||
factory,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
trait KeyGen<'a>: Copy {
|
pub(crate) trait KeyGen<'a> {
|
||||||
fn generate(&self) -> IdempotencyKey<'a>;
|
fn generate(&self) -> IdempotencyKey<'a>;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -389,7 +390,10 @@ mod tests {
|
|||||||
let examples = metric_samples();
|
let examples = metric_samples();
|
||||||
assert!(examples.len() > 1);
|
assert!(examples.len() > 1);
|
||||||
|
|
||||||
let factory = FixedGen::new(Utc::now(), "1", 42);
|
let now = Utc::now();
|
||||||
|
let idempotency_keys = (0..examples.len())
|
||||||
|
.map(|i| FixedGen::new(now, "1", i as u16).generate())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
// need to use Event here because serde_json::Value uses default hashmap, not linked
|
// need to use Event here because serde_json::Value uses default hashmap, not linked
|
||||||
// hashmap
|
// hashmap
|
||||||
@@ -398,13 +402,13 @@ mod tests {
|
|||||||
events: Vec<Event<Ids, Name>>,
|
events: Vec<Event<Ids, Name>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
let correct = serialize_in_chunks(examples.len(), &examples, factory)
|
let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
|
||||||
.map(|res| res.unwrap().1)
|
.map(|res| res.unwrap().1)
|
||||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
for chunk_size in 1..examples.len() {
|
for chunk_size in 1..examples.len() {
|
||||||
let actual = serialize_in_chunks(chunk_size, &examples, factory)
|
let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
|
||||||
.map(|res| res.unwrap().1)
|
.map(|res| res.unwrap().1)
|
||||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
//! This module defines `RequestContext`, a structure that we use throughout
|
//! Defines [`RequestContext`].
|
||||||
//! the pageserver to propagate high-level context from places
|
//!
|
||||||
//! that _originate_ activity down to the shared code paths at the
|
//! It is a structure that we use throughout the pageserver to propagate
|
||||||
//! heart of the pageserver. It's inspired by Golang's `context.Context`.
|
//! high-level context from places that _originate_ activity down to the
|
||||||
|
//! shared code paths at the heart of the pageserver. It's inspired by
|
||||||
|
//! Golang's `context.Context`.
|
||||||
//!
|
//!
|
||||||
//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
|
//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
|
||||||
//! 1. What high-level activity ([`TaskKind`]) needs this page?
|
//! 1. What high-level activity ([`TaskKind`]) needs this page?
|
||||||
@@ -105,8 +107,10 @@ pub struct RequestContext {
|
|||||||
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||||
pub enum PageContentKind {
|
pub enum PageContentKind {
|
||||||
Unknown,
|
Unknown,
|
||||||
|
DeltaLayerSummary,
|
||||||
DeltaLayerBtreeNode,
|
DeltaLayerBtreeNode,
|
||||||
DeltaLayerValue,
|
DeltaLayerValue,
|
||||||
|
ImageLayerSummary,
|
||||||
ImageLayerBtreeNode,
|
ImageLayerBtreeNode,
|
||||||
ImageLayerValue,
|
ImageLayerValue,
|
||||||
InMemoryLayer,
|
InMemoryLayer,
|
||||||
|
|||||||
@@ -141,12 +141,32 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
|||||||
m.other
|
m.other
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let az_id = {
|
||||||
|
let az_id_from_metadata = m
|
||||||
|
.other
|
||||||
|
.get("availability_zone_id")
|
||||||
|
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
||||||
|
|
||||||
|
match az_id_from_metadata {
|
||||||
|
Some(az_id) => Some(az_id),
|
||||||
|
None => {
|
||||||
|
tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
|
||||||
|
conf.availability_zone.clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if az_id.is_none() {
|
||||||
|
panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
|
||||||
|
}
|
||||||
|
|
||||||
Some(NodeRegisterRequest {
|
Some(NodeRegisterRequest {
|
||||||
node_id: conf.id,
|
node_id: conf.id,
|
||||||
listen_pg_addr: m.postgres_host,
|
listen_pg_addr: m.postgres_host,
|
||||||
listen_pg_port: m.postgres_port,
|
listen_pg_port: m.postgres_port,
|
||||||
listen_http_addr: m.http_host,
|
listen_http_addr: m.http_host,
|
||||||
listen_http_port: m.http_port,
|
listen_http_port: m.http_port,
|
||||||
|
availability_zone_id: az_id.expect("Checked above"),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|||||||
@@ -41,19 +41,15 @@
|
|||||||
// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
|
// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
|
||||||
// reading these fields. We use the Debug impl for semi-structured logging, though.
|
// reading these fields. We use the Debug impl for semi-structured logging, though.
|
||||||
|
|
||||||
use std::{
|
use std::{sync::Arc, time::SystemTime};
|
||||||
sync::Arc,
|
|
||||||
time::{Duration, SystemTime},
|
|
||||||
};
|
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId};
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::Serialize;
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||||
use utils::serde_percent::Percent;
|
|
||||||
use utils::{completion, id::TimelineId};
|
use utils::{completion, id::TimelineId};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -69,23 +65,9 @@ use crate::{
|
|||||||
CancellableTask, DiskUsageEvictionTask,
|
CancellableTask, DiskUsageEvictionTask,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
|
||||||
pub struct DiskUsageEvictionTaskConfig {
|
|
||||||
pub max_usage_pct: Percent,
|
|
||||||
pub min_avail_bytes: u64,
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub period: Duration,
|
|
||||||
#[cfg(feature = "testing")]
|
|
||||||
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
|
|
||||||
/// Select sorting for evicted layers
|
|
||||||
#[serde(default)]
|
|
||||||
pub eviction_order: EvictionOrder,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
|
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
|
||||||
/// partitioning.
|
/// partitioning.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
#[serde(tag = "type", content = "args")]
|
|
||||||
pub enum EvictionOrder {
|
pub enum EvictionOrder {
|
||||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||||
/// the set of resident layers of a tenant.
|
/// the set of resident layers of a tenant.
|
||||||
@@ -96,23 +78,22 @@ pub enum EvictionOrder {
|
|||||||
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
|
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
|
||||||
/// to ensure nondeterminism by adding in a random number to break the
|
/// to ensure nondeterminism by adding in a random number to break the
|
||||||
/// `relative_last_activity==0.0` ties.
|
/// `relative_last_activity==0.0` ties.
|
||||||
#[serde(default = "default_highest_layer_count_loses_first")]
|
|
||||||
highest_layer_count_loses_first: bool,
|
highest_layer_count_loses_first: bool,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for EvictionOrder {
|
impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
|
||||||
fn default() -> Self {
|
fn from(value: pageserver_api::config::EvictionOrder) -> Self {
|
||||||
Self::RelativeAccessed {
|
match value {
|
||||||
highest_layer_count_loses_first: true,
|
pageserver_api::config::EvictionOrder::RelativeAccessed {
|
||||||
|
highest_layer_count_loses_first,
|
||||||
|
} => Self::RelativeAccessed {
|
||||||
|
highest_layer_count_loses_first,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_highest_layer_count_loses_first() -> bool {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
impl EvictionOrder {
|
impl EvictionOrder {
|
||||||
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
|
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
|
||||||
use EvictionOrder::*;
|
use EvictionOrder::*;
|
||||||
@@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration(
|
|||||||
storage,
|
storage,
|
||||||
usage_pre,
|
usage_pre,
|
||||||
tenant_manager,
|
tenant_manager,
|
||||||
task_config.eviction_order,
|
task_config.eviction_order.into(),
|
||||||
cancel,
|
cancel,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
@@ -1257,7 +1238,6 @@ mod filesystem_level_usage {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn max_usage_pct_pressure() {
|
fn max_usage_pct_pressure() {
|
||||||
use super::EvictionOrder;
|
|
||||||
use super::Usage as _;
|
use super::Usage as _;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use utils::serde_percent::Percent;
|
use utils::serde_percent::Percent;
|
||||||
@@ -1269,7 +1249,7 @@ mod filesystem_level_usage {
|
|||||||
period: Duration::MAX,
|
period: Duration::MAX,
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
mock_statvfs: None,
|
mock_statvfs: None,
|
||||||
eviction_order: EvictionOrder::default(),
|
eviction_order: pageserver_api::config::EvictionOrder::default(),
|
||||||
},
|
},
|
||||||
total_bytes: 100_000,
|
total_bytes: 100_000,
|
||||||
avail_bytes: 0,
|
avail_bytes: 0,
|
||||||
|
|||||||
@@ -318,6 +318,27 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<crate::tenant::TimelineArchivalError> for ApiError {
|
||||||
|
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
|
||||||
|
use crate::tenant::TimelineArchivalError::*;
|
||||||
|
match value {
|
||||||
|
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||||
|
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
|
||||||
|
e @ HasArchivedParent(_) => {
|
||||||
|
ApiError::PreconditionFailed(e.to_string().into_boxed_str())
|
||||||
|
}
|
||||||
|
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
|
||||||
|
format!(
|
||||||
|
"Cannot archive timeline which has non-archived child timelines: {children:?}"
|
||||||
|
)
|
||||||
|
.into_boxed_str(),
|
||||||
|
),
|
||||||
|
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||||
|
Other(e) => ApiError::InternalServerError(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||||
@@ -405,6 +426,8 @@ async fn build_timeline_info_common(
|
|||||||
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
||||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||||
let state = timeline.current_state();
|
let state = timeline.current_state();
|
||||||
|
// Report is_archived = false if the timeline is still loading
|
||||||
|
let is_archived = timeline.is_archived().unwrap_or(false);
|
||||||
let remote_consistent_lsn_projected = timeline
|
let remote_consistent_lsn_projected = timeline
|
||||||
.get_remote_consistent_lsn_projected()
|
.get_remote_consistent_lsn_projected()
|
||||||
.unwrap_or(Lsn(0));
|
.unwrap_or(Lsn(0));
|
||||||
@@ -445,6 +468,7 @@ async fn build_timeline_info_common(
|
|||||||
pg_version: timeline.pg_version,
|
pg_version: timeline.pg_version,
|
||||||
|
|
||||||
state,
|
state,
|
||||||
|
is_archived: Some(is_archived),
|
||||||
|
|
||||||
walreceiver_status,
|
walreceiver_status,
|
||||||
|
|
||||||
@@ -686,9 +710,7 @@ async fn timeline_archival_config_handler(
|
|||||||
|
|
||||||
tenant
|
tenant
|
||||||
.apply_timeline_archival_config(timeline_id, request_data.state)
|
.apply_timeline_archival_config(timeline_id, request_data.state)
|
||||||
.await
|
.await?;
|
||||||
.context("applying archival config")
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
Ok::<_, ApiError>(())
|
Ok::<_, ApiError>(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_archival_config",
|
.instrument(info_span!("timeline_archival_config",
|
||||||
@@ -852,7 +874,10 @@ async fn get_timestamp_of_lsn_handler(
|
|||||||
|
|
||||||
match result {
|
match result {
|
||||||
Some(time) => {
|
Some(time) => {
|
||||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
let time = format_rfc3339(
|
||||||
|
postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
|
||||||
|
)
|
||||||
|
.to_string();
|
||||||
json_response(StatusCode::OK, time)
|
json_response(StatusCode::OK, time)
|
||||||
}
|
}
|
||||||
None => Err(ApiError::NotFound(
|
None => Err(ApiError::NotFound(
|
||||||
@@ -1706,13 +1731,12 @@ async fn timeline_compact_handler(
|
|||||||
flags |= CompactFlags::ForceImageLayerCreation;
|
flags |= CompactFlags::ForceImageLayerCreation;
|
||||||
}
|
}
|
||||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||||
if !cfg!(feature = "testing") {
|
|
||||||
return Err(ApiError::InternalServerError(anyhow!(
|
|
||||||
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||||
}
|
}
|
||||||
|
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
|
||||||
|
flags |= CompactFlags::DryRun;
|
||||||
|
}
|
||||||
|
|
||||||
let wait_until_uploaded =
|
let wait_until_uploaded =
|
||||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||||
|
|
||||||
@@ -2052,7 +2076,7 @@ async fn disk_usage_eviction_run(
|
|||||||
evict_bytes: u64,
|
evict_bytes: u64,
|
||||||
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
|
eviction_order: pageserver_api::config::EvictionOrder,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||||
@@ -2088,7 +2112,7 @@ async fn disk_usage_eviction_run(
|
|||||||
&state.remote_storage,
|
&state.remote_storage,
|
||||||
usage,
|
usage,
|
||||||
&state.tenant_manager,
|
&state.tenant_manager,
|
||||||
config.eviction_order,
|
config.eviction_order.into(),
|
||||||
&cancel,
|
&cancel,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
@@ -2330,6 +2354,20 @@ async fn put_io_engine_handler(
|
|||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn put_io_alignment_handler(
|
||||||
|
mut r: Request<Body>,
|
||||||
|
_cancel: CancellationToken,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
check_permission(&r, None)?;
|
||||||
|
let align: usize = json_request(&mut r).await?;
|
||||||
|
crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
|
||||||
|
ApiError::PreconditionFailed(
|
||||||
|
format!("Requested io alignment ({align}) is not a power of two").into(),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
|
}
|
||||||
|
|
||||||
/// Polled by control plane.
|
/// Polled by control plane.
|
||||||
///
|
///
|
||||||
/// See [`crate::utilization`].
|
/// See [`crate::utilization`].
|
||||||
@@ -2942,7 +2980,7 @@ pub fn make_router(
|
|||||||
)
|
)
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||||
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
|r| api_handler(r, timeline_compact_handler),
|
||||||
)
|
)
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||||
@@ -3017,6 +3055,9 @@ pub fn make_router(
|
|||||||
|r| api_handler(r, timeline_collect_keyspace),
|
|r| api_handler(r, timeline_collect_keyspace),
|
||||||
)
|
)
|
||||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||||
|
.put("/v1/io_alignment", |r| {
|
||||||
|
api_handler(r, put_io_alignment_handler)
|
||||||
|
})
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST;
|
|||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::Timeline;
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
|
use crate::walrecord::decode_wal_record;
|
||||||
use crate::walrecord::DecodedWALRecord;
|
use crate::walrecord::DecodedWALRecord;
|
||||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||||
use postgres_ffi::pg_constants;
|
use postgres_ffi::pg_constants;
|
||||||
@@ -310,11 +311,13 @@ async fn import_wal(
|
|||||||
|
|
||||||
let mut nrecords = 0;
|
let mut nrecords = 0;
|
||||||
let mut modification = tline.begin_modification(last_lsn);
|
let mut modification = tline.begin_modification(last_lsn);
|
||||||
let mut decoded = DecodedWALRecord::default();
|
|
||||||
while last_lsn <= endpoint {
|
while last_lsn <= endpoint {
|
||||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||||
|
let mut decoded = DecodedWALRecord::default();
|
||||||
|
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||||
|
|
||||||
walingest
|
walingest
|
||||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
WAL_INGEST.records_committed.inc();
|
WAL_INGEST.records_committed.inc();
|
||||||
|
|
||||||
@@ -449,11 +452,12 @@ pub async fn import_wal_from_tar(
|
|||||||
waldecoder.feed_bytes(&bytes[offset..]);
|
waldecoder.feed_bytes(&bytes[offset..]);
|
||||||
|
|
||||||
let mut modification = tline.begin_modification(last_lsn);
|
let mut modification = tline.begin_modification(last_lsn);
|
||||||
let mut decoded = DecodedWALRecord::default();
|
|
||||||
while last_lsn <= end_lsn {
|
while last_lsn <= end_lsn {
|
||||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||||
|
let mut decoded = DecodedWALRecord::default();
|
||||||
|
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||||
walingest
|
walingest
|
||||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
modification.commit(ctx).await?;
|
modification.commit(ctx).await?;
|
||||||
last_lsn = lsn;
|
last_lsn = lsn;
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
use std::{num::NonZeroUsize, sync::Arc};
|
use std::{num::NonZeroUsize, sync::Arc};
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
|
||||||
pub enum L0FlushConfig {
|
pub enum L0FlushConfig {
|
||||||
#[serde(rename_all = "snake_case")]
|
|
||||||
Direct { max_concurrency: NonZeroUsize },
|
Direct { max_concurrency: NonZeroUsize },
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -16,6 +14,16 @@ impl Default for L0FlushConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
|
||||||
|
fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
|
||||||
|
match config {
|
||||||
|
pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
|
||||||
|
Self::Direct { max_concurrency }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ pub mod l0_flush;
|
|||||||
use futures::{stream::FuturesUnordered, StreamExt};
|
use futures::{stream::FuturesUnordered, StreamExt};
|
||||||
pub use pageserver_api::keyspace;
|
pub use pageserver_api::keyspace;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
|
mod assert_u64_eq_usize;
|
||||||
pub mod aux_file;
|
pub mod aux_file;
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
pub mod page_cache;
|
pub mod page_cache;
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use metrics::{
|
|||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use strum::{EnumCount, VariantNames};
|
use strum::{EnumCount, VariantNames};
|
||||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
use strum_macros::{IntoStaticStr, VariantNames};
|
||||||
use tracing::warn;
|
use tracing::warn;
|
||||||
use utils::id::TimelineId;
|
use utils::id::TimelineId;
|
||||||
|
|
||||||
@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
|
|||||||
];
|
];
|
||||||
|
|
||||||
// Metrics collected on operations on the storage repository.
|
// Metrics collected on operations on the storage repository.
|
||||||
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
|
#[derive(Debug, VariantNames, IntoStaticStr)]
|
||||||
#[strum(serialize_all = "kebab_case")]
|
#[strum(serialize_all = "kebab_case")]
|
||||||
pub(crate) enum StorageTimeOperation {
|
pub(crate) enum StorageTimeOperation {
|
||||||
#[strum(serialize = "layer flush")]
|
#[strum(serialize = "layer flush")]
|
||||||
@@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
|||||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||||
pub(crate) enum ComputeCommandKind {
|
pub(crate) enum ComputeCommandKind {
|
||||||
PageStreamV2,
|
PageStreamV2,
|
||||||
PageStream,
|
|
||||||
Basebackup,
|
Basebackup,
|
||||||
Fullbackup,
|
Fullbackup,
|
||||||
LeaseLsn,
|
LeaseLsn,
|
||||||
@@ -1803,6 +1802,14 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||||
|
register_uint_gauge!(
|
||||||
|
"pageserver_utilization_score",
|
||||||
|
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
register_uint_gauge_vec!(
|
register_uint_gauge_vec!(
|
||||||
"pageserver_secondary_heatmap_total_size",
|
"pageserver_secondary_heatmap_total_size",
|
||||||
|
|||||||
@@ -557,7 +557,7 @@ impl PageServerHandler {
|
|||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
protocol_version: PagestreamProtocolVersion,
|
_protocol_version: PagestreamProtocolVersion,
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
) -> Result<(), QueryError>
|
) -> Result<(), QueryError>
|
||||||
where
|
where
|
||||||
@@ -601,8 +601,7 @@ impl PageServerHandler {
|
|||||||
fail::fail_point!("ps::handle-pagerequest-message");
|
fail::fail_point!("ps::handle-pagerequest-message");
|
||||||
|
|
||||||
// parse request
|
// parse request
|
||||||
let neon_fe_msg =
|
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
|
||||||
|
|
||||||
// invoke handler function
|
// invoke handler function
|
||||||
let (handler_result, span) = match neon_fe_msg {
|
let (handler_result, span) = match neon_fe_msg {
|
||||||
@@ -754,16 +753,21 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if request_lsn < **latest_gc_cutoff_lsn {
|
if request_lsn < **latest_gc_cutoff_lsn {
|
||||||
// Check explicitly for INVALID just to get a less scary error message if the
|
let gc_info = &timeline.gc_info.read().unwrap();
|
||||||
// request is obviously bogus
|
if !gc_info.leases.contains_key(&request_lsn) {
|
||||||
return Err(if request_lsn == Lsn::INVALID {
|
// The requested LSN is below gc cutoff and is not guarded by a lease.
|
||||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
|
||||||
} else {
|
// Check explicitly for INVALID just to get a less scary error message if the
|
||||||
PageStreamError::BadRequest(format!(
|
// request is obviously bogus
|
||||||
|
return Err(if request_lsn == Lsn::INVALID {
|
||||||
|
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||||
|
} else {
|
||||||
|
PageStreamError::BadRequest(format!(
|
||||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||||
request_lsn, **latest_gc_cutoff_lsn
|
request_lsn, **latest_gc_cutoff_lsn
|
||||||
).into())
|
).into())
|
||||||
});
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||||
@@ -790,6 +794,8 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Handles the lsn lease request.
|
||||||
|
/// If a lease cannot be obtained, the client will receive NULL.
|
||||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||||
async fn handle_make_lsn_lease<IO>(
|
async fn handle_make_lsn_lease<IO>(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -812,19 +818,25 @@ impl PageServerHandler {
|
|||||||
.await?;
|
.await?;
|
||||||
set_tracing_field_shard_id(&timeline);
|
set_tracing_field_shard_id(&timeline);
|
||||||
|
|
||||||
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
|
let lease = timeline
|
||||||
let valid_until = lease
|
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||||
.valid_until
|
.inspect_err(|e| {
|
||||||
.duration_since(SystemTime::UNIX_EPOCH)
|
warn!("{e}");
|
||||||
.map_err(|e| QueryError::Other(e.into()))?;
|
})
|
||||||
|
.ok();
|
||||||
|
let valid_until_str = lease.map(|l| {
|
||||||
|
l.valid_until
|
||||||
|
.duration_since(SystemTime::UNIX_EPOCH)
|
||||||
|
.expect("valid_until is earlier than UNIX_EPOCH")
|
||||||
|
.as_millis()
|
||||||
|
.to_string()
|
||||||
|
});
|
||||||
|
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||||
b"valid_until",
|
b"valid_until",
|
||||||
)]))?
|
)]))?
|
||||||
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
.write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
|
||||||
&valid_until.as_millis().to_be_bytes(),
|
|
||||||
)]))?
|
|
||||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1187,7 +1199,6 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
|
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
@@ -1275,35 +1286,6 @@ where
|
|||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
|
||||||
if params.len() != 2 {
|
|
||||||
return Err(QueryError::Other(anyhow::anyhow!(
|
|
||||||
"invalid param number for pagestream command"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
let tenant_id = TenantId::from_str(params[0])
|
|
||||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
|
||||||
let timeline_id = TimelineId::from_str(params[1])
|
|
||||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
|
||||||
|
|
||||||
tracing::Span::current()
|
|
||||||
.record("tenant_id", field::display(tenant_id))
|
|
||||||
.record("timeline_id", field::display(timeline_id));
|
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
|
||||||
|
|
||||||
COMPUTE_COMMANDS_COUNTERS
|
|
||||||
.for_command(ComputeCommandKind::PageStream)
|
|
||||||
.inc();
|
|
||||||
|
|
||||||
self.handle_pagerequests(
|
|
||||||
pgb,
|
|
||||||
tenant_id,
|
|
||||||
timeline_id,
|
|
||||||
PagestreamProtocolVersion::V1,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||||
if params.len() < 2 {
|
if params.len() < 2 {
|
||||||
return Err(QueryError::Other(anyhow::anyhow!(
|
return Err(QueryError::Other(anyhow::anyhow!(
|
||||||
|
|||||||
@@ -15,12 +15,11 @@ use crate::{aux_file, repository::*};
|
|||||||
use anyhow::{ensure, Context};
|
use anyhow::{ensure, Context};
|
||||||
use bytes::{Buf, Bytes, BytesMut};
|
use bytes::{Buf, Bytes, BytesMut};
|
||||||
use enum_map::Enum;
|
use enum_map::Enum;
|
||||||
use itertools::Itertools;
|
|
||||||
use pageserver_api::key::{
|
use pageserver_api::key::{
|
||||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||||
};
|
};
|
||||||
use pageserver_api::keyspace::SparseKeySpace;
|
use pageserver_api::keyspace::SparseKeySpace;
|
||||||
use pageserver_api::models::AuxFilePolicy;
|
use pageserver_api::models::AuxFilePolicy;
|
||||||
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::{debug, info, trace, warn};
|
use tracing::{debug, info, trace, warn};
|
||||||
use utils::bin_ser::DeserializeError;
|
use utils::bin_ser::DeserializeError;
|
||||||
use utils::pausable_failpoint;
|
use utils::pausable_failpoint;
|
||||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
|
||||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||||
|
|
||||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||||
@@ -170,10 +168,13 @@ impl Timeline {
|
|||||||
DatadirModification {
|
DatadirModification {
|
||||||
tline: self,
|
tline: self,
|
||||||
pending_lsns: Vec::new(),
|
pending_lsns: Vec::new(),
|
||||||
pending_updates: HashMap::new(),
|
pending_metadata_pages: HashMap::new(),
|
||||||
|
pending_data_pages: Vec::new(),
|
||||||
|
pending_zero_data_pages: Default::default(),
|
||||||
pending_deletions: Vec::new(),
|
pending_deletions: Vec::new(),
|
||||||
pending_nblocks: 0,
|
pending_nblocks: 0,
|
||||||
pending_directory_entries: Vec::new(),
|
pending_directory_entries: Vec::new(),
|
||||||
|
pending_bytes: 0,
|
||||||
lsn,
|
lsn,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -727,7 +728,21 @@ impl Timeline {
|
|||||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||||
let current_policy = self.last_aux_file_policy.load();
|
let current_policy = self.last_aux_file_policy.load();
|
||||||
match current_policy {
|
match current_policy {
|
||||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
Some(AuxFilePolicy::V1) => {
|
||||||
|
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||||
|
let empty_str = if res.is_empty() { ", empty" } else { "" };
|
||||||
|
warn!(
|
||||||
|
"this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
|
||||||
|
);
|
||||||
|
Ok(res)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||||
|
if !res.is_empty() {
|
||||||
|
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
|
||||||
|
}
|
||||||
|
Ok(res)
|
||||||
|
}
|
||||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||||
Some(AuxFilePolicy::CrossValidation) => {
|
Some(AuxFilePolicy::CrossValidation) => {
|
||||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||||
@@ -1006,9 +1021,10 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// DatadirModification represents an operation to ingest an atomic set of
|
/// DatadirModification represents an operation to ingest an atomic set of
|
||||||
/// updates to the repository. It is created by the 'begin_record'
|
/// updates to the repository.
|
||||||
/// function. It is called for each WAL record, so that all the modifications
|
///
|
||||||
/// by a one WAL record appear atomic.
|
/// It is created by the 'begin_record' function. It is called for each WAL
|
||||||
|
/// record, so that all the modifications by a one WAL record appear atomic.
|
||||||
pub struct DatadirModification<'a> {
|
pub struct DatadirModification<'a> {
|
||||||
/// The timeline this modification applies to. You can access this to
|
/// The timeline this modification applies to. You can access this to
|
||||||
/// read the state, but note that any pending updates are *not* reflected
|
/// read the state, but note that any pending updates are *not* reflected
|
||||||
@@ -1022,21 +1038,51 @@ pub struct DatadirModification<'a> {
|
|||||||
// The put-functions add the modifications here, and they are flushed to the
|
// The put-functions add the modifications here, and they are flushed to the
|
||||||
// underlying key-value store by the 'finish' function.
|
// underlying key-value store by the 'finish' function.
|
||||||
pending_lsns: Vec<Lsn>,
|
pending_lsns: Vec<Lsn>,
|
||||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
|
||||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||||
pending_nblocks: i64,
|
pending_nblocks: i64,
|
||||||
|
|
||||||
|
/// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
|
||||||
|
/// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
|
||||||
|
pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
|
||||||
|
|
||||||
|
/// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
|
||||||
|
/// which keys are stored here.
|
||||||
|
pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
|
||||||
|
|
||||||
|
// Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
|
||||||
|
// if we encounter a write from postgres in the same wal record, we will drop this entry.
|
||||||
|
//
|
||||||
|
// Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
|
||||||
|
// at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
|
||||||
|
pending_zero_data_pages: HashSet<CompactKey>,
|
||||||
|
|
||||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||||
/// if it was updated in this modification.
|
/// if it was updated in this modification.
|
||||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||||
|
|
||||||
|
/// An **approximation** of how large our EphemeralFile write will be when committed.
|
||||||
|
pending_bytes: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DatadirModification<'a> {
|
impl<'a> DatadirModification<'a> {
|
||||||
|
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
|
||||||
|
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
|
||||||
|
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
|
||||||
|
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
|
||||||
|
|
||||||
/// Get the current lsn
|
/// Get the current lsn
|
||||||
pub(crate) fn get_lsn(&self) -> Lsn {
|
pub(crate) fn get_lsn(&self) -> Lsn {
|
||||||
self.lsn
|
self.lsn
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn approx_pending_bytes(&self) -> usize {
|
||||||
|
self.pending_bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn has_dirty_data_pages(&self) -> bool {
|
||||||
|
(!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
|
||||||
|
}
|
||||||
|
|
||||||
/// Set the current lsn
|
/// Set the current lsn
|
||||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||||
ensure!(
|
ensure!(
|
||||||
@@ -1045,6 +1091,10 @@ impl<'a> DatadirModification<'a> {
|
|||||||
lsn,
|
lsn,
|
||||||
self.lsn
|
self.lsn
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// If we are advancing LSN, then state from previous wal record should have been flushed.
|
||||||
|
assert!(self.pending_zero_data_pages.is_empty());
|
||||||
|
|
||||||
if lsn > self.lsn {
|
if lsn > self.lsn {
|
||||||
self.pending_lsns.push(self.lsn);
|
self.pending_lsns.push(self.lsn);
|
||||||
self.lsn = lsn;
|
self.lsn = lsn;
|
||||||
@@ -1052,6 +1102,17 @@ impl<'a> DatadirModification<'a> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
|
||||||
|
/// keys that represent literal blocks that postgres can read. So data includes relation blocks and
|
||||||
|
/// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
|
||||||
|
///
|
||||||
|
/// The distinction is important because data keys are handled on a fast path where dirty writes are
|
||||||
|
/// not readable until this modification is committed, whereas metadata keys are visible for read
|
||||||
|
/// via [`Self::get`] as soon as their record has been ingested.
|
||||||
|
fn is_data_key(key: &Key) -> bool {
|
||||||
|
key.is_rel_block_key() || key.is_slru_block_key()
|
||||||
|
}
|
||||||
|
|
||||||
/// Initialize a completely new repository.
|
/// Initialize a completely new repository.
|
||||||
///
|
///
|
||||||
/// This inserts the directory metadata entries that are assumed to
|
/// This inserts the directory metadata entries that are assumed to
|
||||||
@@ -1144,6 +1205,13 @@ impl<'a> DatadirModification<'a> {
|
|||||||
img: Bytes,
|
img: Bytes,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||||
|
let key = rel_block_to_key(rel, blknum);
|
||||||
|
if !key.is_valid_key_on_write_path() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"the request contains data not supported by pageserver at {}",
|
||||||
|
key
|
||||||
|
);
|
||||||
|
}
|
||||||
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1155,10 +1223,63 @@ impl<'a> DatadirModification<'a> {
|
|||||||
blknum: BlockNumber,
|
blknum: BlockNumber,
|
||||||
img: Bytes,
|
img: Bytes,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
|
let key = slru_block_to_key(kind, segno, blknum);
|
||||||
|
if !key.is_valid_key_on_write_path() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"the request contains data not supported by pageserver at {}",
|
||||||
|
key
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.put(key, Value::Image(img));
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn put_rel_page_image_zero(
|
||||||
|
&mut self,
|
||||||
|
rel: RelTag,
|
||||||
|
blknum: BlockNumber,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||||
|
let key = rel_block_to_key(rel, blknum);
|
||||||
|
if !key.is_valid_key_on_write_path() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"the request contains data not supported by pageserver: {} @ {}",
|
||||||
|
key,
|
||||||
|
self.lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.pending_zero_data_pages.insert(key.to_compact());
|
||||||
|
self.pending_bytes += ZERO_PAGE.len();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn put_slru_page_image_zero(
|
||||||
|
&mut self,
|
||||||
|
kind: SlruKind,
|
||||||
|
segno: u32,
|
||||||
|
blknum: BlockNumber,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let key = slru_block_to_key(kind, segno, blknum);
|
||||||
|
if !key.is_valid_key_on_write_path() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"the request contains data not supported by pageserver: {} @ {}",
|
||||||
|
key,
|
||||||
|
self.lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.pending_zero_data_pages.insert(key.to_compact());
|
||||||
|
self.pending_bytes += ZERO_PAGE.len();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Call this at the end of each WAL record.
|
||||||
|
pub(crate) fn on_record_end(&mut self) {
|
||||||
|
let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
|
||||||
|
for key in pending_zero_data_pages {
|
||||||
|
self.put_data(key, Value::Image(ZERO_PAGE.clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||||
pub async fn put_relmap_file(
|
pub async fn put_relmap_file(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -1576,6 +1697,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if aux_files_key_v1.is_empty() {
|
if aux_files_key_v1.is_empty() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
|
warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
|
||||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||||
Some(AuxFilePolicy::V1)
|
Some(AuxFilePolicy::V1)
|
||||||
}
|
}
|
||||||
@@ -1756,7 +1878,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||||
/// for bulk import, where you are just loading data pages and won't try to
|
/// for bulk import, where you are just loading data pages and won't try to
|
||||||
/// modify the same pages twice.
|
/// modify the same pages twice.
|
||||||
pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||||
// to scan through the pending_updates list.
|
// to scan through the pending_updates list.
|
||||||
let pending_nblocks = self.pending_nblocks;
|
let pending_nblocks = self.pending_nblocks;
|
||||||
@@ -1767,23 +1889,12 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let mut writer = self.tline.writer().await;
|
let mut writer = self.tline.writer().await;
|
||||||
|
|
||||||
// Flush relation and SLRU data blocks, keep metadata.
|
// Flush relation and SLRU data blocks, keep metadata.
|
||||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
|
||||||
for (key, values) in self.pending_updates.drain() {
|
|
||||||
for (lsn, value) in values {
|
|
||||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
|
||||||
// This bails out on first error without modifying pending_updates.
|
|
||||||
// That's Ok, cf this function's doc comment.
|
|
||||||
writer.put(key, lsn, &value, ctx).await?;
|
|
||||||
} else {
|
|
||||||
retained_pending_updates
|
|
||||||
.entry(key)
|
|
||||||
.or_default()
|
|
||||||
.push((lsn, value));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.pending_updates = retained_pending_updates;
|
// This bails out on first error without modifying pending_updates.
|
||||||
|
// That's Ok, cf this function's doc comment.
|
||||||
|
writer.put_batch(pending_data_pages, ctx).await?;
|
||||||
|
self.pending_bytes = 0;
|
||||||
|
|
||||||
if pending_nblocks != 0 {
|
if pending_nblocks != 0 {
|
||||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||||
@@ -1803,23 +1914,31 @@ impl<'a> DatadirModification<'a> {
|
|||||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||||
///
|
///
|
||||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||||
|
// Commit should never be called mid-wal-record
|
||||||
|
assert!(self.pending_zero_data_pages.is_empty());
|
||||||
|
|
||||||
let mut writer = self.tline.writer().await;
|
let mut writer = self.tline.writer().await;
|
||||||
|
|
||||||
let pending_nblocks = self.pending_nblocks;
|
let pending_nblocks = self.pending_nblocks;
|
||||||
self.pending_nblocks = 0;
|
self.pending_nblocks = 0;
|
||||||
|
|
||||||
if !self.pending_updates.is_empty() {
|
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||||
// so we do that first.
|
// this to do efficient updates to its index.
|
||||||
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
let mut write_batch = std::mem::take(&mut self.pending_data_pages);
|
||||||
self.pending_updates
|
|
||||||
.drain()
|
|
||||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
|
||||||
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
|
||||||
VecMapOrdering::GreaterOrEqual,
|
|
||||||
);
|
|
||||||
|
|
||||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
write_batch.extend(
|
||||||
|
self.pending_metadata_pages
|
||||||
|
.drain()
|
||||||
|
.flat_map(|(key, values)| {
|
||||||
|
values
|
||||||
|
.into_iter()
|
||||||
|
.map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
if !write_batch.is_empty() {
|
||||||
|
writer.put_batch(write_batch, ctx).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if !self.pending_deletions.is_empty() {
|
if !self.pending_deletions.is_empty() {
|
||||||
@@ -1844,37 +1963,64 @@ impl<'a> DatadirModification<'a> {
|
|||||||
writer.update_directory_entries_count(kind, count as u64);
|
writer.update_directory_entries_count(kind, count as u64);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.pending_bytes = 0;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn len(&self) -> usize {
|
pub(crate) fn len(&self) -> usize {
|
||||||
self.pending_updates.len() + self.pending_deletions.len()
|
self.pending_metadata_pages.len()
|
||||||
|
+ self.pending_data_pages.len()
|
||||||
|
+ self.pending_deletions.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Internal helper functions to batch the modifications
|
/// Read a page from the Timeline we are writing to. For metadata pages, this passes through
|
||||||
|
/// a cache in Self, which makes writes earlier in this modification visible to WAL records later
|
||||||
|
/// in the modification.
|
||||||
|
///
|
||||||
|
/// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
|
||||||
|
/// page must ensure that the pages they read are already committed in Timeline, for example
|
||||||
|
/// DB create operations are always preceded by a call to commit(). This is special cased because
|
||||||
|
/// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
|
||||||
|
/// and not data pages.
|
||||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||||
// Have we already updated the same key? Read the latest pending updated
|
if !Self::is_data_key(&key) {
|
||||||
// version in that case.
|
// Have we already updated the same key? Read the latest pending updated
|
||||||
//
|
// version in that case.
|
||||||
// Note: we don't check pending_deletions. It is an error to request a
|
//
|
||||||
// value that has been removed, deletion only avoids leaking storage.
|
// Note: we don't check pending_deletions. It is an error to request a
|
||||||
if let Some(values) = self.pending_updates.get(&key) {
|
// value that has been removed, deletion only avoids leaking storage.
|
||||||
if let Some((_, value)) = values.last() {
|
if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
|
||||||
return if let Value::Image(img) = value {
|
if let Some((_, _, value)) = values.last() {
|
||||||
Ok(img.clone())
|
return if let Value::Image(img) = value {
|
||||||
} else {
|
Ok(img.clone())
|
||||||
// Currently, we never need to read back a WAL record that we
|
} else {
|
||||||
// inserted in the same "transaction". All the metadata updates
|
// Currently, we never need to read back a WAL record that we
|
||||||
// work directly with Images, and we never need to read actual
|
// inserted in the same "transaction". All the metadata updates
|
||||||
// data pages. We could handle this if we had to, by calling
|
// work directly with Images, and we never need to read actual
|
||||||
// the walredo manager, but let's keep it simple for now.
|
// data pages. We could handle this if we had to, by calling
|
||||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
// the walredo manager, but let's keep it simple for now.
|
||||||
"unexpected pending WAL record"
|
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||||
)))
|
"unexpected pending WAL record"
|
||||||
};
|
)))
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// This is an expensive check, so we only do it in debug mode. If reading a data key,
|
||||||
|
// this key should never be present in pending_data_pages. We ensure this by committing
|
||||||
|
// modifications before ingesting DB create operations, which are the only kind that reads
|
||||||
|
// data pages during ingest.
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
for (dirty_key, _, _, _) in &self.pending_data_pages {
|
||||||
|
debug_assert!(&key.to_compact() != dirty_key);
|
||||||
|
}
|
||||||
|
|
||||||
|
debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Metadata page cache miss, or we're reading a data page.
|
||||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||||
self.tline.get(key, lsn, ctx).await
|
self.tline.get(key, lsn, ctx).await
|
||||||
}
|
}
|
||||||
@@ -1886,15 +2032,48 @@ impl<'a> DatadirModification<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn put(&mut self, key: Key, val: Value) {
|
fn put(&mut self, key: Key, val: Value) {
|
||||||
let values = self.pending_updates.entry(key).or_default();
|
if Self::is_data_key(&key) {
|
||||||
|
self.put_data(key.to_compact(), val)
|
||||||
|
} else {
|
||||||
|
self.put_metadata(key.to_compact(), val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put_data(&mut self, key: CompactKey, val: Value) {
|
||||||
|
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||||
|
|
||||||
|
// If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This
|
||||||
|
// is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
|
||||||
|
// and the subsequent postgres-originating write
|
||||||
|
if self.pending_zero_data_pages.remove(&key) {
|
||||||
|
self.pending_bytes -= ZERO_PAGE.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
self.pending_bytes += val_serialized_size;
|
||||||
|
self.pending_data_pages
|
||||||
|
.push((key, self.lsn, val_serialized_size, val))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put_metadata(&mut self, key: CompactKey, val: Value) {
|
||||||
|
let values = self.pending_metadata_pages.entry(key).or_default();
|
||||||
// Replace the previous value if it exists at the same lsn
|
// Replace the previous value if it exists at the same lsn
|
||||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
||||||
if *last_lsn == self.lsn {
|
if *last_lsn == self.lsn {
|
||||||
|
// Update the pending_bytes contribution from this entry, and update the serialized size in place
|
||||||
|
self.pending_bytes -= *last_value_ser_size;
|
||||||
|
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
||||||
|
self.pending_bytes += *last_value_ser_size;
|
||||||
|
|
||||||
|
// Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
|
||||||
|
// have been generated by synthesized zero page writes prior to the first real write to a page.
|
||||||
*last_value = val;
|
*last_value = val;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
values.push((self.lsn, val));
|
|
||||||
|
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||||
|
self.pending_bytes += val_serialized_size;
|
||||||
|
values.push((self.lsn, val_serialized_size, val));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delete(&mut self, key_range: Range<Key>) {
|
fn delete(&mut self, key_range: Range<Key>) {
|
||||||
@@ -1905,6 +2084,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
|
|
||||||
/// This struct facilitates accessing either a committed key from the timeline at a
|
/// This struct facilitates accessing either a committed key from the timeline at a
|
||||||
/// specific LSN, or the latest uncommitted key from a pending modification.
|
/// specific LSN, or the latest uncommitted key from a pending modification.
|
||||||
|
///
|
||||||
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
|
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
|
||||||
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
|
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
|
||||||
/// need to look up the keys in the modification first before looking them up in the
|
/// need to look up the keys in the modification first before looking them up in the
|
||||||
@@ -2024,7 +2204,7 @@ mod tests {
|
|||||||
|
|
||||||
let (tenant, ctx) = harness.load().await;
|
let (tenant, ctx) = harness.load().await;
|
||||||
let tline = tenant
|
let tline = tenant
|
||||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||||
.await?;
|
.await?;
|
||||||
let tline = tline.raw_timeline().unwrap();
|
let tline = tline.raw_timeline().unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -60,32 +60,7 @@ pub mod mock {
|
|||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use tracing::log::info;
|
use tracing::log::info;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
pub use pageserver_api::config::statvfs::mock::Behavior;
|
||||||
#[serde(tag = "type")]
|
|
||||||
pub enum Behavior {
|
|
||||||
Success {
|
|
||||||
blocksize: u64,
|
|
||||||
total_blocks: u64,
|
|
||||||
name_filter: Option<utils::serde_regex::Regex>,
|
|
||||||
},
|
|
||||||
Failure {
|
|
||||||
mocked_error: MockedError,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
|
||||||
#[allow(clippy::upper_case_acronyms)]
|
|
||||||
pub enum MockedError {
|
|
||||||
EIO,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<MockedError> for nix::Error {
|
|
||||||
fn from(e: MockedError) -> Self {
|
|
||||||
match e {
|
|
||||||
MockedError::EIO => nix::Error::EIO,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
|
pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
|
||||||
info!("running mocked statvfs");
|
info!("running mocked statvfs");
|
||||||
@@ -116,6 +91,7 @@ pub mod mock {
|
|||||||
block_size: *blocksize,
|
block_size: *blocksize,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
|
Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -146,6 +146,12 @@ impl FromStr for TokioRuntimeMode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
|
||||||
|
env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
|
||||||
|
// the default 2MiB are insufficent, especially in debug mode
|
||||||
|
.unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
|
||||||
|
});
|
||||||
|
|
||||||
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
||||||
let thread_name = "pageserver-tokio";
|
let thread_name = "pageserver-tokio";
|
||||||
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
|
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
|
||||||
@@ -164,6 +170,7 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
|||||||
tokio::runtime::Builder::new_current_thread()
|
tokio::runtime::Builder::new_current_thread()
|
||||||
.thread_name(thread_name)
|
.thread_name(thread_name)
|
||||||
.enable_all()
|
.enable_all()
|
||||||
|
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
||||||
.build()
|
.build()
|
||||||
.expect("failed to create one single runtime")
|
.expect("failed to create one single runtime")
|
||||||
}
|
}
|
||||||
@@ -173,6 +180,7 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
|||||||
.thread_name(thread_name)
|
.thread_name(thread_name)
|
||||||
.enable_all()
|
.enable_all()
|
||||||
.worker_threads(num_workers.get())
|
.worker_threads(num_workers.get())
|
||||||
|
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
||||||
.build()
|
.build()
|
||||||
.expect("failed to create one multi-threaded runtime")
|
.expect("failed to create one multi-threaded runtime")
|
||||||
}
|
}
|
||||||
@@ -199,6 +207,7 @@ macro_rules! pageserver_runtime {
|
|||||||
.thread_name($name)
|
.thread_name($name)
|
||||||
.worker_threads(TOKIO_WORKER_THREADS.get())
|
.worker_threads(TOKIO_WORKER_THREADS.get())
|
||||||
.enable_all()
|
.enable_all()
|
||||||
|
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
||||||
.build()
|
.build()
|
||||||
.expect(std::concat!("Failed to create runtime ", $name))
|
.expect(std::concat!("Failed to create runtime ", $name))
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
|
//! Timeline repository implementation that keeps old data in layer files, and
|
||||||
|
//! the recent changes in ephemeral files.
|
||||||
//!
|
//!
|
||||||
//! Timeline repository implementation that keeps old data in files on disk, and
|
//! See tenant/*_layer.rs files. The functions here are responsible for locating
|
||||||
//! the recent changes in memory. See tenant/*_layer.rs files.
|
//! the correct layer for the get/put call, walking back the timeline branching
|
||||||
//! The functions here are responsible for locating the correct layer for the
|
//! history as needed.
|
||||||
//! get/put call, walking back the timeline branching history as needed.
|
|
||||||
//!
|
//!
|
||||||
//! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
|
//! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
|
||||||
//! directory. See docs/pageserver-storage.md for how the files are managed.
|
//! directory. See docs/pageserver-storage.md for how the files are managed.
|
||||||
@@ -501,6 +502,42 @@ impl Debug for DeleteTimelineError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(thiserror::Error)]
|
||||||
|
pub enum TimelineArchivalError {
|
||||||
|
#[error("NotFound")]
|
||||||
|
NotFound,
|
||||||
|
|
||||||
|
#[error("Timeout")]
|
||||||
|
Timeout,
|
||||||
|
|
||||||
|
#[error("ancestor is archived: {}", .0)]
|
||||||
|
HasArchivedParent(TimelineId),
|
||||||
|
|
||||||
|
#[error("HasUnarchivedChildren")]
|
||||||
|
HasUnarchivedChildren(Vec<TimelineId>),
|
||||||
|
|
||||||
|
#[error("Timeline archival is already in progress")]
|
||||||
|
AlreadyInProgress,
|
||||||
|
|
||||||
|
#[error(transparent)]
|
||||||
|
Other(#[from] anyhow::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Debug for TimelineArchivalError {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::NotFound => write!(f, "NotFound"),
|
||||||
|
Self::Timeout => write!(f, "Timeout"),
|
||||||
|
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
|
||||||
|
Self::HasUnarchivedChildren(c) => {
|
||||||
|
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
|
||||||
|
}
|
||||||
|
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
|
||||||
|
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub enum SetStoppingError {
|
pub enum SetStoppingError {
|
||||||
AlreadyStopping(completion::Barrier),
|
AlreadyStopping(completion::Barrier),
|
||||||
Broken,
|
Broken,
|
||||||
@@ -845,6 +882,12 @@ impl Tenant {
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// TODO: should also be rejecting tenant conf changes that violate this check.
|
||||||
|
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
|
||||||
|
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let mut init_order = init_order;
|
let mut init_order = init_order;
|
||||||
// take the completion because initial tenant loading will complete when all of
|
// take the completion because initial tenant loading will complete when all of
|
||||||
// these tasks complete.
|
// these tasks complete.
|
||||||
@@ -1326,24 +1369,59 @@ impl Tenant {
|
|||||||
&self,
|
&self,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
state: TimelineArchivalState,
|
state: TimelineArchivalState,
|
||||||
) -> anyhow::Result<()> {
|
) -> Result<(), TimelineArchivalError> {
|
||||||
let timeline = self
|
info!("setting timeline archival config");
|
||||||
.get_timeline(timeline_id, false)
|
let timeline = {
|
||||||
.context("Cannot apply timeline archival config to inexistent timeline")?;
|
let timelines = self.timelines.lock().unwrap();
|
||||||
|
|
||||||
|
let Some(timeline) = timelines.get(&timeline_id) else {
|
||||||
|
return Err(TimelineArchivalError::NotFound);
|
||||||
|
};
|
||||||
|
|
||||||
|
if state == TimelineArchivalState::Unarchived {
|
||||||
|
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
|
||||||
|
if ancestor_timeline.is_archived() == Some(true) {
|
||||||
|
return Err(TimelineArchivalError::HasArchivedParent(
|
||||||
|
ancestor_timeline.timeline_id,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that there are no non-archived child timelines
|
||||||
|
let children: Vec<TimelineId> = timelines
|
||||||
|
.iter()
|
||||||
|
.filter_map(|(id, entry)| {
|
||||||
|
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if entry.is_archived() == Some(true) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(*id)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if !children.is_empty() && state == TimelineArchivalState::Archived {
|
||||||
|
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
|
||||||
|
}
|
||||||
|
Arc::clone(timeline)
|
||||||
|
};
|
||||||
|
|
||||||
let upload_needed = timeline
|
let upload_needed = timeline
|
||||||
.remote_client
|
.remote_client
|
||||||
.schedule_index_upload_for_timeline_archival_state(state)?;
|
.schedule_index_upload_for_timeline_archival_state(state)?;
|
||||||
|
|
||||||
if upload_needed {
|
if upload_needed {
|
||||||
|
info!("Uploading new state");
|
||||||
const MAX_WAIT: Duration = Duration::from_secs(10);
|
const MAX_WAIT: Duration = Duration::from_secs(10);
|
||||||
let Ok(v) =
|
let Ok(v) =
|
||||||
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
||||||
else {
|
else {
|
||||||
tracing::warn!("reached timeout for waiting on upload queue");
|
tracing::warn!("reached timeout for waiting on upload queue");
|
||||||
bail!("reached timeout for upload queue flush");
|
return Err(TimelineArchivalError::Timeout);
|
||||||
};
|
};
|
||||||
v?;
|
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -3741,13 +3819,21 @@ impl Tenant {
|
|||||||
/// less than this (via eviction and on-demand downloads), but this function enables
|
/// less than this (via eviction and on-demand downloads), but this function enables
|
||||||
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
|
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
|
||||||
/// by keeping important things on local disk.
|
/// by keeping important things on local disk.
|
||||||
|
///
|
||||||
|
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
|
||||||
|
/// than they report here, due to layer eviction. Tenants with many active branches may
|
||||||
|
/// actually use more than they report here.
|
||||||
pub(crate) fn local_storage_wanted(&self) -> u64 {
|
pub(crate) fn local_storage_wanted(&self) -> u64 {
|
||||||
let mut wanted = 0;
|
|
||||||
let timelines = self.timelines.lock().unwrap();
|
let timelines = self.timelines.lock().unwrap();
|
||||||
for timeline in timelines.values() {
|
|
||||||
wanted += timeline.metrics.visible_physical_size_gauge.get();
|
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
|
||||||
}
|
// reflects the observation that on tenants with multiple large branches, typically only one
|
||||||
wanted
|
// of them is used actively enough to occupy space on disk.
|
||||||
|
timelines
|
||||||
|
.values()
|
||||||
|
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
||||||
|
.max()
|
||||||
|
.unwrap_or(0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -5932,10 +6018,10 @@ mod tests {
|
|||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// the default aux file policy to switch is v1 if not set by the admins
|
// the default aux file policy to switch is v2 if not set by the admins
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
harness.tenant_conf.switch_aux_file_policy,
|
harness.tenant_conf.switch_aux_file_policy,
|
||||||
AuxFilePolicy::V1
|
AuxFilePolicy::default_tenant_config()
|
||||||
);
|
);
|
||||||
let (tenant, ctx) = harness.load().await;
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
@@ -5979,8 +6065,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
tline.last_aux_file_policy.load(),
|
tline.last_aux_file_policy.load(),
|
||||||
Some(AuxFilePolicy::V1),
|
Some(AuxFilePolicy::V2),
|
||||||
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
|
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
|
||||||
);
|
);
|
||||||
|
|
||||||
// we can read everything from the storage
|
// we can read everything from the storage
|
||||||
@@ -6002,8 +6088,8 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
tline.last_aux_file_policy.load(),
|
tline.last_aux_file_policy.load(),
|
||||||
Some(AuxFilePolicy::V1),
|
Some(AuxFilePolicy::V2),
|
||||||
"keep v1 storage format when new files are written"
|
"keep v2 storage format when new files are written"
|
||||||
);
|
);
|
||||||
|
|
||||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||||
@@ -6019,7 +6105,7 @@ mod tests {
|
|||||||
|
|
||||||
// child copies the last flag even if that is not on remote storage yet
|
// child copies the last flag even if that is not on remote storage yet
|
||||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
|
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||||
|
|
||||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||||
@@ -7005,18 +7091,14 @@ mod tests {
|
|||||||
vec![
|
vec![
|
||||||
// Image layer at GC horizon
|
// Image layer at GC horizon
|
||||||
PersistentLayerKey {
|
PersistentLayerKey {
|
||||||
key_range: {
|
key_range: Key::MIN..Key::MAX,
|
||||||
let mut key = Key::MAX;
|
|
||||||
key.field6 -= 1;
|
|
||||||
Key::MIN..key
|
|
||||||
},
|
|
||||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||||
is_delta: false
|
is_delta: false
|
||||||
},
|
},
|
||||||
// The delta layer that is cut in the middle
|
// The delta layer below the horizon
|
||||||
PersistentLayerKey {
|
PersistentLayerKey {
|
||||||
key_range: get_key(3)..get_key(4),
|
key_range: get_key(3)..get_key(4),
|
||||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
lsn_range: Lsn(0x30)..Lsn(0x48),
|
||||||
is_delta: true
|
is_delta: true
|
||||||
},
|
},
|
||||||
// The delta3 layer that should not be picked for the compaction
|
// The delta3 layer that should not be picked for the compaction
|
||||||
@@ -7996,6 +8078,214 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||||
|
{
|
||||||
|
let harness =
|
||||||
|
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
|
||||||
|
.await?;
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
fn get_key(id: u32) -> Key {
|
||||||
|
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||||
|
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||||
|
key.field6 = id;
|
||||||
|
key
|
||||||
|
}
|
||||||
|
|
||||||
|
let img_layer = (0..10)
|
||||||
|
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||||
|
.collect_vec();
|
||||||
|
|
||||||
|
let delta1 = vec![
|
||||||
|
(
|
||||||
|
get_key(1),
|
||||||
|
Lsn(0x20),
|
||||||
|
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
get_key(1),
|
||||||
|
Lsn(0x28),
|
||||||
|
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||||
|
),
|
||||||
|
];
|
||||||
|
let delta2 = vec![
|
||||||
|
(
|
||||||
|
get_key(1),
|
||||||
|
Lsn(0x30),
|
||||||
|
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
get_key(1),
|
||||||
|
Lsn(0x38),
|
||||||
|
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
||||||
|
),
|
||||||
|
];
|
||||||
|
let delta3 = vec![
|
||||||
|
(
|
||||||
|
get_key(8),
|
||||||
|
Lsn(0x48),
|
||||||
|
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
get_key(9),
|
||||||
|
Lsn(0x48),
|
||||||
|
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||||
|
),
|
||||||
|
];
|
||||||
|
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline_with_layers(
|
||||||
|
TIMELINE_ID,
|
||||||
|
Lsn(0x10),
|
||||||
|
DEFAULT_PG_VERSION,
|
||||||
|
&ctx,
|
||||||
|
vec![
|
||||||
|
// delta1 and delta 2 only contain a single key but multiple updates
|
||||||
|
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
||||||
|
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
||||||
|
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
|
||||||
|
], // delta layers
|
||||||
|
vec![(Lsn(0x10), img_layer)], // image layers
|
||||||
|
Lsn(0x50),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
{
|
||||||
|
// Update GC info
|
||||||
|
let mut guard = tline.gc_info.write().unwrap();
|
||||||
|
*guard = GcInfo {
|
||||||
|
retain_lsns: vec![
|
||||||
|
(Lsn(0x10), tline.timeline_id),
|
||||||
|
(Lsn(0x20), tline.timeline_id),
|
||||||
|
],
|
||||||
|
cutoffs: GcCutoffs {
|
||||||
|
time: Lsn(0x30),
|
||||||
|
space: Lsn(0x30),
|
||||||
|
},
|
||||||
|
leases: Default::default(),
|
||||||
|
within_ancestor_pitr: false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
let expected_result = [
|
||||||
|
Bytes::from_static(b"value 0@0x10"),
|
||||||
|
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
||||||
|
Bytes::from_static(b"value 2@0x10"),
|
||||||
|
Bytes::from_static(b"value 3@0x10"),
|
||||||
|
Bytes::from_static(b"value 4@0x10"),
|
||||||
|
Bytes::from_static(b"value 5@0x10"),
|
||||||
|
Bytes::from_static(b"value 6@0x10"),
|
||||||
|
Bytes::from_static(b"value 7@0x10"),
|
||||||
|
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||||
|
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let expected_result_at_gc_horizon = [
|
||||||
|
Bytes::from_static(b"value 0@0x10"),
|
||||||
|
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
||||||
|
Bytes::from_static(b"value 2@0x10"),
|
||||||
|
Bytes::from_static(b"value 3@0x10"),
|
||||||
|
Bytes::from_static(b"value 4@0x10"),
|
||||||
|
Bytes::from_static(b"value 5@0x10"),
|
||||||
|
Bytes::from_static(b"value 6@0x10"),
|
||||||
|
Bytes::from_static(b"value 7@0x10"),
|
||||||
|
Bytes::from_static(b"value 8@0x10"),
|
||||||
|
Bytes::from_static(b"value 9@0x10"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let expected_result_at_lsn_20 = [
|
||||||
|
Bytes::from_static(b"value 0@0x10"),
|
||||||
|
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||||
|
Bytes::from_static(b"value 2@0x10"),
|
||||||
|
Bytes::from_static(b"value 3@0x10"),
|
||||||
|
Bytes::from_static(b"value 4@0x10"),
|
||||||
|
Bytes::from_static(b"value 5@0x10"),
|
||||||
|
Bytes::from_static(b"value 6@0x10"),
|
||||||
|
Bytes::from_static(b"value 7@0x10"),
|
||||||
|
Bytes::from_static(b"value 8@0x10"),
|
||||||
|
Bytes::from_static(b"value 9@0x10"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let expected_result_at_lsn_10 = [
|
||||||
|
Bytes::from_static(b"value 0@0x10"),
|
||||||
|
Bytes::from_static(b"value 1@0x10"),
|
||||||
|
Bytes::from_static(b"value 2@0x10"),
|
||||||
|
Bytes::from_static(b"value 3@0x10"),
|
||||||
|
Bytes::from_static(b"value 4@0x10"),
|
||||||
|
Bytes::from_static(b"value 5@0x10"),
|
||||||
|
Bytes::from_static(b"value 6@0x10"),
|
||||||
|
Bytes::from_static(b"value 7@0x10"),
|
||||||
|
Bytes::from_static(b"value 8@0x10"),
|
||||||
|
Bytes::from_static(b"value 9@0x10"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let verify_result = || async {
|
||||||
|
let gc_horizon = {
|
||||||
|
let gc_info = tline.gc_info.read().unwrap();
|
||||||
|
gc_info.cutoffs.time
|
||||||
|
};
|
||||||
|
for idx in 0..10 {
|
||||||
|
assert_eq!(
|
||||||
|
tline
|
||||||
|
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap(),
|
||||||
|
&expected_result[idx]
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
tline
|
||||||
|
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap(),
|
||||||
|
&expected_result_at_gc_horizon[idx]
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
tline
|
||||||
|
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap(),
|
||||||
|
&expected_result_at_lsn_20[idx]
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
tline
|
||||||
|
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap(),
|
||||||
|
&expected_result_at_lsn_10[idx]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
verify_result().await;
|
||||||
|
|
||||||
|
let cancel = CancellationToken::new();
|
||||||
|
let mut dryrun_flags = EnumSet::new();
|
||||||
|
dryrun_flags.insert(CompactFlags::DryRun);
|
||||||
|
|
||||||
|
tline
|
||||||
|
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
|
||||||
|
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
||||||
|
verify_result().await;
|
||||||
|
|
||||||
|
tline
|
||||||
|
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
verify_result().await;
|
||||||
|
|
||||||
|
// compact again
|
||||||
|
tline
|
||||||
|
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
verify_result().await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||||
|
|||||||
@@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
|||||||
|
|
||||||
/// The maximum size of blobs we support. The highest few bits
|
/// The maximum size of blobs we support. The highest few bits
|
||||||
/// are reserved for compression and other further uses.
|
/// are reserved for compression and other further uses.
|
||||||
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
|
pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
|
||||||
|
|
||||||
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||||
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||||
@@ -326,7 +326,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
||||||
} else {
|
} else {
|
||||||
// Write a 4-byte length header
|
// Write a 4-byte length header
|
||||||
if len > MAX_SUPPORTED_LEN {
|
if len > MAX_SUPPORTED_BLOB_LEN {
|
||||||
return (
|
return (
|
||||||
(
|
(
|
||||||
io_buf.slice_len(),
|
io_buf.slice_len(),
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
//! Low-level Block-oriented I/O functions
|
//! Low-level Block-oriented I/O functions
|
||||||
//!
|
//!
|
||||||
|
|
||||||
use super::ephemeral_file::EphemeralFile;
|
|
||||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||||
@@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> {
|
|||||||
/// Unlike traits, we also support the read function to be async though.
|
/// Unlike traits, we also support the read function to be async though.
|
||||||
pub(crate) enum BlockReaderRef<'a> {
|
pub(crate) enum BlockReaderRef<'a> {
|
||||||
FileBlockReader(&'a FileBlockReader<'a>),
|
FileBlockReader(&'a FileBlockReader<'a>),
|
||||||
EphemeralFile(&'a EphemeralFile),
|
|
||||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||||
Slice(&'a [u8]),
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> {
|
|||||||
use BlockReaderRef::*;
|
use BlockReaderRef::*;
|
||||||
match self {
|
match self {
|
||||||
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
||||||
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
|
|
||||||
Adapter(r) => r.read_blk(blknum, ctx).await,
|
Adapter(r) => r.read_blk(blknum, ctx).await,
|
||||||
Slice(s) => Self::read_blk_slice(s, blknum),
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
TestDisk(r) => r.read_blk(blknum),
|
TestDisk(r) => r.read_blk(blknum),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> BlockReaderRef<'a> {
|
|
||||||
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
|
|
||||||
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
|
|
||||||
let end = start.checked_add(PAGE_SZ).unwrap();
|
|
||||||
if end > slice.len() {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
std::io::ErrorKind::UnexpectedEof,
|
|
||||||
format!("slice too short, len={} end={}", slice.len(), end),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
let slice = &slice[start..end];
|
|
||||||
let page_sized: &[u8; PAGE_SZ] = slice
|
|
||||||
.try_into()
|
|
||||||
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
|
|
||||||
Ok(BlockLease::Slice(page_sized))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -9,11 +9,10 @@
|
|||||||
//! may lead to a data loss.
|
//! may lead to a data loss.
|
||||||
//!
|
//!
|
||||||
use anyhow::bail;
|
use anyhow::bail;
|
||||||
|
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||||
use pageserver_api::models::AuxFilePolicy;
|
use pageserver_api::models::AuxFilePolicy;
|
||||||
use pageserver_api::models::CompactionAlgorithm;
|
|
||||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||||
use pageserver_api::models::EvictionPolicy;
|
use pageserver_api::models::EvictionPolicy;
|
||||||
use pageserver_api::models::LsnLease;
|
|
||||||
use pageserver_api::models::{self, ThrottleConfig};
|
use pageserver_api::models::{self, ThrottleConfig};
|
||||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||||
use serde::de::IntoDeserializer;
|
use serde::de::IntoDeserializer;
|
||||||
@@ -23,50 +22,6 @@ use std::num::NonZeroU64;
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
|
|
||||||
pub mod defaults {
|
|
||||||
|
|
||||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
|
||||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
|
||||||
// which is good for now to trigger bugs.
|
|
||||||
// This parameter actually determines L0 layer file size.
|
|
||||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
|
||||||
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
|
|
||||||
|
|
||||||
// FIXME the below configs are only used by legacy algorithm. The new algorithm
|
|
||||||
// has different parameters.
|
|
||||||
|
|
||||||
// Target file size, when creating image and delta layers.
|
|
||||||
// This parameter determines L1 layer file size.
|
|
||||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
|
||||||
|
|
||||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
|
||||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
|
||||||
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
|
|
||||||
super::CompactionAlgorithm::Legacy;
|
|
||||||
|
|
||||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
|
||||||
|
|
||||||
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
|
|
||||||
// If there's a need to decrease this value, first make sure that GC
|
|
||||||
// doesn't hold a layer map write lock for non-trivial operations.
|
|
||||||
// Relevant: https://github.com/neondatabase/neon/issues/3394
|
|
||||||
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
|
|
||||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
|
||||||
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
|
||||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
|
|
||||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
|
||||||
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
|
|
||||||
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
|
|
||||||
// throughputs up to 1GiB/s per timeline.
|
|
||||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
|
||||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
|
||||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
|
||||||
// image layers should be created.
|
|
||||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
|
||||||
|
|
||||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||||
pub(crate) enum AttachmentMode {
|
pub(crate) enum AttachmentMode {
|
||||||
/// Our generation is current as far as we know, and as far as we know we are the only attached
|
/// Our generation is current as far as we know, and as far as we know we are the only attached
|
||||||
@@ -281,96 +236,20 @@ impl LocationConf {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A tenant's calcuated configuration, which is the result of merging a
|
impl Default for LocationConf {
|
||||||
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
// TODO: this should be removed once tenant loading can guarantee that we are never
|
||||||
///
|
// loading from a directory without a configuration.
|
||||||
/// For storing and transmitting individual tenant's configuration, see
|
// => tech debt since https://github.com/neondatabase/neon/issues/1555
|
||||||
/// TenantConfOpt.
|
fn default() -> Self {
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
Self {
|
||||||
pub struct TenantConf {
|
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
generation: Generation::none(),
|
||||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
attach_mode: AttachmentMode::Single,
|
||||||
// page server crashes.
|
}),
|
||||||
// This parameter actually determines L0 layer file size.
|
tenant_conf: TenantConfOpt::default(),
|
||||||
pub checkpoint_distance: u64,
|
shard: ShardIdentity::unsharded(),
|
||||||
// Inmemory layer is also flushed at least once in checkpoint_timeout to
|
}
|
||||||
// eventually upload WAL after activity is stopped.
|
}
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub checkpoint_timeout: Duration,
|
|
||||||
// Target file size, when creating image and delta layers.
|
|
||||||
// This parameter determines L1 layer file size.
|
|
||||||
pub compaction_target_size: u64,
|
|
||||||
// How often to check if there's compaction work to be done.
|
|
||||||
// Duration::ZERO means automatic compaction is disabled.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub compaction_period: Duration,
|
|
||||||
// Level0 delta layer threshold for compaction.
|
|
||||||
pub compaction_threshold: usize,
|
|
||||||
pub compaction_algorithm: CompactionAlgorithmSettings,
|
|
||||||
// Determines how much history is retained, to allow
|
|
||||||
// branching and read replicas at an older point in time.
|
|
||||||
// The unit is #of bytes of WAL.
|
|
||||||
// Page versions older than this are garbage collected away.
|
|
||||||
pub gc_horizon: u64,
|
|
||||||
// Interval at which garbage collection is triggered.
|
|
||||||
// Duration::ZERO means automatic GC is disabled
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub gc_period: Duration,
|
|
||||||
// Delta layer churn threshold to create L1 image layers.
|
|
||||||
pub image_creation_threshold: usize,
|
|
||||||
// Determines how much history is retained, to allow
|
|
||||||
// branching and read replicas at an older point in time.
|
|
||||||
// The unit is time.
|
|
||||||
// Page versions older than this are garbage collected away.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub pitr_interval: Duration,
|
|
||||||
/// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub walreceiver_connect_timeout: Duration,
|
|
||||||
/// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
|
|
||||||
/// A stalled safekeeper will be changed to a newer one when it appears.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub lagging_wal_timeout: Duration,
|
|
||||||
/// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
|
|
||||||
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
|
|
||||||
/// to avoid eager reconnects.
|
|
||||||
pub max_lsn_wal_lag: NonZeroU64,
|
|
||||||
pub eviction_policy: EvictionPolicy,
|
|
||||||
pub min_resident_size_override: Option<u64>,
|
|
||||||
// See the corresponding metric's help string.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
|
||||||
|
|
||||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
|
||||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
|
||||||
/// locations will use the heatmap uploaded by attached locations.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub heatmap_period: Duration,
|
|
||||||
|
|
||||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
|
||||||
pub lazy_slru_download: bool,
|
|
||||||
|
|
||||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
|
||||||
|
|
||||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
|
||||||
// Expresed in multiples of checkpoint distance.
|
|
||||||
pub image_layer_creation_check_threshold: u8,
|
|
||||||
|
|
||||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
|
||||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
|
||||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
|
||||||
/// file is written.
|
|
||||||
pub switch_aux_file_policy: AuxFilePolicy,
|
|
||||||
|
|
||||||
/// The length for an explicit LSN lease request.
|
|
||||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub lsn_lease_length: Duration,
|
|
||||||
|
|
||||||
/// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
|
|
||||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
|
||||||
#[serde(with = "humantime_serde")]
|
|
||||||
pub lsn_lease_length_for_ts: Duration,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Same as TenantConf, but this struct preserves the information about
|
/// Same as TenantConf, but this struct preserves the information about
|
||||||
@@ -545,51 +424,6 @@ impl TenantConfOpt {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for TenantConf {
|
|
||||||
fn default() -> Self {
|
|
||||||
use defaults::*;
|
|
||||||
Self {
|
|
||||||
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
|
|
||||||
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
|
|
||||||
.expect("cannot parse default checkpoint timeout"),
|
|
||||||
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
|
|
||||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
|
||||||
.expect("cannot parse default compaction period"),
|
|
||||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
|
||||||
compaction_algorithm: CompactionAlgorithmSettings {
|
|
||||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
|
||||||
},
|
|
||||||
gc_horizon: DEFAULT_GC_HORIZON,
|
|
||||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
|
||||||
.expect("cannot parse default gc period"),
|
|
||||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
|
||||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
|
||||||
.expect("cannot parse default PITR interval"),
|
|
||||||
walreceiver_connect_timeout: humantime::parse_duration(
|
|
||||||
DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
|
|
||||||
)
|
|
||||||
.expect("cannot parse default walreceiver connect timeout"),
|
|
||||||
lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
|
|
||||||
.expect("cannot parse default walreceiver lagging wal timeout"),
|
|
||||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
|
||||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
|
||||||
eviction_policy: EvictionPolicy::NoEviction,
|
|
||||||
min_resident_size_override: None,
|
|
||||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
|
||||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
|
||||||
)
|
|
||||||
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
|
||||||
heatmap_period: Duration::ZERO,
|
|
||||||
lazy_slru_download: false,
|
|
||||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
|
||||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
|
||||||
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
|
|
||||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
|
||||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||||
type Error = anyhow::Error;
|
type Error = anyhow::Error;
|
||||||
|
|
||||||
@@ -618,7 +452,8 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
|
|||||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
||||||
}
|
}
|
||||||
toml_edit::Item::Table(table) => {
|
toml_edit::Item::Table(table) => {
|
||||||
let deserializer = toml_edit::de::Deserializer::new(table.into());
|
let deserializer =
|
||||||
|
toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
|
||||||
return serde_path_to_error::deserialize(deserializer)
|
return serde_path_to_error::deserialize(deserializer)
|
||||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
//! Implementation of append-only file data structure
|
//! Implementation of append-only file data structure
|
||||||
//! used to keep in-memory layers spilled on disk.
|
//! used to keep in-memory layers spilled on disk.
|
||||||
|
|
||||||
|
use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::page_cache;
|
use crate::page_cache;
|
||||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
|
||||||
use crate::virtual_file::{self, VirtualFile};
|
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
|
||||||
|
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||||
|
use crate::virtual_file::owned_buffers_io::write::Buffer;
|
||||||
|
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
|
||||||
|
use bytes::BytesMut;
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
|
use num_traits::Num;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
|
use tokio_epoll_uring::{BoundedBuf, Slice};
|
||||||
|
use tracing::error;
|
||||||
|
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::sync::atomic::AtomicU64;
|
use std::sync::atomic::AtomicU64;
|
||||||
@@ -16,12 +24,17 @@ use utils::id::TimelineId;
|
|||||||
pub struct EphemeralFile {
|
pub struct EphemeralFile {
|
||||||
_tenant_shard_id: TenantShardId,
|
_tenant_shard_id: TenantShardId,
|
||||||
_timeline_id: TimelineId,
|
_timeline_id: TimelineId,
|
||||||
|
page_cache_file_id: page_cache::FileId,
|
||||||
rw: page_caching::RW,
|
bytes_written: u64,
|
||||||
|
buffered_writer: owned_buffers_io::write::BufferedWriter<
|
||||||
|
BytesMut,
|
||||||
|
size_tracking_writer::Writer<VirtualFile>,
|
||||||
|
>,
|
||||||
|
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
|
||||||
|
_gate_guard: utils::sync::gate::GateGuard,
|
||||||
}
|
}
|
||||||
|
|
||||||
mod page_caching;
|
const TAIL_SZ: usize = 64 * 1024;
|
||||||
mod zero_padded_read_write;
|
|
||||||
|
|
||||||
impl EphemeralFile {
|
impl EphemeralFile {
|
||||||
pub async fn create(
|
pub async fn create(
|
||||||
@@ -51,60 +64,178 @@ impl EphemeralFile {
|
|||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
|
||||||
|
|
||||||
Ok(EphemeralFile {
|
Ok(EphemeralFile {
|
||||||
_tenant_shard_id: tenant_shard_id,
|
_tenant_shard_id: tenant_shard_id,
|
||||||
_timeline_id: timeline_id,
|
_timeline_id: timeline_id,
|
||||||
rw: page_caching::RW::new(file, gate_guard),
|
page_cache_file_id,
|
||||||
|
bytes_written: 0,
|
||||||
|
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
|
||||||
|
size_tracking_writer::Writer::new(file),
|
||||||
|
BytesMut::with_capacity(TAIL_SZ),
|
||||||
|
),
|
||||||
|
_gate_guard: gate_guard,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for EphemeralFile {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// unlink the file
|
||||||
|
// we are clear to do this, because we have entered a gate
|
||||||
|
let path = &self.buffered_writer.as_inner().as_inner().path;
|
||||||
|
let res = std::fs::remove_file(path);
|
||||||
|
if let Err(e) = res {
|
||||||
|
if e.kind() != std::io::ErrorKind::NotFound {
|
||||||
|
// just never log the not found errors, we cannot do anything for them; on detach
|
||||||
|
// the tenant directory is already gone.
|
||||||
|
//
|
||||||
|
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||||
|
error!("could not remove ephemeral file '{path}': {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EphemeralFile {
|
||||||
pub(crate) fn len(&self) -> u64 {
|
pub(crate) fn len(&self) -> u64 {
|
||||||
self.rw.bytes_written()
|
self.bytes_written
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
|
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||||
self.rw.page_cache_file_id()
|
self.page_cache_file_id
|
||||||
}
|
}
|
||||||
|
|
||||||
/// See [`self::page_caching::RW::load_to_vec`].
|
|
||||||
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||||
self.rw.load_to_vec(ctx).await
|
let size = self.len().into_usize();
|
||||||
|
let vec = Vec::with_capacity(size);
|
||||||
|
let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
|
||||||
|
assert_eq!(nread, size);
|
||||||
|
let vec = slice.into_inner();
|
||||||
|
assert_eq!(vec.len(), nread);
|
||||||
|
assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
|
||||||
|
Ok(vec)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn read_blk(
|
/// Returns the offset at which the first byte of the input was written, for use
|
||||||
&self,
|
/// in constructing indices over the written value.
|
||||||
blknum: u32,
|
///
|
||||||
ctx: &RequestContext,
|
/// Panics if the write is short because there's no way we can recover from that.
|
||||||
) -> Result<BlockLease, io::Error> {
|
/// TODO: make upstack handle this as an error.
|
||||||
self.rw.read_blk(blknum, ctx).await
|
pub(crate) async fn write_raw(
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) async fn write_blob(
|
|
||||||
&mut self,
|
&mut self,
|
||||||
srcbuf: &[u8],
|
srcbuf: &[u8],
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<u64, io::Error> {
|
) -> std::io::Result<u64> {
|
||||||
let pos = self.rw.bytes_written();
|
let pos = self.bytes_written;
|
||||||
|
|
||||||
// Write the length field
|
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
|
||||||
if srcbuf.len() < 0x80 {
|
std::io::Error::new(
|
||||||
// short one-byte length header
|
std::io::ErrorKind::Other,
|
||||||
let len_buf = [srcbuf.len() as u8];
|
format!(
|
||||||
|
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
|
||||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
srcbuf_len = srcbuf.len(),
|
||||||
} else {
|
),
|
||||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
)
|
||||||
len_buf[0] |= 0x80;
|
})?;
|
||||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write the payload
|
// Write the payload
|
||||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
let nwritten = self
|
||||||
|
.buffered_writer
|
||||||
|
.write_buffered_borrowed(srcbuf, ctx)
|
||||||
|
.await?;
|
||||||
|
assert_eq!(
|
||||||
|
nwritten,
|
||||||
|
srcbuf.len(),
|
||||||
|
"buffered writer has no short writes"
|
||||||
|
);
|
||||||
|
|
||||||
|
self.bytes_written = new_bytes_written;
|
||||||
|
|
||||||
Ok(pos)
|
Ok(pos)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
|
||||||
|
async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
|
||||||
|
&'b self,
|
||||||
|
start: u64,
|
||||||
|
dst: tokio_epoll_uring::Slice<B>,
|
||||||
|
ctx: &'a RequestContext,
|
||||||
|
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
|
||||||
|
let file_size_tracking_writer = self.buffered_writer.as_inner();
|
||||||
|
let flushed_offset = file_size_tracking_writer.bytes_written();
|
||||||
|
|
||||||
|
let buffer = self.buffered_writer.inspect_buffer();
|
||||||
|
let buffered = &buffer[0..buffer.pending()];
|
||||||
|
|
||||||
|
let dst_cap = dst.bytes_total().into_u64();
|
||||||
|
let end = {
|
||||||
|
// saturating_add is correct here because the max file size is u64::MAX, so,
|
||||||
|
// if start + dst.len() > u64::MAX, then we know it will be a short read
|
||||||
|
let mut end: u64 = start.saturating_add(dst_cap);
|
||||||
|
if end > self.bytes_written {
|
||||||
|
end = self.bytes_written;
|
||||||
|
}
|
||||||
|
end
|
||||||
|
};
|
||||||
|
|
||||||
|
// inclusive, exclusive
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct Range<N>(N, N);
|
||||||
|
impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
|
||||||
|
fn len(&self) -> N {
|
||||||
|
if self.0 > self.1 {
|
||||||
|
N::zero()
|
||||||
|
} else {
|
||||||
|
self.1 - self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let written_range = Range(start, std::cmp::min(end, flushed_offset));
|
||||||
|
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
|
||||||
|
|
||||||
|
let dst = if written_range.len() > 0 {
|
||||||
|
let file: &VirtualFile = file_size_tracking_writer.as_inner();
|
||||||
|
let bounds = dst.bounds();
|
||||||
|
let slice = file
|
||||||
|
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
|
||||||
|
.await?;
|
||||||
|
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
|
||||||
|
} else {
|
||||||
|
dst
|
||||||
|
};
|
||||||
|
|
||||||
|
let dst = if buffered_range.len() > 0 {
|
||||||
|
let offset_in_buffer = buffered_range
|
||||||
|
.0
|
||||||
|
.checked_sub(flushed_offset)
|
||||||
|
.unwrap()
|
||||||
|
.into_usize();
|
||||||
|
let to_copy =
|
||||||
|
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
|
||||||
|
let bounds = dst.bounds();
|
||||||
|
let mut view = dst.slice({
|
||||||
|
let start = written_range.len().into_usize();
|
||||||
|
let end = start
|
||||||
|
.checked_add(buffered_range.len().into_usize())
|
||||||
|
.unwrap();
|
||||||
|
start..end
|
||||||
|
});
|
||||||
|
view.as_mut_rust_slice_full_zeroed()
|
||||||
|
.copy_from_slice(to_copy);
|
||||||
|
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
|
||||||
|
} else {
|
||||||
|
dst
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
|
||||||
|
|
||||||
|
Ok((dst, (end - start).into_usize()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Does the given filename look like an ephemeral file?
|
/// Does the given filename look like an ephemeral file?
|
||||||
pub fn is_ephemeral_file(filename: &str) -> bool {
|
pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||||
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
||||||
@@ -114,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BlockReader for EphemeralFile {
|
|
||||||
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
|
|
||||||
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use rand::Rng;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::context::DownloadBehavior;
|
use crate::context::DownloadBehavior;
|
||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant::block_io::BlockReaderRef;
|
|
||||||
use rand::{thread_rng, RngCore};
|
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
@@ -157,69 +282,6 @@ mod tests {
|
|||||||
Ok((conf, tenant_shard_id, timeline_id, ctx))
|
Ok((conf, tenant_shard_id, timeline_id, ctx))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
|
||||||
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
|
|
||||||
|
|
||||||
let gate = utils::sync::gate::Gate::default();
|
|
||||||
|
|
||||||
let entered = gate.enter().unwrap();
|
|
||||||
|
|
||||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
|
|
||||||
|
|
||||||
let pos_foo = file.write_blob(b"foo", &ctx).await?;
|
|
||||||
assert_eq!(
|
|
||||||
b"foo",
|
|
||||||
file.block_cursor()
|
|
||||||
.read_blob(pos_foo, &ctx)
|
|
||||||
.await?
|
|
||||||
.as_slice()
|
|
||||||
);
|
|
||||||
let pos_bar = file.write_blob(b"bar", &ctx).await?;
|
|
||||||
assert_eq!(
|
|
||||||
b"foo",
|
|
||||||
file.block_cursor()
|
|
||||||
.read_blob(pos_foo, &ctx)
|
|
||||||
.await?
|
|
||||||
.as_slice()
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
b"bar",
|
|
||||||
file.block_cursor()
|
|
||||||
.read_blob(pos_bar, &ctx)
|
|
||||||
.await?
|
|
||||||
.as_slice()
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut blobs = Vec::new();
|
|
||||||
for i in 0..10000 {
|
|
||||||
let data = Vec::from(format!("blob{}", i).as_bytes());
|
|
||||||
let pos = file.write_blob(&data, &ctx).await?;
|
|
||||||
blobs.push((pos, data));
|
|
||||||
}
|
|
||||||
// also test with a large blobs
|
|
||||||
for i in 0..100 {
|
|
||||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
|
||||||
let pos = file.write_blob(&data, &ctx).await?;
|
|
||||||
blobs.push((pos, data));
|
|
||||||
}
|
|
||||||
|
|
||||||
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
|
|
||||||
for (pos, expected) in blobs {
|
|
||||||
let actual = cursor.read_blob(pos, &ctx).await?;
|
|
||||||
assert_eq!(actual, expected);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test a large blob that spans multiple pages
|
|
||||||
let mut large_data = vec![0; 20000];
|
|
||||||
thread_rng().fill_bytes(&mut large_data);
|
|
||||||
let pos_large = file.write_blob(&large_data, &ctx).await?;
|
|
||||||
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
|
|
||||||
assert_eq!(result, large_data);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn ephemeral_file_holds_gate_open() {
|
async fn ephemeral_file_holds_gate_open() {
|
||||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
|
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
|
||||||
@@ -253,4 +315,151 @@ mod tests {
|
|||||||
.expect("closing completes right away")
|
.expect("closing completes right away")
|
||||||
.expect("closing does not panic");
|
.expect("closing does not panic");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_ephemeral_file_basics() {
|
||||||
|
let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
|
||||||
|
|
||||||
|
let gate = utils::sync::gate::Gate::default();
|
||||||
|
|
||||||
|
let mut file =
|
||||||
|
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||||
|
|
||||||
|
let write_nbytes = cap + cap / 2;
|
||||||
|
|
||||||
|
let content: Vec<u8> = rand::thread_rng()
|
||||||
|
.sample_iter(rand::distributions::Standard)
|
||||||
|
.take(write_nbytes)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut value_offsets = Vec::new();
|
||||||
|
for i in 0..write_nbytes {
|
||||||
|
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
|
||||||
|
value_offsets.push(off);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(file.len() as usize == write_nbytes);
|
||||||
|
for i in 0..write_nbytes {
|
||||||
|
assert_eq!(value_offsets[i], i.into_u64());
|
||||||
|
let buf = Vec::with_capacity(1);
|
||||||
|
let (buf_slice, nread) = file
|
||||||
|
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let buf = buf_slice.into_inner();
|
||||||
|
assert_eq!(nread, 1);
|
||||||
|
assert_eq!(&buf, &content[i..i + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
let file_contents =
|
||||||
|
std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
|
||||||
|
assert_eq!(file_contents, &content[0..cap]);
|
||||||
|
|
||||||
|
let buffer_contents = file.buffered_writer.inspect_buffer();
|
||||||
|
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_flushes_do_happen() {
|
||||||
|
let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
|
||||||
|
|
||||||
|
let gate = utils::sync::gate::Gate::default();
|
||||||
|
|
||||||
|
let mut file =
|
||||||
|
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||||
|
|
||||||
|
let content: Vec<u8> = rand::thread_rng()
|
||||||
|
.sample_iter(rand::distributions::Standard)
|
||||||
|
.take(cap + cap / 2)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
file.write_raw(&content, &ctx).await.unwrap();
|
||||||
|
|
||||||
|
// assert the state is as this test expects it to be
|
||||||
|
assert_eq!(
|
||||||
|
&file.load_to_vec(&ctx).await.unwrap(),
|
||||||
|
&content[0..cap + cap / 2]
|
||||||
|
);
|
||||||
|
let md = file
|
||||||
|
.buffered_writer
|
||||||
|
.as_inner()
|
||||||
|
.as_inner()
|
||||||
|
.path
|
||||||
|
.metadata()
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
md.len(),
|
||||||
|
cap.into_u64(),
|
||||||
|
"buffered writer does one write if we write 1.5x buffer capacity"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
&file.buffered_writer.inspect_buffer()[0..cap / 2],
|
||||||
|
&content[cap..cap + cap / 2]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_read_split_across_file_and_buffer() {
|
||||||
|
// This test exercises the logic on the read path that splits the logical read
|
||||||
|
// into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
|
||||||
|
//
|
||||||
|
// This test build on the assertions in test_flushes_do_happen
|
||||||
|
|
||||||
|
let (conf, tenant_id, timeline_id, ctx) =
|
||||||
|
harness("test_read_split_across_file_and_buffer").unwrap();
|
||||||
|
|
||||||
|
let gate = utils::sync::gate::Gate::default();
|
||||||
|
|
||||||
|
let mut file =
|
||||||
|
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||||
|
|
||||||
|
let content: Vec<u8> = rand::thread_rng()
|
||||||
|
.sample_iter(rand::distributions::Standard)
|
||||||
|
.take(cap + cap / 2)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
file.write_raw(&content, &ctx).await.unwrap();
|
||||||
|
|
||||||
|
let test_read = |start: usize, len: usize| {
|
||||||
|
let file = &file;
|
||||||
|
let ctx = &ctx;
|
||||||
|
let content = &content;
|
||||||
|
async move {
|
||||||
|
let (buf, nread) = file
|
||||||
|
.read_exact_at_eof_ok(
|
||||||
|
start.into_u64(),
|
||||||
|
Vec::with_capacity(len).slice_full(),
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(nread, len);
|
||||||
|
assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// completely within the file range
|
||||||
|
assert!(20 < cap, "test assumption");
|
||||||
|
test_read(10, 10).await;
|
||||||
|
// border onto edge of file
|
||||||
|
test_read(cap - 10, 10).await;
|
||||||
|
// read across file and buffer
|
||||||
|
test_read(cap - 10, 20).await;
|
||||||
|
// stay from start of buffer
|
||||||
|
test_read(cap, 10).await;
|
||||||
|
// completely within buffer
|
||||||
|
test_read(cap + 10, 10).await;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,153 +0,0 @@
|
|||||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
|
||||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
|
||||||
//!
|
|
||||||
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
|
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
|
||||||
use crate::page_cache::{self, PAGE_SZ};
|
|
||||||
use crate::tenant::block_io::BlockLease;
|
|
||||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
|
||||||
use crate::virtual_file::VirtualFile;
|
|
||||||
|
|
||||||
use std::io::{self};
|
|
||||||
use tokio_epoll_uring::BoundedBuf;
|
|
||||||
use tracing::*;
|
|
||||||
|
|
||||||
use super::zero_padded_read_write;
|
|
||||||
|
|
||||||
/// See module-level comment.
pub struct RW {
    /// Identifies this file's pages in the shared [`crate::page_cache`].
    page_cache_file_id: page_cache::FileId,
    /// Buffered-writer stack around the underlying [`VirtualFile`];
    /// the size-tracking wrapper records how many bytes have been flushed.
    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
    _gate_guard: utils::sync::gate::GateGuard,
}
|
|
||||||
|
|
||||||
impl RW {
    /// Wrap `file` for buffered writes and page-cache-backed reads.
    ///
    /// `_gate_guard` is held until drop so the delete-on-drop in [`Drop`]
    /// can still safely touch the file path.
    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
            _gate_guard,
        }
    }

    /// The [`page_cache::FileId`] under which this file's pages are cached.
    pub fn page_cache_file_id(&self) -> page_cache::FileId {
        self.page_cache_file_id
    }

    /// Append `srcbuf`; returns the number of bytes written.
    pub(crate) async fn write_all_borrowed(
        &mut self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> Result<usize, io::Error> {
        // It doesn't make sense to proactively fill the page cache on the Pageserver write path
        // because Compute is unlikely to access recently written data.
        self.rw.write_all_borrowed(srcbuf, ctx).await
    }

    /// Total bytes written so far, including bytes still pending in the
    /// in-memory buffer (i.e. not yet flushed to the file).
    pub(crate) fn bytes_written(&self) -> u64 {
        self.rw.bytes_written()
    }

    /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
    ///
    /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
    /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
    pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
        // round up to the next PAGE_SZ multiple, required by blob_io
        let size = {
            let s = usize::try_from(self.bytes_written()).unwrap();
            if s % PAGE_SZ == 0 {
                s
            } else {
                s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
            }
        };
        let vec = Vec::with_capacity(size);

        // read from disk what we've already flushed
        let file_size_tracking_writer = self.rw.as_writer();
        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
        let mut vec = file_size_tracking_writer
            .as_inner()
            .read_exact_at(
                vec.slice(0..(flushed_range.end - flushed_range.start)),
                u64::try_from(flushed_range.start).unwrap(),
                ctx,
            )
            .await?
            .into_inner();

        // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
        let buffered = self.rw.get_tail_zero_padded();
        vec.extend_from_slice(buffered);
        assert_eq!(vec.len(), size);
        assert_eq!(vec.len() % PAGE_SZ, 0);
        Ok(vec)
    }

    /// Read one [`PAGE_SZ`]-sized block.
    ///
    /// Blocks in the flushed prefix are served through the page cache
    /// (populating it on miss); blocks in the unflushed tail are served
    /// directly from the buffered writer's in-memory buffer.
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result<BlockLease, io::Error> {
        match self.rw.read_blk(blknum).await? {
            zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
                let cache = page_cache::get();
                match cache
                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
                    .await
                    .map_err(|e| {
                        std::io::Error::new(
                            std::io::ErrorKind::Other,
                            // order path before error because error is anyhow::Error => might have many contexts
                            format!(
                                "ephemeral file: read immutable page #{}: {}: {:#}",
                                blknum,
                                self.rw.as_writer().as_inner().path,
                                e,
                            ),
                        )
                    })? {
                    page_cache::ReadBufResult::Found(guard) => {
                        return Ok(BlockLease::PageReadGuard(guard))
                    }
                    page_cache::ReadBufResult::NotFound(write_guard) => {
                        // cache miss: read the page from the file into the
                        // cache slot, then hand out a read guard for it
                        let write_guard = writer
                            .as_inner()
                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                            .await?;
                        let read_guard = write_guard.mark_valid();
                        return Ok(BlockLease::PageReadGuard(read_guard));
                    }
                }
            }
            zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
                Ok(BlockLease::EphemeralFileMutableTail(buffer))
            }
        }
    }
}
|
|
||||||
|
|
||||||
impl Drop for RW {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
|
||||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
|
||||||
|
|
||||||
// unlink the file
|
|
||||||
// we are clear to do this, because we have entered a gate
|
|
||||||
let path = &self.rw.as_writer().as_inner().path;
|
|
||||||
let res = std::fs::remove_file(path);
|
|
||||||
if let Err(e) = res {
|
|
||||||
if e.kind() != std::io::ErrorKind::NotFound {
|
|
||||||
// just never log the not found errors, we cannot do anything for them; on detach
|
|
||||||
// the tenant directory is already gone.
|
|
||||||
//
|
|
||||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
|
||||||
error!("could not remove ephemeral file '{path}': {e}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,145 +0,0 @@
|
|||||||
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
|
|
||||||
//!
|
|
||||||
//! # Writes
|
|
||||||
//!
|
|
||||||
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
|
|
||||||
//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
|
|
||||||
//!
|
|
||||||
//! # Reads
|
|
||||||
//!
|
|
||||||
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
|
|
||||||
//!
|
|
||||||
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
|
|
||||||
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
|
|
||||||
//! if the read is for the prefix that has already been flushed.
|
|
||||||
//!
|
|
||||||
//! # Current Usage
|
|
||||||
//!
|
|
||||||
//! The current user of this module is [`super::page_caching::RW`].
|
|
||||||
|
|
||||||
mod zero_padded;
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
context::RequestContext,
|
|
||||||
page_cache::PAGE_SZ,
|
|
||||||
virtual_file::owned_buffers_io::{
|
|
||||||
self,
|
|
||||||
write::{Buffer, OwnedAsyncWriter},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const TAIL_SZ: usize = 64 * 1024;
|
|
||||||
|
|
||||||
/// See module-level comment.
pub struct RW<W: OwnedAsyncWriter> {
    /// Writer stack: a [`TAIL_SZ`]-sized zero-padded buffer in front of a
    /// byte-count-tracking wrapper around the underlying writer `W`.
    buffered_writer: owned_buffers_io::write::BufferedWriter<
        zero_padded::Buffer<TAIL_SZ>,
        owned_buffers_io::util::size_tracking_writer::Writer<W>,
    >,
}
|
|
||||||
|
|
||||||
/// Outcome of [`RW::read_blk`]: tells the caller where the requested block lives.
pub enum ReadResult<'a, W> {
    /// The block is in the already-flushed prefix; the caller must read it from `writer`.
    NeedsReadFromWriter { writer: &'a W },
    /// The block is in the unflushed tail; served directly from the writer's
    /// in-memory buffer, zero-padded to a full [`PAGE_SZ`] page.
    ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
}
|
|
||||||
|
|
||||||
impl<W> RW<W>
|
|
||||||
where
|
|
||||||
W: OwnedAsyncWriter,
|
|
||||||
{
|
|
||||||
pub fn new(writer: W) -> Self {
|
|
||||||
let bytes_flushed_tracker =
|
|
||||||
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
|
|
||||||
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
|
|
||||||
bytes_flushed_tracker,
|
|
||||||
zero_padded::Buffer::default(),
|
|
||||||
);
|
|
||||||
Self { buffered_writer }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn as_writer(&self) -> &W {
|
|
||||||
self.buffered_writer.as_inner().as_inner()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn write_all_borrowed(
|
|
||||||
&mut self,
|
|
||||||
buf: &[u8],
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> std::io::Result<usize> {
|
|
||||||
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn bytes_written(&self) -> u64 {
|
|
||||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
|
||||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
|
||||||
flushed_offset + u64::try_from(buffer.pending()).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
|
|
||||||
pub fn get_tail_zero_padded(&self) -> &[u8] {
|
|
||||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
|
||||||
let buffer_written_up_to = buffer.pending();
|
|
||||||
// pad to next page boundary
|
|
||||||
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
|
|
||||||
buffer_written_up_to
|
|
||||||
} else {
|
|
||||||
buffer_written_up_to
|
|
||||||
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
&buffer.as_zero_padded_slice()[0..read_up_to]
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
|
|
||||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
|
||||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
|
||||||
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
|
|
||||||
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
|
|
||||||
|
|
||||||
// The trailing page ("block") might only be partially filled,
|
|
||||||
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
|
|
||||||
// Moreover, it has to be zero-padded, because when we still had
|
|
||||||
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
|
|
||||||
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
|
|
||||||
// => check here that the read doesn't go beyond this potentially trailing
|
|
||||||
// => the zero-padding is done in the `else` branch below
|
|
||||||
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
|
|
||||||
buffered_offset / (PAGE_SZ as u64)
|
|
||||||
} else {
|
|
||||||
(buffered_offset / (PAGE_SZ as u64)) + 1
|
|
||||||
};
|
|
||||||
if (blknum as u64) >= blocks_written {
|
|
||||||
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
|
|
||||||
}
|
|
||||||
|
|
||||||
// assertions for the `if-else` below
|
|
||||||
assert_eq!(
|
|
||||||
flushed_offset % (TAIL_SZ as u64), 0,
|
|
||||||
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
flushed_offset % (PAGE_SZ as u64),
|
|
||||||
0,
|
|
||||||
"the logic below can't handle if the page is spread across the flushed part and the buffer"
|
|
||||||
);
|
|
||||||
|
|
||||||
if read_offset < flushed_offset {
|
|
||||||
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
|
|
||||||
Ok(ReadResult::NeedsReadFromWriter {
|
|
||||||
writer: self.as_writer(),
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
let read_offset_in_buffer = read_offset
|
|
||||||
.checked_sub(flushed_offset)
|
|
||||||
.expect("would have taken `if` branch instead of this one");
|
|
||||||
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
|
|
||||||
let zero_padded_slice = buffer.as_zero_padded_slice();
|
|
||||||
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
|
|
||||||
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
|
|
||||||
buffer: page
|
|
||||||
.try_into()
|
|
||||||
.expect("the slice above got it as page-size slice"),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,110 +0,0 @@
|
|||||||
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
|
|
||||||
//! unwritten range is guaranteed to be zero-initialized.
|
|
||||||
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
|
|
||||||
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
|
|
||||||
|
|
||||||
use std::mem::MaybeUninit;
|
|
||||||
|
|
||||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
|
||||||
|
|
||||||
/// See module-level comment.
pub struct Buffer<const N: usize> {
    /// Heap allocation of the buffer; bytes at `written..N` are always zero.
    allocation: Box<[u8; N]>,
    /// Number of valid (written) bytes at the front of `allocation`.
    written: usize,
}
|
|
||||||
|
|
||||||
impl<const N: usize> Default for Buffer<N> {
    /// An empty buffer whose entire allocation is zero-initialized,
    /// establishing the zero-padding invariant from the start.
    fn default() -> Self {
        Self {
            allocation: Box::new(
                // SAFETY: zeroed memory is a valid [u8; N]
                // NOTE(review): presumably written with MaybeUninit instead of
                // `Box::new([0u8; N])` to avoid materializing the large array
                // on the stack before boxing — confirm.
                unsafe { MaybeUninit::zeroed().assume_init() },
            ),
            written: 0,
        }
    }
}
|
|
||||||
|
|
||||||
impl<const N: usize> Buffer<N> {
    /// Debug-check the struct invariants: `written <= N` and the
    /// unwritten tail `written..N` is all zeroes.
    #[inline(always)]
    fn invariants(&self) {
        // don't check by default, unoptimized is too expensive even for debug mode
        if false {
            debug_assert!(self.written <= N, "{}", self.written);
            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
        }
    }

    /// The full backing array: valid data in `0..written`, zeroes after.
    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
        &self.allocation
    }
}
|
|
||||||
|
|
||||||
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
    type IoBuf = Self;

    /// Total capacity of the buffer (always `N`).
    fn cap(&self) -> usize {
        self.allocation.len()
    }

    /// Copy `other` into the buffer after the already-written bytes.
    ///
    /// Panics if `other` does not fit in the remaining capacity.
    fn extend_from_slice(&mut self, other: &[u8]) {
        self.invariants();
        let remaining = self.allocation.len() - self.written;
        if other.len() > remaining {
            panic!("calling extend_from_slice() with insufficient remaining capacity");
        }
        self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
        self.written += other.len();
        self.invariants();
    }

    /// Number of bytes written since the last flush/reuse.
    fn pending(&self) -> usize {
        self.written
    }

    /// Hand the written prefix off for writing; consumes `self`.
    fn flush(self) -> FullSlice<Self> {
        self.invariants();
        let written = self.written;
        FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
    }

    /// Reclaim the allocation after a flush, re-zeroing the previously
    /// written prefix to restore the zero-padding invariant.
    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
        let Self {
            mut allocation,
            written,
        } = iobuf;
        // only `0..written` can be non-zero; the tail is zero by invariant
        allocation[0..written].fill(0);
        let new = Self {
            allocation,
            written: 0,
        };
        new.invariants();
        new
    }
}
|
|
||||||
|
|
||||||
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
///
/// Remember that bytes_init is generally _not_ a tracker of the amount
/// of valid data in the io buffer; we use `Slice` for that.
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
///
/// SAFETY:
///
/// The [`Self::allocation`] is stable becauses boxes are stable.
/// The memory is zero-initialized, so, bytes_init is always N.
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
    fn stable_ptr(&self) -> *const u8 {
        // stable because the allocation is boxed (heap address doesn't move)
        self.allocation.as_ptr()
    }

    fn bytes_init(&self) -> usize {
        // Yes, N, not self.written; Read the full comment of this impl block!
        N
    }

    fn bytes_total(&self) -> usize {
        N
    }
}
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user