mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 08:00:37 +00:00
Compare commits
87 Commits
communicat
...
link-compa
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
067f80e8db | ||
|
|
8c2ba51dce | ||
|
|
9971fba584 | ||
|
|
a77919f4b2 | ||
|
|
a618056770 | ||
|
|
307e1e64c8 | ||
|
|
a537b2ffd0 | ||
|
|
64353b48db | ||
|
|
79ddc803af | ||
|
|
f5070f6aa4 | ||
|
|
3b7cc4234c | ||
|
|
33abfc2b74 | ||
|
|
93b964f829 | ||
|
|
d0aaec2abb | ||
|
|
d0dc65da12 | ||
|
|
03d635b916 | ||
|
|
5cd7f936f9 | ||
|
|
101e115b38 | ||
|
|
b37bb7d7ed | ||
|
|
bef5954fd7 | ||
|
|
8477d15f95 | ||
|
|
622b3b2993 | ||
|
|
659366060d | ||
|
|
42d93031a1 | ||
|
|
d22377c754 | ||
|
|
6c70789cfd | ||
|
|
7e55497e13 | ||
|
|
40f32ea326 | ||
|
|
1d1502bc16 | ||
|
|
7eb85c56ac | ||
|
|
24d62c647f | ||
|
|
4d2e4b19c3 | ||
|
|
0691b73f53 | ||
|
|
3cf5e1386c | ||
|
|
608afc3055 | ||
|
|
0ef6851219 | ||
|
|
5c356c63eb | ||
|
|
384e3df2ad | ||
|
|
f9b3a2e059 | ||
|
|
79ee78ea32 | ||
|
|
0e0ad073bf | ||
|
|
6827f2f58c | ||
|
|
c82e363ed9 | ||
|
|
50dc2fae77 | ||
|
|
62ac5b94b3 | ||
|
|
f0e7b3e0ef | ||
|
|
c6ff18affc | ||
|
|
16ca74a3f4 | ||
|
|
cb67f9a651 | ||
|
|
baf425a2cd | ||
|
|
0b243242df | ||
|
|
6131d86ec9 | ||
|
|
4b9087651c | ||
|
|
79699aebc8 | ||
|
|
22290eb7ba | ||
|
|
bbc35e10b8 | ||
|
|
ae2c3ac12f | ||
|
|
16d594b7b3 | ||
|
|
f999632327 | ||
|
|
5bd850d15a | ||
|
|
1b789e8d7c | ||
|
|
bec7427d9e | ||
|
|
e2db76b9be | ||
|
|
6b4b8e0d8b | ||
|
|
1d68577fbd | ||
|
|
60f63c076f | ||
|
|
8da4ec9740 | ||
|
|
b48404952d | ||
|
|
1d06172d59 | ||
|
|
a08c1a23eb | ||
|
|
a2adc7dbd3 | ||
|
|
768a580373 | ||
|
|
09247de8d5 | ||
|
|
0b35929211 | ||
|
|
b3db7f66ac | ||
|
|
498d852bde | ||
|
|
7f8b1d79c0 | ||
|
|
d15f2ff57a | ||
|
|
3593356c10 | ||
|
|
9e8ab2ab4f | ||
|
|
c1ff7db187 | ||
|
|
6d6b83e737 | ||
|
|
0482690534 | ||
|
|
a750026c2e | ||
|
|
998d2c2ce9 | ||
|
|
b1fa68f659 | ||
|
|
84bc3380cc |
5
.github/actionlint.yml
vendored
5
.github/actionlint.yml
vendored
@@ -33,9 +33,14 @@ config-variables:
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_CICD_CHANNEL_ID
|
||||
- SLACK_COMPUTE_CHANNEL_ID
|
||||
- SLACK_ON_CALL_DEVPROD_STREAM
|
||||
- SLACK_ON_CALL_QA_STAGING_STREAM
|
||||
- SLACK_ON_CALL_STORAGE_STAGING_STREAM
|
||||
- SLACK_ONCALL_COMPUTE_GROUP
|
||||
- SLACK_ONCALL_PROXY_GROUP
|
||||
- SLACK_ONCALL_STORAGE_GROUP
|
||||
- SLACK_PROXY_CHANNEL_ID
|
||||
- SLACK_RUST_CHANNEL_ID
|
||||
- SLACK_STORAGE_CHANNEL_ID
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
2
.github/scripts/lint-release-pr.sh
vendored
2
.github/scripts/lint-release-pr.sh
vendored
@@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}"
|
||||
LAST_COMMIT=$(git rev-parse HEAD)
|
||||
|
||||
MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}")
|
||||
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$"
|
||||
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$"
|
||||
|
||||
if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
|
||||
report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD'
|
||||
|
||||
103
.github/workflows/_create-release-pr.yml
vendored
103
.github/workflows/_create-release-pr.yml
vendored
@@ -1,103 +0,0 @@
|
||||
name: Create Release PR
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
component-name:
|
||||
description: 'Component name'
|
||||
required: true
|
||||
type: string
|
||||
source-branch:
|
||||
description: 'Source branch'
|
||||
required: true
|
||||
type: string
|
||||
secrets:
|
||||
ci-access-token:
|
||||
description: 'CI access token'
|
||||
required: true
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
create-release-branch:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
ref: ${{ inputs.source-branch }}
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set variables
|
||||
id: vars
|
||||
env:
|
||||
COMPONENT_NAME: ${{ inputs.component-name }}
|
||||
RELEASE_BRANCH: >-
|
||||
${{
|
||||
false
|
||||
|| inputs.component-name == 'Storage' && 'release'
|
||||
|| inputs.component-name == 'Proxy' && 'release-proxy'
|
||||
|| inputs.component-name == 'Compute' && 'release-compute'
|
||||
}}
|
||||
run: |
|
||||
now_date=$(date -u +'%Y-%m-%d')
|
||||
now_time=$(date -u +'%H-%M-%Z')
|
||||
{
|
||||
echo "title=${COMPONENT_NAME} release ${now_date}"
|
||||
echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
|
||||
echo "release-branch=${RELEASE_BRANCH}"
|
||||
} | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
- name: Configure git
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
- name: Create RC branch
|
||||
env:
|
||||
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
|
||||
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
|
||||
TITLE: ${{ steps.vars.outputs.title }}
|
||||
run: |
|
||||
git switch -c "${RC_BRANCH}"
|
||||
|
||||
# Manually create a merge commit on the current branch, keeping the
|
||||
# tree and setting the parents to the current HEAD and the HEAD of the
|
||||
# release branch. This commit is what we'll fast-forward the release
|
||||
# branch to when merging the release branch.
|
||||
# For details on why, look at
|
||||
# https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs
|
||||
current_tree=$(git rev-parse 'HEAD^{tree}')
|
||||
release_head=$(git rev-parse "origin/${RELEASE_BRANCH}")
|
||||
current_head=$(git rev-parse HEAD)
|
||||
merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}")
|
||||
|
||||
# Fast-forward the current branch to the newly created merge_commit
|
||||
git merge --ff-only ${merge_commit}
|
||||
|
||||
git push origin "${RC_BRANCH}"
|
||||
|
||||
- name: Create a PR into ${{ steps.vars.outputs.release-branch }}
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.ci-access-token }}
|
||||
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
|
||||
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
|
||||
TITLE: ${{ steps.vars.outputs.title }}
|
||||
run: |
|
||||
gh pr create --title "${TITLE}" \
|
||||
--body "" \
|
||||
--head "${RC_BRANCH}" \
|
||||
--base "${RELEASE_BRANCH}"
|
||||
71
.github/workflows/benchmarking.yml
vendored
71
.github/workflows/benchmarking.yml
vendored
@@ -53,6 +53,77 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
env:
|
||||
ORG_ID: org-solitary-dew-09443886
|
||||
LIMIT: 100
|
||||
SEARCH: "GITHUB_RUN_ID="
|
||||
BASE_URL: https://console-stage.neon.build/api/v2
|
||||
DRY_RUN: "false" # Set to "true" to just test out the workflow
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Cleanup inactive Neon projects left over from prior runs
|
||||
env:
|
||||
API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
NOW=$(date -u +%s)
|
||||
DAYS_AGO=$((NOW - 5 * 86400))
|
||||
|
||||
REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID"
|
||||
|
||||
echo "Requesting project list from:"
|
||||
echo "$REQUEST_URL"
|
||||
|
||||
response=$(curl -s -X GET "$REQUEST_URL" \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}" )
|
||||
|
||||
echo "Response:"
|
||||
echo "$response" | jq .
|
||||
|
||||
projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" '
|
||||
.projects[]
|
||||
| select(.compute_last_active_at != null)
|
||||
| select((.compute_last_active_at | fromdateiso8601) < $cutoff)
|
||||
| {id, name, compute_last_active_at}
|
||||
')
|
||||
|
||||
if [ -z "$projects_to_delete" ]; then
|
||||
echo "No projects eligible for deletion."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Projects that will be deleted:"
|
||||
echo "$projects_to_delete" | jq -r '.id'
|
||||
|
||||
if [ "$DRY_RUN" = "false" ]; then
|
||||
echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do
|
||||
echo "Deleting project: $project_id"
|
||||
curl -s -X DELETE "$BASE_URL/projects/$project_id" \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
done
|
||||
else
|
||||
echo "Dry run enabled — no projects were deleted."
|
||||
fi
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
|
||||
41
.github/workflows/build_and_test.yml
vendored
41
.github/workflows/build_and_test.yml
vendored
@@ -69,7 +69,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Check for file changes
|
||||
uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
|
||||
uses: step-security/paths-filter@v3
|
||||
id: files-changed
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@@ -824,7 +824,7 @@ jobs:
|
||||
- pg: v17
|
||||
debian: bookworm
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.42.2
|
||||
VM_BUILDER_VERSION: v0.46.0
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
@@ -1434,10 +1434,10 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
notify-storage-release-deploy-failure:
|
||||
needs: [ deploy ]
|
||||
notify-release-deploy-failure:
|
||||
needs: [ meta, deploy ]
|
||||
# We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs.
|
||||
if: github.ref_name == 'release' && needs.deploy.result != 'success' && always()
|
||||
if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always()
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
@@ -1445,15 +1445,40 @@ jobs:
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Post release-deploy failure to team-storage slack channel
|
||||
- name: Post release-deploy failure to team slack channel
|
||||
uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
|
||||
env:
|
||||
TEAM_ONCALL: >-
|
||||
${{
|
||||
fromJSON(format('{
|
||||
"storage-release": "<!subteam^{0}|@oncall-storage>",
|
||||
"compute-release": "<!subteam^{1}|@oncall-compute>",
|
||||
"proxy-release": "<!subteam^{2}|@oncall-proxy>"
|
||||
}',
|
||||
vars.SLACK_ONCALL_STORAGE_GROUP,
|
||||
vars.SLACK_ONCALL_COMPUTE_GROUP,
|
||||
vars.SLACK_ONCALL_PROXY_GROUP
|
||||
))[needs.meta.outputs.run-kind]
|
||||
}}
|
||||
CHANNEL: >-
|
||||
${{
|
||||
fromJSON(format('{
|
||||
"storage-release": "{0}",
|
||||
"compute-release": "{1}",
|
||||
"proxy-release": "{2}"
|
||||
}',
|
||||
vars.SLACK_STORAGE_CHANNEL_ID,
|
||||
vars.SLACK_COMPUTE_CHANNEL_ID,
|
||||
vars.SLACK_PROXY_CHANNEL_ID
|
||||
))[needs.meta.outputs.run-kind]
|
||||
}}
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
|
||||
channel: ${{ env.CHANNEL }}
|
||||
text: |
|
||||
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
|
||||
2
.github/workflows/cloud-extensions.yml
vendored
2
.github/workflows/cloud-extensions.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ inputs.region_id }}
|
||||
region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ matrix.pg-version }}
|
||||
project_settings: ${{ steps.project-settings.outputs.settings }}
|
||||
# We need these settings to get the expected output results.
|
||||
|
||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Check for Postgres changes
|
||||
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
|
||||
uses: step-security/paths-filter@v3
|
||||
id: files_changed
|
||||
with:
|
||||
token: ${{ github.token }}
|
||||
|
||||
12
.github/workflows/release-compute.yml
vendored
Normal file
12
.github/workflows/release-compute.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Create compute release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 7 * * FRI'
|
||||
|
||||
jobs:
|
||||
create-release-pr:
|
||||
uses: ./.github/workflows/release.yml
|
||||
with:
|
||||
component: compute
|
||||
secrets: inherit
|
||||
12
.github/workflows/release-proxy.yml
vendored
Normal file
12
.github/workflows/release-proxy.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Create proxy release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * TUE'
|
||||
|
||||
jobs:
|
||||
create-release-pr:
|
||||
uses: ./.github/workflows/release.yml
|
||||
with:
|
||||
component: proxy
|
||||
secrets: inherit
|
||||
12
.github/workflows/release-storage.yml
vendored
Normal file
12
.github/workflows/release-storage.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Create storage release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * FRI'
|
||||
|
||||
jobs:
|
||||
create-release-pr:
|
||||
uses: ./.github/workflows/release.yml
|
||||
with:
|
||||
component: storage
|
||||
secrets: inherit
|
||||
93
.github/workflows/release.yml
vendored
93
.github/workflows/release.yml
vendored
@@ -1,25 +1,34 @@
|
||||
name: Create Release Branch
|
||||
name: Create release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * TUE' # Proxy release
|
||||
- cron: '0 6 * * FRI' # Storage release
|
||||
- cron: '0 7 * * FRI' # Compute release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create-storage-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Storage release PR'
|
||||
component:
|
||||
description: "Component to release"
|
||||
required: true
|
||||
type: choice
|
||||
options:
|
||||
- compute
|
||||
- proxy
|
||||
- storage
|
||||
cherry-pick:
|
||||
description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)"
|
||||
required: false
|
||||
create-proxy-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Proxy release PR'
|
||||
required: false
|
||||
create-compute-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Compute release PR'
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
workflow_call:
|
||||
inputs:
|
||||
component:
|
||||
description: "Component to release"
|
||||
required: true
|
||||
type: string
|
||||
cherry-pick:
|
||||
description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)"
|
||||
required: false
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
@@ -29,41 +38,31 @@ defaults:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
jobs:
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}
|
||||
create-release-pr:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
uses: ./.github/workflows/_create-release-pr.yml
|
||||
with:
|
||||
component-name: 'Storage'
|
||||
source-branch: ${{ github.ref_name }}
|
||||
secrets:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }}
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
- name: Configure git
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
uses: ./.github/workflows/_create-release-pr.yml
|
||||
with:
|
||||
component-name: 'Proxy'
|
||||
source-branch: ${{ github.ref_name }}
|
||||
secrets:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
create-compute-release-branch:
|
||||
if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }}
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
uses: ./.github/workflows/_create-release-pr.yml
|
||||
with:
|
||||
component-name: 'Compute'
|
||||
source-branch: ${{ github.ref_name }}
|
||||
secrets:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
- name: Create release PR
|
||||
uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677
|
||||
with:
|
||||
component: ${{ inputs.component }}
|
||||
cherry-pick: ${{ inputs.cherry-pick }}
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
233
Cargo.lock
generated
233
Cargo.lock
generated
@@ -253,17 +253,6 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||
|
||||
[[package]]
|
||||
name = "atomic_enum"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -698,40 +687,13 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core 0.4.5",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"itoa",
|
||||
"matchit 0.7.3",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"serde",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
|
||||
dependencies = [
|
||||
"axum-core 0.5.0",
|
||||
"axum-core",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
@@ -742,7 +704,7 @@ dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"itoa",
|
||||
"matchit 0.8.4",
|
||||
"matchit",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -762,26 +724,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.5.0"
|
||||
@@ -808,8 +750,8 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"headers",
|
||||
@@ -1144,25 +1086,6 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cbindgen"
|
||||
version = "0.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"heck 0.4.1",
|
||||
"indexmap 2.9.0",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"syn 2.0.100",
|
||||
"tempfile",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.16"
|
||||
@@ -1283,7 +1206,7 @@ version = "4.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -1341,40 +1264,13 @@ dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"atomic_enum",
|
||||
"bytes",
|
||||
"cbindgen",
|
||||
"http 1.1.0",
|
||||
"libc",
|
||||
"neonart",
|
||||
"nix 0.27.1",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_data_api",
|
||||
"prost 0.13.3",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-epoll-uring",
|
||||
"tokio-pipe",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"uring-common",
|
||||
"utils",
|
||||
"zerocopy 0.8.24",
|
||||
"zerocopy-derive 0.8.24",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"jsonwebtoken",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -1388,11 +1284,12 @@ name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"aws-config",
|
||||
"aws-sdk-kms",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
@@ -1405,7 +1302,8 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -1524,6 +1422,7 @@ dependencies = [
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"endpoint_storage",
|
||||
"futures",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
@@ -2031,7 +1930,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -2145,7 +2044,7 @@ name = "endpoint_storage"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
@@ -2692,7 +2591,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"http 0.2.9",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -2711,7 +2610,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -2807,12 +2706,6 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -3301,12 +3194,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.9.0"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
|
||||
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.15.2",
|
||||
"hashbrown 0.14.5",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -3329,7 +3222,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"is-terminal",
|
||||
"itoa",
|
||||
"log",
|
||||
@@ -3352,7 +3245,7 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
"dashmap 6.1.0",
|
||||
"env_logger",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"itoa",
|
||||
"log",
|
||||
"num-format",
|
||||
@@ -3704,12 +3597,6 @@ dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
@@ -3755,7 +3642,7 @@ version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -3901,15 +3788,6 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
"tracing",
|
||||
"zerocopy 0.8.24",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -4333,8 +4211,6 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_data_api",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -4411,8 +4287,6 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_compaction",
|
||||
"pageserver_data_api",
|
||||
"peekable",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
@@ -4424,7 +4298,6 @@ dependencies = [
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"prost 0.13.3",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
@@ -4456,7 +4329,6 @@ dependencies = [
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
@@ -4521,18 +4393,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"pageserver_data_api",
|
||||
"thiserror 1.0.69",
|
||||
"tonic",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
@@ -4556,17 +4416,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_data_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"prost 0.13.3",
|
||||
"thiserror 1.0.69",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.2.1"
|
||||
@@ -4693,15 +4542,6 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peekable"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -5173,7 +5013,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5194,7 +5034,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5297,7 +5137,7 @@ dependencies = [
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
"itertools 0.10.5",
|
||||
"itoa",
|
||||
@@ -5808,7 +5648,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit 0.8.4",
|
||||
"matchit",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6969,7 +6809,7 @@ version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
@@ -7394,16 +7234,6 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-pipe"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.10"
|
||||
@@ -7586,7 +7416,7 @@ version = "0.22.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
|
||||
dependencies = [
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
@@ -7599,13 +7429,9 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum 0.7.9",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"flate2",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
@@ -7617,7 +7443,6 @@ dependencies = [
|
||||
"prost 0.13.3",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
@@ -8117,7 +7942,7 @@ name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum 0.8.1",
|
||||
"axum",
|
||||
"cgroups-rs",
|
||||
"clap",
|
||||
"futures",
|
||||
@@ -8627,7 +8452,7 @@ dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.9.0",
|
||||
"indexmap 2.0.1",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
|
||||
15
Cargo.toml
15
Cargo.toml
@@ -8,7 +8,6 @@ members = [
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/client_grpc",
|
||||
"pageserver/pagebench",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
@@ -30,7 +29,6 @@ members = [
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
@@ -43,7 +41,6 @@ members = [
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -145,7 +142,6 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
peekable = "0.3.0"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
@@ -191,6 +187,7 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
@@ -199,7 +196,7 @@ tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "server", "tls", "tls-roots", "gzip"]}
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
|
||||
@@ -231,9 +228,6 @@ x509-cert = { version = "0.2.5" }
|
||||
env_logger = "0.11"
|
||||
log = "0.4"
|
||||
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
@@ -249,14 +243,12 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neonart = { version = "0.1", path = "./libs/neonart/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
|
||||
pageserver_data_api = { path = "./pageserver/data_api" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
@@ -280,7 +272,6 @@ wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
cbindgen = "0.28.0"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.13"
|
||||
rstest = "0.18"
|
||||
|
||||
7
Makefile
7
Makefile
@@ -18,12 +18,10 @@ ifeq ($(BUILD_TYPE),release)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
@@ -182,16 +180,11 @@ postgres-check-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
+@echo "Compiling communicator $*"
|
||||
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
|
||||
|
||||
+@echo "Compiling neon $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
||||
|
||||
+@echo "Compiling neon_walredo $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
|
||||
@@ -1085,6 +1085,23 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions pgrx14"
|
||||
#
|
||||
# Version 14 is now required by a few
|
||||
# This layer should be used as a base for new pgrx extensions,
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "pg-onnx-build" and "pgrag-build"
|
||||
@@ -1100,11 +1117,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
echo "#nothing to test here" > neon-test.sh
|
||||
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
|
||||
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \
|
||||
echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pgrag-build
|
||||
FROM rust-extensions-build-pgrx14 AS pgrag-build
|
||||
COPY --from=pgrag-src /ext-src/ /ext-src/
|
||||
|
||||
# Install build-time dependencies
|
||||
@@ -1124,19 +1141,19 @@ RUN . venv/bin/activate && \
|
||||
|
||||
WORKDIR /ext-src/pgrag-src
|
||||
RUN cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control
|
||||
|
||||
RUN cd exts/rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control
|
||||
|
||||
RUN cd exts/rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
@@ -1305,8 +1322,8 @@ ARG PG_VERSION
|
||||
# Do not update without approve from proxy team
|
||||
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
|
||||
WORKDIR /ext-src
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
@@ -1319,6 +1336,40 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src/pg_session_jwt-src
|
||||
RUN cargo pgrx install --release
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-anon-pg-build"
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg_anon-src
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
WORKDIR /ext-src
|
||||
COPY compute/patches/anon_v2.patch .
|
||||
|
||||
# This is an experimental extension, never got to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
patch -p1 < /ext-src/anon_v2.patch
|
||||
|
||||
FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg_anon-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src
|
||||
RUN cd pg_anon-src && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
|
||||
chmod -R a+r ../pg_anon-src && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control;
|
||||
|
||||
########################################################################################
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "wal2json-build"
|
||||
@@ -1615,6 +1666,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1919,7 +1971,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
|
||||
# Make the libraries we built available
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf
|
||||
RUN /sbin/ldconfig
|
||||
|
||||
# rsyslog config permissions
|
||||
# directory for rsyslogd pid file
|
||||
|
||||
1
compute/etc/ld.so.conf.d/00-neon.conf
Normal file
1
compute/etc/ld.so.conf.d/00-neon.conf
Normal file
@@ -0,0 +1 @@
|
||||
/usr/local/lib
|
||||
@@ -23,6 +23,8 @@
|
||||
import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
|
||||
import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
|
||||
import 'sql_exporter/getpage_sync_requests_total.libsonnet',
|
||||
import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet',
|
||||
import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'compute_getpage_max_inflight_stuck_time_ms',
|
||||
type: 'gauge',
|
||||
help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for',
|
||||
values: [
|
||||
'compute_getpage_max_inflight_stuck_time_ms',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'compute_getpage_stuck_requests_total',
|
||||
type: 'counter',
|
||||
help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout',
|
||||
values: [
|
||||
'compute_getpage_stuck_requests_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
|
||||
getpage_wait_seconds_sum numeric,
|
||||
getpage_prefetch_requests_total numeric,
|
||||
getpage_sync_requests_total numeric,
|
||||
compute_getpage_stuck_requests_total numeric,
|
||||
compute_getpage_max_inflight_stuck_time_ms numeric,
|
||||
getpage_prefetch_misses_total numeric,
|
||||
getpage_prefetch_discards_total numeric,
|
||||
getpage_prefetches_buffered numeric,
|
||||
|
||||
129
compute/patches/anon_v2.patch
Normal file
129
compute/patches/anon_v2.patch
Normal file
@@ -0,0 +1,129 @@
|
||||
diff --git a/sql/anon.sql b/sql/anon.sql
|
||||
index 0cdc769..f6cc950 100644
|
||||
--- a/sql/anon.sql
|
||||
+++ b/sql/anon.sql
|
||||
@@ -1141,3 +1141,8 @@ $$
|
||||
-- TODO : https://en.wikipedia.org/wiki/L-diversity
|
||||
|
||||
-- TODO : https://en.wikipedia.org/wiki/T-closeness
|
||||
+
|
||||
+-- NEON Patches
|
||||
+
|
||||
+GRANT ALL ON SCHEMA anon to neon_superuser;
|
||||
+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
|
||||
diff --git a/sql/init.sql b/sql/init.sql
|
||||
index 7da6553..9b6164b 100644
|
||||
--- a/sql/init.sql
|
||||
+++ b/sql/init.sql
|
||||
@@ -74,50 +74,49 @@ $$
|
||||
|
||||
SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED';
|
||||
|
||||
--- load fake data from a given path
|
||||
-CREATE OR REPLACE FUNCTION anon.init(
|
||||
- datapath TEXT
|
||||
-)
|
||||
+CREATE OR REPLACE FUNCTION anon.load_fake_data()
|
||||
RETURNS BOOLEAN
|
||||
AS $$
|
||||
DECLARE
|
||||
- datapath_check TEXT;
|
||||
success BOOLEAN;
|
||||
+ sharedir TEXT;
|
||||
+ datapath TEXT;
|
||||
BEGIN
|
||||
|
||||
- IF anon.is_initialized() THEN
|
||||
- RAISE NOTICE 'The anon extension is already initialized.';
|
||||
- RETURN TRUE;
|
||||
- END IF;
|
||||
+ datapath := '/extension/anon/';
|
||||
+ -- find the local extension directory
|
||||
+ SELECT setting INTO sharedir
|
||||
+ FROM pg_catalog.pg_config
|
||||
+ WHERE name = 'SHAREDIR';
|
||||
|
||||
SELECT bool_or(results) INTO success
|
||||
FROM unnest(array[
|
||||
- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'),
|
||||
- anon.load_csv('anon.identifier',datapath ||'/identifier.csv'),
|
||||
- anon.load_csv('anon.address',datapath ||'/address.csv'),
|
||||
- anon.load_csv('anon.city',datapath ||'/city.csv'),
|
||||
- anon.load_csv('anon.company',datapath ||'/company.csv'),
|
||||
- anon.load_csv('anon.country',datapath ||'/country.csv'),
|
||||
- anon.load_csv('anon.email', datapath ||'/email.csv'),
|
||||
- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'),
|
||||
- anon.load_csv('anon.iban',datapath ||'/iban.csv'),
|
||||
- anon.load_csv('anon.last_name',datapath ||'/last_name.csv'),
|
||||
- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'),
|
||||
- anon.load_csv('anon.siret',datapath ||'/siret.csv'),
|
||||
- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv')
|
||||
+ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'),
|
||||
+ anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'),
|
||||
+ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'),
|
||||
+ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'),
|
||||
+ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'),
|
||||
+ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'),
|
||||
+ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'),
|
||||
+ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'),
|
||||
+ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'),
|
||||
+ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'),
|
||||
+ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'),
|
||||
+ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'),
|
||||
+ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv')
|
||||
]) results;
|
||||
RETURN success;
|
||||
-
|
||||
END;
|
||||
$$
|
||||
- LANGUAGE PLPGSQL
|
||||
+ LANGUAGE plpgsql
|
||||
VOLATILE
|
||||
RETURNS NULL ON NULL INPUT
|
||||
- PARALLEL UNSAFE -- because load_csv is unsafe
|
||||
- SECURITY INVOKER
|
||||
+ PARALLEL UNSAFE -- because of the EXCEPTION
|
||||
+ SECURITY DEFINER
|
||||
SET search_path=''
|
||||
;
|
||||
-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED';
|
||||
+
|
||||
+SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED';
|
||||
|
||||
-- People tend to forget the anon.init() step
|
||||
-- This is a friendly notice for them
|
||||
@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED';
|
||||
CREATE OR REPLACE FUNCTION anon.load(TEXT)
|
||||
RETURNS BOOLEAN AS
|
||||
$$
|
||||
- SELECT anon.init($1);
|
||||
+ SELECT anon.init();
|
||||
$$
|
||||
LANGUAGE SQL
|
||||
VOLATILE
|
||||
@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED';
|
||||
CREATE OR REPLACE FUNCTION anon.init()
|
||||
RETURNS BOOLEAN
|
||||
AS $$
|
||||
- WITH conf AS (
|
||||
- -- find the local extension directory
|
||||
- SELECT setting AS sharedir
|
||||
- FROM pg_catalog.pg_config
|
||||
- WHERE name = 'SHAREDIR'
|
||||
- )
|
||||
- SELECT anon.init(conf.sharedir || '/extension/anon/')
|
||||
- FROM conf;
|
||||
+BEGIN
|
||||
+ IF anon.is_initialized() THEN
|
||||
+ RAISE NOTICE 'The anon extension is already initialized.';
|
||||
+ RETURN TRUE;
|
||||
+ END IF;
|
||||
+
|
||||
+ RETURN anon.load_fake_data();
|
||||
+END;
|
||||
$$
|
||||
- LANGUAGE SQL
|
||||
+ LANGUAGE plpgsql
|
||||
VOLATILE
|
||||
PARALLEL UNSAFE -- because init is unsafe
|
||||
SECURITY INVOKER
|
||||
@@ -22,7 +22,7 @@ commands:
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -22,7 +22,7 @@ commands:
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -10,6 +10,7 @@ default = []
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
async-compression.workspace = true
|
||||
base64.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
@@ -27,6 +28,7 @@ flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
indexmap.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
|
||||
@@ -60,12 +60,16 @@ use utils::failpoint_support;
|
||||
// Compatibility hack: if the control plane specified any remote-ext-config
|
||||
// use the default value for extension storage proxy gateway.
|
||||
// Remove this once the control plane is updated to pass the gateway URL
|
||||
fn parse_remote_ext_config(arg: &str) -> Result<String> {
|
||||
if arg.starts_with("http") {
|
||||
Ok(arg.trim_end_matches('/').to_string())
|
||||
fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
|
||||
const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
|
||||
"http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
|
||||
|
||||
Ok(if arg.starts_with("http") {
|
||||
arg
|
||||
} else {
|
||||
Ok("http://pg-ext-s3-gateway".to_string())
|
||||
FALLBACK_PG_EXT_GATEWAY_BASE_URL
|
||||
}
|
||||
.to_owned())
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -74,8 +78,10 @@ struct Cli {
|
||||
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
|
||||
pub pgbin: String,
|
||||
|
||||
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
|
||||
pub remote_ext_config: Option<String>,
|
||||
/// The base URL for the remote extension storage proxy gateway.
|
||||
/// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
|
||||
#[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
|
||||
/// The port to bind the external listening HTTP server to. Clients running
|
||||
/// outside the compute will talk to the compute through this port. Keep
|
||||
@@ -164,7 +170,7 @@ fn main() -> Result<()> {
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
ext_remote_storage: cli.remote_ext_config.clone(),
|
||||
remote_ext_base_url: cli.remote_ext_base_url.clone(),
|
||||
resize_swap_on_bind: cli.resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
|
||||
#[cfg(target_os = "linux")]
|
||||
@@ -265,4 +271,18 @@ mod test {
|
||||
fn verify_cli() {
|
||||
Cli::command().debug_assert()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_pg_ext_gateway_base_url() {
|
||||
let arg = "http://pg-ext-s3-gateway2";
|
||||
let result = super::parse_remote_ext_base_url(arg).unwrap();
|
||||
assert_eq!(result, arg);
|
||||
|
||||
let arg = "pg-ext-s3-gateway";
|
||||
let result = super::parse_remote_ext_base_url(arg).unwrap();
|
||||
assert_eq!(
|
||||
result,
|
||||
"http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -348,6 +348,7 @@ async fn run_dump_restore(
|
||||
"--no-security-labels".to_string(),
|
||||
"--no-subscriptions".to_string(),
|
||||
"--no-tablespaces".to_string(),
|
||||
"--no-event-triggers".to_string(),
|
||||
// format
|
||||
"--format".to_string(),
|
||||
"directory".to_string(),
|
||||
|
||||
@@ -1,4 +1,26 @@
|
||||
use std::collections::HashMap;
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
|
||||
LfcPrewarmState,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use itertools::Itertools;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::net::SocketAddr;
|
||||
use std::os::unix::fs::{PermissionsExt, symlink};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
@@ -7,24 +29,6 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use tokio::spawn;
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -92,7 +96,7 @@ pub struct ComputeNodeParams {
|
||||
pub internal_http_port: u16,
|
||||
|
||||
/// the address of extension storage proxy gateway
|
||||
pub ext_remote_storage: Option<String>,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
}
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
@@ -150,6 +154,9 @@ pub struct ComputeState {
|
||||
/// set up the span relationship ourselves.
|
||||
pub startup_span: Option<tracing::span::Span>,
|
||||
|
||||
pub lfc_prewarm_state: LfcPrewarmState,
|
||||
pub lfc_offload_state: LfcOffloadState,
|
||||
|
||||
pub metrics: ComputeMetrics,
|
||||
}
|
||||
|
||||
@@ -163,6 +170,8 @@ impl ComputeState {
|
||||
pspec: None,
|
||||
startup_span: None,
|
||||
metrics: ComputeMetrics::default(),
|
||||
lfc_prewarm_state: LfcPrewarmState::default(),
|
||||
lfc_offload_state: LfcOffloadState::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,6 +207,8 @@ pub struct ParsedSpec {
|
||||
pub pageserver_connstr: String,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
pub endpoint_storage_addr: Option<SocketAddr>,
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
}
|
||||
|
||||
impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
@@ -251,6 +262,18 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
.or(Err("invalid timeline id"))?
|
||||
};
|
||||
|
||||
let endpoint_storage_addr: Option<SocketAddr> = spec
|
||||
.endpoint_storage_addr
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"))
|
||||
.unwrap_or_default()
|
||||
.parse()
|
||||
.ok();
|
||||
let endpoint_storage_token = spec
|
||||
.endpoint_storage_token
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token"));
|
||||
|
||||
Ok(ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
@@ -258,6 +281,8 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
endpoint_storage_addr,
|
||||
endpoint_storage_token,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -305,11 +330,39 @@ struct StartVmMonitorResult {
|
||||
impl ComputeNode {
|
||||
pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
|
||||
let connstr = params.connstr.as_str();
|
||||
let conn_conf = postgres::config::Config::from_str(connstr)
|
||||
let mut conn_conf = postgres::config::Config::from_str(connstr)
|
||||
.context("cannot build postgres config from connstr")?;
|
||||
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr)
|
||||
let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr)
|
||||
.context("cannot build tokio postgres config from connstr")?;
|
||||
|
||||
// Users can set some configuration parameters per database with
|
||||
// ALTER DATABASE ... SET ...
|
||||
//
|
||||
// There are at least these parameters:
|
||||
//
|
||||
// - role=some_other_role
|
||||
// - default_transaction_read_only=on
|
||||
// - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail
|
||||
// - search_path=non_public_schema, this should be actually safe because
|
||||
// we don't call any functions in user databases, but better to always reset
|
||||
// it to public.
|
||||
//
|
||||
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
|
||||
// Unset them via connection string options before connecting to the database.
|
||||
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
|
||||
//
|
||||
// TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane
|
||||
// as well. After rolling out this code, we can remove this parameter from control plane.
|
||||
// In the meantime, double-passing is fine, the last value is applied.
|
||||
// See: <https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70>
|
||||
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
|
||||
let options = match conn_conf.get_options() {
|
||||
Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
|
||||
None => EXTRA_OPTIONS.to_string(),
|
||||
};
|
||||
conn_conf.options(&options);
|
||||
tokio_conn_conf.options(&options);
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
@@ -736,6 +789,9 @@ impl ComputeNode {
|
||||
// Log metrics so that we can search for slow operations in logs
|
||||
info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");
|
||||
|
||||
if pspec.spec.prewarm_lfc_on_startup {
|
||||
self.prewarm_lfc();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1422,15 +1478,20 @@ impl ComputeNode {
|
||||
Err(e) => match e.code() {
|
||||
Some(&SqlState::INVALID_PASSWORD)
|
||||
| Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
|
||||
// Connect with zenith_admin if cloud_admin could not authenticate
|
||||
// Connect with `zenith_admin` if `cloud_admin` could not authenticate
|
||||
info!(
|
||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||
"cannot connect to Postgres: {}, retrying with 'zenith_admin' username",
|
||||
e
|
||||
);
|
||||
let mut zenith_admin_conf = postgres::config::Config::from(conf.clone());
|
||||
zenith_admin_conf.application_name("compute_ctl:apply_config");
|
||||
zenith_admin_conf.user("zenith_admin");
|
||||
|
||||
// It doesn't matter what were the options before, here we just want
|
||||
// to connect and create a new superuser role.
|
||||
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
|
||||
zenith_admin_conf.options(ZENITH_OPTIONS);
|
||||
|
||||
let mut client =
|
||||
zenith_admin_conf.connect(NoTls)
|
||||
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
||||
@@ -1596,9 +1657,7 @@ impl ComputeNode {
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
let mut conf =
|
||||
tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap();
|
||||
conf.application_name("apply_config");
|
||||
let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure"));
|
||||
let conf = Arc::new(conf);
|
||||
|
||||
let spec = Arc::new(spec.clone());
|
||||
@@ -1838,9 +1897,9 @@ LIMIT 100",
|
||||
real_ext_name: String,
|
||||
ext_path: RemotePath,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let ext_remote_storage =
|
||||
let remote_ext_base_url =
|
||||
self.params
|
||||
.ext_remote_storage
|
||||
.remote_ext_base_url
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
@@ -1902,7 +1961,7 @@ LIMIT 100",
|
||||
let download_size = extension_server::download_extension(
|
||||
&real_ext_name,
|
||||
&ext_path,
|
||||
ext_remote_storage,
|
||||
remote_ext_base_url,
|
||||
&self.params.pgbin,
|
||||
)
|
||||
.await
|
||||
@@ -1937,23 +1996,40 @@ LIMIT 100",
|
||||
tokio::spawn(conn);
|
||||
|
||||
// TODO: support other types of grants apart from schemas?
|
||||
let query = format!(
|
||||
"GRANT {} ON SCHEMA {} TO {}",
|
||||
privileges
|
||||
.iter()
|
||||
// should not be quoted as it's part of the command.
|
||||
// is already sanitized so it's ok
|
||||
.map(|p| p.as_str())
|
||||
.collect::<Vec<&'static str>>()
|
||||
.join(", "),
|
||||
// quote the schema and role name as identifiers to sanitize them.
|
||||
schema_name.pg_quote(),
|
||||
role_name.pg_quote(),
|
||||
);
|
||||
db_client
|
||||
.simple_query(&query)
|
||||
|
||||
// check the role grants first - to gracefully handle read-replicas.
|
||||
let select = "SELECT privilege_type
|
||||
FROM pg_namespace
|
||||
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
|
||||
JOIN pg_user users ON acl.grantee = users.usesysid
|
||||
WHERE users.usename = $1
|
||||
AND nspname = $2";
|
||||
let rows = db_client
|
||||
.query(select, &[role_name, schema_name])
|
||||
.await
|
||||
.with_context(|| format!("Failed to execute query: {}", query))?;
|
||||
.with_context(|| format!("Failed to execute query: {select}"))?;
|
||||
|
||||
let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
|
||||
|
||||
let grants = privileges
|
||||
.iter()
|
||||
.filter(|p| !already_granted.contains(p.as_str()))
|
||||
// should not be quoted as it's part of the command.
|
||||
// is already sanitized so it's ok
|
||||
.map(|p| p.as_str())
|
||||
.join(", ");
|
||||
|
||||
if !grants.is_empty() {
|
||||
// quote the schema and role name as identifiers to sanitize them.
|
||||
let schema_name = schema_name.pg_quote();
|
||||
let role_name = role_name.pg_quote();
|
||||
|
||||
let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
|
||||
db_client
|
||||
.simple_query(&query)
|
||||
.await
|
||||
.with_context(|| format!("Failed to execute query: {}", query))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2011,7 +2087,7 @@ LIMIT 100",
|
||||
&self,
|
||||
spec: &ComputeSpec,
|
||||
) -> Result<RemoteExtensionMetrics> {
|
||||
if self.params.ext_remote_storage.is_none() {
|
||||
if self.params.remote_ext_base_url.is_none() {
|
||||
return Ok(RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
|
||||
202
compute_tools/src/compute_prewarm.rs
Normal file
202
compute_tools/src/compute_prewarm.rs
Normal file
@@ -0,0 +1,202 @@
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::{Context, Result, bail};
|
||||
use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
|
||||
use compute_api::responses::LfcOffloadState;
|
||||
use compute_api::responses::LfcPrewarmState;
|
||||
use http::StatusCode;
|
||||
use reqwest::Client;
|
||||
use std::sync::Arc;
|
||||
use tokio::{io::AsyncReadExt, spawn};
|
||||
use tracing::{error, info};
|
||||
|
||||
#[derive(serde::Serialize, Default)]
|
||||
pub struct LfcPrewarmStateWithProgress {
|
||||
#[serde(flatten)]
|
||||
base: LfcPrewarmState,
|
||||
total: i32,
|
||||
prewarmed: i32,
|
||||
skipped: i32,
|
||||
}
|
||||
|
||||
/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks
|
||||
struct EndpointStoragePair {
|
||||
url: String,
|
||||
token: String,
|
||||
}
|
||||
|
||||
const KEY: &str = "lfc_state";
|
||||
impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair {
|
||||
type Error = anyhow::Error;
|
||||
fn try_from(pspec: &crate::compute::ParsedSpec) -> Result<Self, Self::Error> {
|
||||
let Some(ref endpoint_id) = pspec.spec.endpoint_id else {
|
||||
bail!("pspec.endpoint_id missing")
|
||||
};
|
||||
let Some(ref base_uri) = pspec.endpoint_storage_addr else {
|
||||
bail!("pspec.endpoint_storage_addr missing")
|
||||
};
|
||||
let tenant_id = pspec.tenant_id;
|
||||
let timeline_id = pspec.timeline_id;
|
||||
|
||||
let url = format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}");
|
||||
let Some(ref token) = pspec.endpoint_storage_token else {
|
||||
bail!("pspec.endpoint_storage_token missing")
|
||||
};
|
||||
let token = token.clone();
|
||||
Ok(EndpointStoragePair { url, token })
|
||||
}
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
// If prewarm failed, we want to get overall number of segments as well as done ones.
|
||||
// However, this function should be reliable even if querying postgres failed.
|
||||
pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress {
|
||||
info!("requesting LFC prewarm state from postgres");
|
||||
let mut state = LfcPrewarmStateWithProgress::default();
|
||||
{
|
||||
state.base = self.state.lock().unwrap().lfc_prewarm_state.clone();
|
||||
}
|
||||
|
||||
let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await {
|
||||
Ok(client) => client,
|
||||
Err(err) => {
|
||||
error!(%err, "connecting to postgres");
|
||||
return state;
|
||||
}
|
||||
};
|
||||
let row = match client
|
||||
.query_one("select * from get_prewarm_info()", &[])
|
||||
.await
|
||||
{
|
||||
Ok(row) => row,
|
||||
Err(err) => {
|
||||
error!(%err, "querying LFC prewarm status");
|
||||
return state;
|
||||
}
|
||||
};
|
||||
state.total = row.try_get(0).unwrap_or_default();
|
||||
state.prewarmed = row.try_get(1).unwrap_or_default();
|
||||
state.skipped = row.try_get(2).unwrap_or_default();
|
||||
state
|
||||
}
|
||||
|
||||
pub fn lfc_offload_state(&self) -> LfcOffloadState {
|
||||
self.state.lock().unwrap().lfc_offload_state.clone()
|
||||
}
|
||||
|
||||
/// Returns false if there is a prewarm request ongoing, true otherwise
|
||||
pub fn prewarm_lfc(self: &Arc<Self>) -> bool {
|
||||
crate::metrics::LFC_PREWARM_REQUESTS.inc();
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
||||
if let LfcPrewarmState::Prewarming =
|
||||
std::mem::replace(state, LfcPrewarmState::Prewarming)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move {
|
||||
let Err(err) = cloned.prewarm_impl().await else {
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
|
||||
return;
|
||||
};
|
||||
error!(%err);
|
||||
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
});
|
||||
true
|
||||
}
|
||||
|
||||
fn endpoint_storage_pair(&self) -> Result<EndpointStoragePair> {
|
||||
let state = self.state.lock().unwrap();
|
||||
state.pspec.as_ref().unwrap().try_into()
|
||||
}
|
||||
|
||||
async fn prewarm_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
|
||||
info!(%url, "requesting LFC state from endpoint storage");
|
||||
|
||||
let request = Client::new().get(&url).bearer_auth(token);
|
||||
let res = request.send().await.context("querying endpoint storage")?;
|
||||
let status = res.status();
|
||||
if status != StatusCode::OK {
|
||||
bail!("{status} querying endpoint storage")
|
||||
}
|
||||
|
||||
let mut uncompressed = Vec::new();
|
||||
let lfc_state = res
|
||||
.bytes()
|
||||
.await
|
||||
.context("getting request body from endpoint storage")?;
|
||||
ZstdDecoder::new(lfc_state.iter().as_slice())
|
||||
.read_to_end(&mut uncompressed)
|
||||
.await
|
||||
.context("decoding LFC state")?;
|
||||
let uncompressed_len = uncompressed.len();
|
||||
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres");
|
||||
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select prewarm_local_cache($1)", &[&uncompressed])
|
||||
.await
|
||||
.context("loading LFC state into postgres")
|
||||
.map(|_| ())
|
||||
}
|
||||
|
||||
/// Returns false if there is an offload request ongoing, true otherwise
|
||||
pub fn offload_lfc(self: &Arc<Self>) -> bool {
|
||||
crate::metrics::LFC_OFFLOAD_REQUESTS.inc();
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_offload_state;
|
||||
if let LfcOffloadState::Offloading =
|
||||
std::mem::replace(state, LfcOffloadState::Offloading)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
let cloned = self.clone();
|
||||
spawn(async move {
|
||||
let Err(err) = cloned.offload_lfc_impl().await else {
|
||||
cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
|
||||
return;
|
||||
};
|
||||
error!(%err);
|
||||
cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
||||
error: err.to_string(),
|
||||
};
|
||||
});
|
||||
true
|
||||
}
|
||||
|
||||
async fn offload_lfc_impl(&self) -> Result<()> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
|
||||
info!(%url, "requesting LFC state from postgres");
|
||||
|
||||
let mut compressed = Vec::new();
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select get_local_cache_state()", &[])
|
||||
.await
|
||||
.context("querying LFC state")?
|
||||
.try_get::<usize, &[u8]>(0)
|
||||
.context("deserializing LFC state")
|
||||
.map(ZstdEncoder::new)?
|
||||
.read_to_end(&mut compressed)
|
||||
.await
|
||||
.context("compressing LFC state")?;
|
||||
let compressed_len = compressed.len();
|
||||
info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");
|
||||
|
||||
let request = Client::new().put(url).bearer_auth(token).body(compressed);
|
||||
match request.send().await {
|
||||
Ok(res) if res.status() == StatusCode::OK => Ok(()),
|
||||
Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()),
|
||||
Err(err) => Err(err).context("writing to endpoint storage"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -223,6 +223,12 @@ pub fn write_postgres_conf(
|
||||
// TODO: tune this after performance testing
|
||||
writeln!(file, "pgaudit.log_rotation_age=5")?;
|
||||
|
||||
// Enable audit logs for pg_session_jwt extension
|
||||
// TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as
|
||||
// pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863
|
||||
//
|
||||
// writeln!(file, "pg_session_jwt.audit_log=on")?;
|
||||
|
||||
// Add audit shared_preload_libraries, if they are not present.
|
||||
//
|
||||
// The caller who sets the flag is responsible for ensuring that the necessary
|
||||
|
||||
@@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
ext_remote_storage: &str,
|
||||
remote_ext_base_url: &str,
|
||||
pgbin: &str,
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
|
||||
// TODO add retry logic
|
||||
let download_buffer =
|
||||
match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
|
||||
match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
|
||||
Ok(buffer) => buffer,
|
||||
Err(error_message) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
@@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
// Do request to extension storage proxy, e.g.,
|
||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
||||
// using HTTP GET and return the response body as bytes.
|
||||
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", ext_remote_storage, ext_path);
|
||||
async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", remote_ext_base_url, ext_path);
|
||||
let filename = Path::new(ext_path)
|
||||
.file_name()
|
||||
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use anyhow::{Result, anyhow};
|
||||
use axum::{RequestExt, body::Body};
|
||||
use axum_extra::{
|
||||
TypedHeader,
|
||||
headers::{Authorization, authorization::Bearer},
|
||||
};
|
||||
use compute_api::requests::ComputeClaims;
|
||||
use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope};
|
||||
use futures::future::BoxFuture;
|
||||
use http::{Request, Response, StatusCode};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
|
||||
@@ -25,13 +23,14 @@ pub(in crate::http) struct Authorize {
|
||||
impl Authorize {
|
||||
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
|
||||
let mut validation = Validation::new(Algorithm::EdDSA);
|
||||
// Nothing is currently required
|
||||
validation.required_spec_claims = HashSet::new();
|
||||
validation.validate_exp = true;
|
||||
// Unused by the control plane
|
||||
validation.validate_aud = false;
|
||||
// Unused by the control plane
|
||||
validation.validate_nbf = false;
|
||||
// Unused by the control plane
|
||||
validation.validate_aud = false;
|
||||
validation.set_audience(&[COMPUTE_AUDIENCE]);
|
||||
// Nothing is currently required
|
||||
validation.set_required_spec_claims(&[] as &[&str; 0]);
|
||||
|
||||
Self {
|
||||
compute_id,
|
||||
@@ -64,11 +63,47 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
|
||||
};
|
||||
|
||||
if data.claims.compute_id != compute_id {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::UNAUTHORIZED,
|
||||
"invalid compute ID in authorization token claims",
|
||||
));
|
||||
match data.claims.scope {
|
||||
// TODO: We should validate audience for every token, but
|
||||
// instead of this ad-hoc validation, we should turn
|
||||
// [`Validation::validate_aud`] on. This is merely a stopgap
|
||||
// while we roll out `aud` deployment. We return a 401
|
||||
// Unauthorized because when we eventually do use
|
||||
// [`Validation`], we will hit the above `Err` match arm which
|
||||
// returns 401 Unauthorized.
|
||||
Some(ComputeClaimsScope::Admin) => {
|
||||
let Some(ref audience) = data.claims.audience else {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::UNAUTHORIZED,
|
||||
"missing audience in authorization token claims",
|
||||
));
|
||||
};
|
||||
|
||||
if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::UNAUTHORIZED,
|
||||
"invalid audience in authorization token claims",
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// If the scope is not [`ComputeClaimsScope::Admin`], then we
|
||||
// must validate the compute_id
|
||||
_ => {
|
||||
let Some(ref claimed_compute_id) = data.claims.compute_id else {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::FORBIDDEN,
|
||||
"missing compute_id in authorization token claims",
|
||||
));
|
||||
};
|
||||
|
||||
if *claimed_compute_id != compute_id {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::FORBIDDEN,
|
||||
"invalid compute ID in authorization token claims",
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Make claims available to any subsequent middleware or request
|
||||
|
||||
@@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
// Don't even try to download extensions if no remote storage is configured
|
||||
if compute.params.ext_remote_storage.is_none() {
|
||||
if compute.params.remote_ext_base_url.is_none() {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"remote storage is not configured",
|
||||
|
||||
39
compute_tools/src/http/routes/lfc.rs
Normal file
39
compute_tools/src/http/routes/lfc.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use crate::compute_prewarm::LfcPrewarmStateWithProgress;
|
||||
use crate::http::JsonResponse;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{Json, http::StatusCode};
|
||||
use compute_api::responses::LfcOffloadState;
|
||||
type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;
|
||||
|
||||
pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmStateWithProgress> {
|
||||
Json(compute.lfc_prewarm_state().await)
|
||||
}
|
||||
|
||||
// Following functions are marked async for axum, as it's more convenient than wrapping these
|
||||
// in async lambdas at call site
|
||||
|
||||
pub(in crate::http) async fn offload_state(compute: Compute) -> Json<LfcOffloadState> {
|
||||
Json(compute.lfc_offload_state())
|
||||
}
|
||||
|
||||
pub(in crate::http) async fn prewarm(compute: Compute) -> Response {
|
||||
if compute.prewarm_lfc() {
|
||||
StatusCode::ACCEPTED.into_response()
|
||||
} else {
|
||||
JsonResponse::error(
|
||||
StatusCode::TOO_MANY_REQUESTS,
|
||||
"Multiple requests for prewarm are not allowed",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(in crate::http) async fn offload(compute: Compute) -> Response {
|
||||
if compute.offload_lfc() {
|
||||
StatusCode::ACCEPTED.into_response()
|
||||
} else {
|
||||
JsonResponse::error(
|
||||
StatusCode::TOO_MANY_REQUESTS,
|
||||
"Multiple requests for prewarm offload are not allowed",
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -11,6 +11,7 @@ pub(in crate::http) mod extensions;
|
||||
pub(in crate::http) mod failpoints;
|
||||
pub(in crate::http) mod grants;
|
||||
pub(in crate::http) mod insights;
|
||||
pub(in crate::http) mod lfc;
|
||||
pub(in crate::http) mod metrics;
|
||||
pub(in crate::http) mod metrics_json;
|
||||
pub(in crate::http) mod status;
|
||||
|
||||
@@ -23,7 +23,7 @@ use super::{
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, metrics, metrics_json, status, terminate,
|
||||
grants, insights, lfc, metrics, metrics_json, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
@@ -85,6 +85,8 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
|
||||
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
|
||||
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
|
||||
@@ -11,6 +11,7 @@ pub mod http;
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod compute_prewarm;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod installed_extensions;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge};
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::{
|
||||
IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter,
|
||||
IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -97,6 +97,24 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm.
|
||||
/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm
|
||||
pub(crate) static LFC_PREWARM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_prewarm_requests_total",
|
||||
"Total number of LFC prewarm requests made by compute_ctl",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"compute_ctl_lfc_offload_requests_total",
|
||||
"Total number of LFC offload requests made by compute_ctl",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn collect() -> Vec<MetricFamily> {
|
||||
let mut metrics = COMPUTE_CTL_UP.collect();
|
||||
metrics.extend(INSTALLED_EXTENSIONS.collect());
|
||||
@@ -106,5 +124,7 @@ pub fn collect() -> Vec<MetricFamily> {
|
||||
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
|
||||
metrics.extend(PG_CURR_DOWNTIME_MS.collect());
|
||||
metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
|
||||
metrics.extend(LFC_PREWARM_REQUESTS.collect());
|
||||
metrics.extend(LFC_OFFLOAD_REQUESTS.collect());
|
||||
metrics
|
||||
}
|
||||
|
||||
@@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
experimental,
|
||||
};
|
||||
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || {
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
let _enter = span.enter();
|
||||
monitor.run();
|
||||
})
|
||||
|
||||
@@ -30,6 +30,7 @@ mod pg_helpers_tests {
|
||||
r#"fsync = off
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
prewarm_lfc_on_startup = off
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
|
||||
@@ -41,7 +41,7 @@ storage_broker.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
whoami.workspace = true
|
||||
|
||||
endpoint_storage.workspace = true
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -16,10 +16,11 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf,
|
||||
@@ -643,9 +644,10 @@ struct EndpointStartCmdArgs {
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Configure the remote extensions storage proxy gateway to request for extensions."
|
||||
help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
|
||||
alias = "remote-ext-config"
|
||||
)]
|
||||
remote_ext_config: Option<String>,
|
||||
remote_ext_base_url: Option<String>,
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
@@ -705,6 +707,9 @@ struct EndpointStopCmdArgs {
|
||||
struct EndpointGenerateJwtCmdArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
endpoint_id: String,
|
||||
|
||||
#[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)]
|
||||
scope: Option<ComputeClaimsScope>,
|
||||
}
|
||||
|
||||
#[derive(clap::Subcommand)]
|
||||
@@ -1018,7 +1023,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
})
|
||||
.collect(),
|
||||
endpoint_storage: EndpointStorageConf {
|
||||
port: ENDPOINT_STORAGE_DEFAULT_PORT,
|
||||
listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR,
|
||||
},
|
||||
pg_distrib_dir: None,
|
||||
neon_distrib_dir: None,
|
||||
@@ -1410,9 +1415,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
EndpointCmd::Start(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let pageserver_id = args.endpoint_pageserver_id;
|
||||
let remote_ext_config = &args.remote_ext_config;
|
||||
let remote_ext_base_url = &args.remote_ext_base_url;
|
||||
|
||||
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
|
||||
let default_generation = env
|
||||
.storage_controller
|
||||
.timelines_onto_safekeepers
|
||||
.then_some(1);
|
||||
let safekeepers_generation = args
|
||||
.safekeepers_generation
|
||||
.or(default_generation)
|
||||
.map(SafekeeperGeneration::new);
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
|
||||
@@ -1484,14 +1496,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
None
|
||||
};
|
||||
|
||||
let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?
|
||||
+ Duration::from_secs(86400))
|
||||
.as_secs();
|
||||
let claims = endpoint_storage::claims::EndpointStorageClaims {
|
||||
tenant_id: endpoint.tenant_id,
|
||||
timeline_id: endpoint.timeline_id,
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
exp,
|
||||
};
|
||||
|
||||
let endpoint_storage_token = env.generate_auth_token(&claims)?;
|
||||
let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string();
|
||||
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint
|
||||
.start(
|
||||
&auth_token,
|
||||
endpoint_storage_token,
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_config.as_ref(),
|
||||
remote_ext_base_url.as_ref(),
|
||||
stripe_size.0 as usize,
|
||||
args.create_test_user,
|
||||
args.start_timeout,
|
||||
@@ -1540,12 +1567,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
endpoint.stop(&args.mode, args.destroy)?;
|
||||
}
|
||||
EndpointCmd::GenerateJwt(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id)
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let jwt = endpoint.generate_jwt()?;
|
||||
let endpoint = {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
|
||||
cplane
|
||||
.endpoints
|
||||
.get(endpoint_id)
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?
|
||||
};
|
||||
|
||||
let jwt = endpoint.generate_jwt(args.scope)?;
|
||||
|
||||
print!("{jwt}");
|
||||
}
|
||||
|
||||
@@ -45,7 +45,9 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use compute_api::requests::{ComputeClaims, ConfigurationRequest};
|
||||
use compute_api::requests::{
|
||||
COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest,
|
||||
};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
|
||||
};
|
||||
@@ -630,9 +632,17 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
/// Generate a JWT with the correct claims.
|
||||
pub fn generate_jwt(&self) -> Result<String> {
|
||||
pub fn generate_jwt(&self, scope: Option<ComputeClaimsScope>) -> Result<String> {
|
||||
self.env.generate_auth_token(&ComputeClaims {
|
||||
compute_id: self.endpoint_id.clone(),
|
||||
audience: match scope {
|
||||
Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]),
|
||||
_ => None,
|
||||
},
|
||||
compute_id: match scope {
|
||||
Some(ComputeClaimsScope::Admin) => None,
|
||||
_ => Some(self.endpoint_id.clone()),
|
||||
},
|
||||
scope,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -640,10 +650,12 @@ impl Endpoint {
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
endpoint_storage_token: String,
|
||||
endpoint_storage_addr: String,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
remote_ext_base_url: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
start_timeout: Duration,
|
||||
@@ -733,6 +745,9 @@ impl Endpoint {
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
endpoint_storage_addr: Some(endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(endpoint_storage_token),
|
||||
prewarm_lfc_on_startup: false,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -810,8 +825,8 @@ impl Endpoint {
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
|
||||
if let Some(remote_ext_config) = remote_ext_config {
|
||||
cmd.args(["--remote-ext-config", remote_ext_config]);
|
||||
if let Some(remote_ext_base_url) = remote_ext_base_url {
|
||||
cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
@@ -903,7 +918,7 @@ impl Endpoint {
|
||||
self.external_http_address.port()
|
||||
),
|
||||
)
|
||||
.bearer_auth(self.generate_jwt()?)
|
||||
.bearer_auth(self.generate_jwt(None::<ComputeClaimsScope>)?)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
@@ -980,7 +995,7 @@ impl Endpoint {
|
||||
self.external_http_address.port()
|
||||
))
|
||||
.header(CONTENT_TYPE.as_str(), "application/json")
|
||||
.bearer_auth(self.generate_jwt()?)
|
||||
.bearer_auth(self.generate_jwt(None::<ComputeClaimsScope>)?)
|
||||
.body(
|
||||
serde_json::to_string(&ConfigurationRequest {
|
||||
spec,
|
||||
|
||||
@@ -3,17 +3,19 @@ use crate::local_env::LocalEnv;
|
||||
use anyhow::{Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use std::io::Write;
|
||||
use std::net::SocketAddr;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
||||
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage";
|
||||
pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993;
|
||||
pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr =
|
||||
SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993);
|
||||
|
||||
pub struct EndpointStorage {
|
||||
pub bin: Utf8PathBuf,
|
||||
pub data_dir: Utf8PathBuf,
|
||||
pub pemfile: Utf8PathBuf,
|
||||
pub port: u16,
|
||||
pub addr: SocketAddr,
|
||||
}
|
||||
|
||||
impl EndpointStorage {
|
||||
@@ -22,7 +24,7 @@ impl EndpointStorage {
|
||||
bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(),
|
||||
data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(),
|
||||
pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
|
||||
port: env.endpoint_storage.port,
|
||||
addr: env.endpoint_storage.listen_addr,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +33,7 @@ impl EndpointStorage {
|
||||
}
|
||||
|
||||
fn listen_addr(&self) -> Utf8PathBuf {
|
||||
format!("127.0.0.1:{}", self.port).into()
|
||||
format!("{}:{}", self.addr.ip(), self.addr.port()).into()
|
||||
}
|
||||
|
||||
pub fn init(&self) -> Result<()> {
|
||||
|
||||
@@ -20,7 +20,9 @@ use utils::auth::encode_from_key_file;
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use crate::broker::StorageBroker;
|
||||
use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage};
|
||||
use crate::endpoint_storage::{
|
||||
ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage,
|
||||
};
|
||||
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
@@ -151,10 +153,10 @@ pub struct NeonLocalInitConf {
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct EndpointStorageConf {
|
||||
pub port: u16,
|
||||
pub listen_addr: SocketAddr,
|
||||
}
|
||||
|
||||
/// Broker config for cluster internal communication.
|
||||
@@ -241,6 +243,14 @@ impl Default for NeonStorageControllerConf {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for EndpointStorageConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl NeonBroker {
|
||||
pub fn client_url(&self) -> Url {
|
||||
let url = if let Some(addr) = self.listen_https_addr {
|
||||
|
||||
@@ -112,7 +112,7 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
/// Initializes a safekeeper node by creating all necessary files,
|
||||
/// e.g. SSL certificates.
|
||||
/// e.g. SSL certificates and JWT token file.
|
||||
pub fn initialize(&self) -> anyhow::Result<()> {
|
||||
if self.env.generate_local_ssl_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
@@ -120,6 +120,17 @@ impl SafekeeperNode {
|
||||
&self.datadir_path().join("server.key"),
|
||||
)?;
|
||||
}
|
||||
|
||||
// Generate a token file for authentication with other safekeepers
|
||||
if self.conf.auth_enabled {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
|
||||
let token_path = self.datadir_path().join("peer_jwt_token");
|
||||
std::fs::write(token_path, token)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -218,14 +229,26 @@ impl SafekeeperNode {
|
||||
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
|
||||
}
|
||||
|
||||
if self.conf.auth_enabled {
|
||||
let token_path = self.datadir_path().join("peer_jwt_token");
|
||||
let token_path_str = token_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Token path {token_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned();
|
||||
args.extend(["--auth-token-path".to_owned(), token_path_str]);
|
||||
}
|
||||
|
||||
args.extend_from_slice(extra_opts);
|
||||
|
||||
let env_variables = Vec::new();
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
self.safekeeper_env_variables()?,
|
||||
env_variables,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
@@ -239,18 +262,6 @@ impl SafekeeperNode {
|
||||
.await
|
||||
}
|
||||
|
||||
fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
||||
// Generate a token to connect from safekeeper to peers
|
||||
if self.conf.auth_enabled {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
|
||||
@@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper0::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest,
|
||||
SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
@@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use pem::Pem;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use reqwest::{Method, Response};
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::process::Command;
|
||||
@@ -570,6 +571,11 @@ impl StorageController {
|
||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
||||
.expect("failed to generate jwt token");
|
||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
||||
|
||||
let claims = Claims::new(None, Scope::SafekeeperData);
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--safekeeper-jwt-token={jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key) = &self.public_key {
|
||||
@@ -614,6 +620,10 @@ impl StorageController {
|
||||
self.env.base_data_dir.display()
|
||||
));
|
||||
|
||||
if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() {
|
||||
anyhow::bail!("Safekeeper set up for auth but no private key specified");
|
||||
}
|
||||
|
||||
if self.config.timelines_onto_safekeepers {
|
||||
args.push("--timelines-onto-safekeepers".to_string());
|
||||
}
|
||||
@@ -640,6 +650,10 @@ impl StorageController {
|
||||
)
|
||||
.await?;
|
||||
|
||||
if self.config.timelines_onto_safekeepers {
|
||||
self.register_safekeepers().await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -743,6 +757,23 @@ impl StorageController {
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
let response = self.dispatch_inner(method, path, body).await?;
|
||||
Ok(response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch_inner<RQ>(
|
||||
&self,
|
||||
method: reqwest::Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> anyhow::Result<Response>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
{
|
||||
// In the special case of the `storage_controller start` subcommand, we wish
|
||||
// to use the API endpoint of the newly started storage controller in order
|
||||
@@ -785,10 +816,31 @@ impl StorageController {
|
||||
let response = builder.send().await?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
Ok(response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Register the safekeepers in the storage controller
|
||||
#[instrument(skip(self))]
|
||||
async fn register_safekeepers(&self) -> anyhow::Result<()> {
|
||||
for sk in self.env.safekeepers.iter() {
|
||||
let sk_id = sk.id;
|
||||
let body = serde_json::json!({
|
||||
"id": sk_id,
|
||||
"created_at": "2023-10-25T09:11:25Z",
|
||||
"updated_at": "2024-08-28T11:32:43Z",
|
||||
"region_id": "aws-us-east-2",
|
||||
"host": "127.0.0.1",
|
||||
"port": sk.pg_port,
|
||||
"http_port": sk.http_port,
|
||||
"https_port": sk.https_port,
|
||||
"version": 5957,
|
||||
"availability_zone_id": format!("us-east-2b-{sk_id}"),
|
||||
});
|
||||
self.upsert_safekeeper(sk_id, body).await?;
|
||||
self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call into the attach_hook API, for use before handing out attachments to pageservers
|
||||
@@ -816,6 +868,42 @@ impl StorageController {
|
||||
Ok(response.generation)
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn upsert_safekeeper(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
request: serde_json::Value,
|
||||
) -> anyhow::Result<()> {
|
||||
let resp = self
|
||||
.dispatch_inner::<serde_json::Value>(
|
||||
Method::POST,
|
||||
format!("control/v1/safekeeper/{node_id}"),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
if !resp.status().is_success() {
|
||||
anyhow::bail!(
|
||||
"setting scheduling policy unsuccessful for safekeeper {node_id}: {}",
|
||||
resp.status()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn safekeeper_scheduling_policy(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
scheduling_policy: SkSchedulingPolicy,
|
||||
) -> anyhow::Result<()> {
|
||||
self.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
|
||||
Method::POST,
|
||||
format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
|
||||
Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn inspect(
|
||||
&self,
|
||||
|
||||
@@ -14,6 +14,14 @@ PG_VERSION=${PG_VERSION:-14}
|
||||
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
CONFIG_FILE=/tmp/config.json
|
||||
|
||||
# Test that the first library path that the dynamic loader looks in is the path
|
||||
# that we use for custom compiled software
|
||||
first_path="$(ldconfig --verbose 2>/dev/null \
|
||||
| grep --invert-match ^$'\t' \
|
||||
| cut --delimiter=: --fields=1 \
|
||||
| head --lines=1)"
|
||||
test "$first_path" == '/usr/local/lib' || true # Remove the || true in a follow-up PR. Needed for backwards compat.
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
sleep 1;
|
||||
|
||||
@@ -12,6 +12,7 @@ ERROR: invalid JWT encoding
|
||||
-- Test creating a session with an expired JWT
|
||||
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
|
||||
ERROR: Token used after it has expired
|
||||
DETAIL: exp=1742564432
|
||||
-- Test creating a session with a valid JWT
|
||||
SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
|
||||
jwt_session_init
|
||||
|
||||
@@ -3,3 +3,6 @@ pg_distrib_dir='/usr/local/'
|
||||
listen_pg_addr='0.0.0.0:6400'
|
||||
listen_http_addr='0.0.0.0:9898'
|
||||
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
||||
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
|
||||
control_plane_emergency_mode=true
|
||||
virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks
|
||||
|
||||
@@ -38,11 +38,6 @@ Currently, the following metrics are collected:
|
||||
Amount of WAL produced , by a timeline, i.e. last_record_lsn
|
||||
This is an absolute, per-timeline metric.
|
||||
|
||||
- `resident_size`
|
||||
|
||||
Size of all the layer files in the tenant's directory on disk on the pageserver.
|
||||
This is an absolute, per-tenant metric.
|
||||
|
||||
- `remote_storage_size`
|
||||
|
||||
Size of the remote storage (S3) directory.
|
||||
|
||||
@@ -7,6 +7,8 @@ Author: Christian Schwarz
|
||||
|
||||
A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
|
||||
|
||||
**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link).
|
||||
|
||||
# Motivation
|
||||
|
||||
During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
|
||||
|
||||
362
docs/rfcs/2025-04-30-direct-io-for-pageserver.md
Normal file
362
docs/rfcs/2025-04-30-direct-io-for-pageserver.md
Normal file
@@ -0,0 +1,362 @@
|
||||
# Direct IO For Pageserver
|
||||
|
||||
Date: Apr 30, 2025
|
||||
|
||||
## Summary
|
||||
|
||||
This document is a retroactive RFC. It
|
||||
- provides some background on what direct IO is,
|
||||
- motivates why Pageserver should be using it for its IO, and
|
||||
- describes how we changed Pageserver to use it.
|
||||
|
||||
The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR.
|
||||
|
||||
People primarily involved in this project were:
|
||||
- Yuchen Liang <yuchen@neon.tech>
|
||||
- Vlad Lazar <vlad@neon.tech>
|
||||
- Christian Schwarz <christian@neon.tech>
|
||||
|
||||
## Timeline
|
||||
|
||||
For posterity, here is the rough timeline of the development work that got us to where we are today.
|
||||
|
||||
- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API
|
||||
- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode
|
||||
- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks
|
||||
- Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests
|
||||
- Apr to June 2024: [Epic: bypass PageCache for use data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users
|
||||
- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go.
|
||||
- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376))
|
||||
- Feb to March 2024: rollout batching, then concurrent+direct IO => read path and InMemoryLayer is now direct IO
|
||||
- Apr 2025: develop & roll out direct IO for the write path
|
||||
|
||||
## Background: Terminology & Glossary
|
||||
|
||||
**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents.
|
||||
The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k).
|
||||
The cache lives in kernel memory and is not directly accessible through userspace.
|
||||
|
||||
**Buffered IO**: an application's read/write system calls go through the kernel page cache.
|
||||
For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents
|
||||
at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict
|
||||
a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes
|
||||
from/to the offset `4` (`5000 = 4096 + 4`) within the cached page. If it's a write, the kernel keeps
|
||||
track of the fact that the page is now "dirty" in some ancillary structure.
|
||||
|
||||
**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications
|
||||
made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel
|
||||
asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant
|
||||
ones are a) explicit request by userspace (`fsync`) and b) memory pressure.
|
||||
|
||||
**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity.
|
||||
If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations.
|
||||
Before reusing a page like that, the page has to be written back (writeback, see above).
|
||||
The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only
|
||||
way to get that memory is by eviction & re-using a dirty page cache page.
|
||||
Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`.
|
||||
I refer to this effect as the "malloc latency backscatter" caused by buffered IO.
|
||||
|
||||
**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem
|
||||
is still involved because it is ultimately in charge of mapping the concept of files & offsets within them
|
||||
to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers
|
||||
and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155).
|
||||
The IO operations will fail at runtime with EINVAL if the alignment requirements are not met.
|
||||
|
||||
**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and
|
||||
fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers,
|
||||
kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by
|
||||
the application.
|
||||
It takes more effort by the application to program with direct instead of buffered IO.
|
||||
The return is precise control over and a clear distinction between consumption/modification of memory vs disk.
|
||||
|
||||
**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache").
|
||||
Its caching unit is 8KiB blocks of the layer files written by Pageserver.
|
||||
A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer.
|
||||
The default size is tiny (64MiB), very much like Postgres's `shared_buffers`.
|
||||
We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year.
|
||||
|
||||
**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name.
|
||||
Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux.
|
||||
However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of
|
||||
IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`).
|
||||
|
||||
## Background: History Of Caching In Pageserver
|
||||
|
||||
For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO.
|
||||
It performed write-back to the kernel using buffered IO.
|
||||
|
||||
We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994).
|
||||
|
||||
The introduction of `tokio-epoll-uring` required converting the code base to used owned IO buffers.
|
||||
The `PageCache` pages are usable as owned IO buffers.
|
||||
|
||||
We then started bypassing PageCache for user data blocks.
|
||||
Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets.
|
||||
The disk btree embedded in delta & image layers remains `PageCache`'d.
|
||||
Epics for that work were:
|
||||
- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright.
|
||||
- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks:
|
||||
- Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice)
|
||||
- InMemoryLayer
|
||||
- Compaction
|
||||
|
||||
The outcome of the above:
|
||||
1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache).
|
||||
2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`.
|
||||
|
||||
In production we size the PS `PageCache` to be 2GiB.
|
||||
Thus drives hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines.
|
||||
High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS).
|
||||
The response to this is to migrate tenants away, or increase PS `PageCache` size.
|
||||
It is currently manual but could be automated, e.g., in Storage Controller.
|
||||
|
||||
In the future, we may eliminate the `PageCache` even for indirect blocks.
|
||||
For example with an LRU cache that has as unit the entire disk btree content
|
||||
instead of individual blocks.
|
||||
|
||||
## High-Level Design
|
||||
|
||||
So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache.
|
||||
We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem.
|
||||
This achieves the following system properties:
|
||||
|
||||
**Predictable VirtualFile latencies**
|
||||
* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss.
|
||||
* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure.
|
||||
* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe.
|
||||
But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting to some degree.
|
||||
* By switching to direct IO, above operations will have the (predictable) device latency -- always.
|
||||
Reads and appends always go to disk.
|
||||
And malloc will not have to write back dirty data.
|
||||
|
||||
**Explicitness & Tangibility of resource usage**
|
||||
* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant.
|
||||
* By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control.
|
||||
* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?").
|
||||
* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that.
|
||||
|
||||
**CPU Efficiency**
|
||||
* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path.
|
||||
* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements.
|
||||
|
||||
The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are:
|
||||
- read latency improvements for repeat reads of the same data ("locality of reference")
|
||||
- asterisk: only if that state is still cache-resident by time of next access
|
||||
- write throughput by having kernel page cache batch small VFS writes into bigger disk writes
|
||||
- asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback
|
||||
|
||||
We are **happy to make this trade-off**:
|
||||
- Because of the advantages listed above.
|
||||
- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache.
|
||||
(At just 2GiB PS PageCache size, we average a 99.95% hit rate).
|
||||
So, the latency of going to disk is only for data block reads, not the index traversal.
|
||||
- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance).
|
||||
And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it.
|
||||
(See the appendix for a more detailed explanation why this is).
|
||||
- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before.
|
||||
|
||||
### Desired End State
|
||||
|
||||
The desired end state of the project is as follows, and with some asterisks, we have achieved it.
|
||||
|
||||
All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache.
|
||||
|
||||
In particular, the "data path" includes
|
||||
- the wal ingest path
|
||||
- compaction
|
||||
- anything on the `Timeline::get` / `Timeline::get_vectored` path.
|
||||
|
||||
The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache.
|
||||
Hit rate target is 99.95%.
|
||||
|
||||
There are no regressions to ingest latency.
|
||||
|
||||
The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`.
|
||||
We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs not wait for IO.
|
||||
Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO).
|
||||
|
||||
The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request.
|
||||
We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call.
|
||||
(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth).
|
||||
|
||||
## Design & Implementation
|
||||
|
||||
### Prerequisites
|
||||
|
||||
A lot of prerequisite work had to happen to enable use of direct IO.
|
||||
|
||||
To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path:
|
||||
- page_service level server-side batching (config field `page_service_pipelining`)
|
||||
- concurrent IO (config field `get_vectored_concurrent_io`)
|
||||
The work for both of these these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376).
|
||||
Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799).
|
||||
The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`.
|
||||
The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC.
|
||||
|
||||
For the write path, and especially WAL ingest, we need to hide write latency.
|
||||
We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled
|
||||
buffer happen in a sidecar tokio task while new writes fill a new buffer.
|
||||
We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`.
|
||||
The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558).
|
||||
|
||||
### Ensuring Adherence to Alignment Requirements
|
||||
|
||||
Direct IO puts requirements on
|
||||
- memory buffer alignment
|
||||
- io size (=memory buffer size)
|
||||
- file offset alignment
|
||||
|
||||
The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!).
|
||||
|
||||
In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe).
|
||||
Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple.
|
||||
We made this decision because:
|
||||
- a) it is compatible with all the environments we need to run in
|
||||
- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that needs to be read are far apart)
|
||||
- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower).
|
||||
- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO.
|
||||
|
||||
This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD).
|
||||
|
||||
The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements.
|
||||
All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits.
|
||||
Implementors of the marker traits are:
|
||||
- `IoBuffer` / `IoBufferMut`: used for most reads and writes
|
||||
- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!)
|
||||
|
||||
The alignment requirement is infectious; it permeates bottom-up throughout the code base.
|
||||
We stop the infection at roughly the same layers in the code base where we stopped permeating the
|
||||
use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing
|
||||
a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap.
|
||||
The places where we currently stop permeating are sort of arbitrary. For example, it would probably
|
||||
make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s.
|
||||
|
||||
The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors:
|
||||
- non-adherence to file offset alignment requirements
|
||||
- non-adherence to io size requirements
|
||||
|
||||
The following higher-level constructs ensure we meet the requirements:
|
||||
- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples.
|
||||
- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment.
|
||||
|
||||
Note that these types are used always, regardless of whether direct IO is enabled or not.
|
||||
There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512).
|
||||
But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO.
|
||||
|
||||
### Configuration / Feature Flagging
|
||||
|
||||
In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements.
|
||||
To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations.
|
||||
|
||||
We set `O_DIRECT` based on:
|
||||
- the VirtualFile API used to create/open the VirtualFile instance
|
||||
- the `virtual_file_io_mode` configuration flag
|
||||
- the OpenOptions `read` and/or `write` flags.
|
||||
|
||||
The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list.
|
||||
Other APIs never use `O_DIRECT`.
|
||||
(The name is bad and should really be `_maybe_direct_io`.)
|
||||
|
||||
The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path).
|
||||
At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available.
|
||||
|
||||
The `_v2` APIs then check make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags.
|
||||
The result is the following runtime behavior:
|
||||
|
||||
|what|OpenOptions|`v_f_io_mode`<br/>=`buffered`|`v_f_io_mode`<br/>=`direct`|`v_f_io_mode`<br/>=`direct-rw`|
|
||||
|-|-|-|-|-|
|
||||
|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT|
|
||||
|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT|
|
||||
|`InMemoryLayer`|read + write|()|()*|O_DIRECT|
|
||||
|`DeltaLayerWriter`| write | () | () | O_DIRECT |
|
||||
|`ImageLayerWriter`| write | () | () | O_DIRECT |
|
||||
|`download_layer_file`|write |()|()|O_DIRECT|
|
||||
|
||||
The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`.
|
||||
That period was when we implemented and shipped the first version of `BufferedWriter`.
|
||||
We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`.
|
||||
The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later,
|
||||
in https://github.com/neondatabase/neon/pull/11558.
|
||||
|
||||
Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction.
|
||||
For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set.
|
||||
|
||||
## Correctness Validation
|
||||
|
||||
The correctness risks with this project were:
|
||||
- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation.
|
||||
These types expose an API that is largely identical to that of the `bytes` crate and/or Vec.
|
||||
- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path.
|
||||
|
||||
We sadly do not have infrastructure to run pageserver under `cargo miri`.
|
||||
So for memory safety issues, we relied on careful peer review.
|
||||
|
||||
We do assert the production-like alignment requirements in testing builds.
|
||||
However, these asserts were added retroactively.
|
||||
The actual validation before rollout happened in staging and pre-prod.
|
||||
We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite.
|
||||
I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements.
|
||||
Evidently developer testing was good enough.
|
||||
|
||||
## Performance Validation
|
||||
|
||||
The read path went through a lot of iterations of benchmarking in staging and pre-prod.
|
||||
The benchmarks in those environments demonstrated performance regressions early in the implementation.
|
||||
It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions.
|
||||
|
||||
The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns.
|
||||
|
||||
## Future Work
|
||||
|
||||
There is minor and major follow-up work that can be considered in the future.
|
||||
Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list.
|
||||
|
||||
Read Path:
|
||||
- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally.
|
||||
Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size
|
||||
and potentially also use that to drive placement decisions of shards from StorageController
|
||||
https://github.com/neondatabase/neon/issues/9288
|
||||
- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache.
|
||||
But even then, an estimation of the working set would be helpful to figure out caching strategy.
|
||||
|
||||
Write Path:
|
||||
- BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129
|
||||
- ... unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101
|
||||
- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692
|
||||
- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676
|
||||
|
||||
Both:
|
||||
- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster.
|
||||
This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts.
|
||||
However, padding latencies at microsecond scale is non-trivial.
|
||||
|
||||
Misc:
|
||||
- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write.
|
||||
Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use
|
||||
APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string`
|
||||
are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809
|
||||
|
||||
# Appendix
|
||||
|
||||
## Why Kernel Page Cache Is Ineffective At Tenant High Density
|
||||
|
||||
In the Motivation section, we stated:
|
||||
|
||||
> - **The kernel page cache ineffective** at high tenant density anyways (#tenants/pageserver instance).
|
||||
|
||||
The reason is that the Pageserver workload sent from Computes is whatever is a Compute cache(s) miss.
|
||||
That's either sequential scans or random reads.
|
||||
A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available.
|
||||
It is complete waste to have the kernel page cache cache data blocks in this case.
|
||||
Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space.
|
||||
In such cases, the WAL records of those updates likely sit on the same delta layer block.
|
||||
When Compute does a sequential scan, it sends a series of single-page requests for these individual pages.
|
||||
When Pageserver processes the second request in such a series, it goes to the same delta layer block and have a kernel page cache hit.
|
||||
This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching.
|
||||
We can either add a small per-connection LRU cache for such delta layer blocks.
|
||||
Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice.
|
||||
This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32).
|
||||
|
||||
There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these
|
||||
1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation)
|
||||
2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching).
|
||||
251
docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
Normal file
251
docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# Concurrent IO for Pageserver Read Path
|
||||
|
||||
Date: May 6, 2025
|
||||
|
||||
## Summary
|
||||
|
||||
This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025.
|
||||
|
||||
The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files
|
||||
_as it traverses the layer map_ and only _wait_ once, for all of them, after traversal is complete.
|
||||
|
||||
Assuming a good PS PageCache hits on the index blocks during traversal, this drives down the "wait-for-disk" time
|
||||
contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`.
|
||||
|
||||
The motivation for why this work had to happen when it happened was the switch of Pageserver to
|
||||
- not cache user data blocks in PS PageCache and
|
||||
- switch to use direct IO.
|
||||
More context on this are given in complimentary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`.
|
||||
|
||||
### Refs
|
||||
|
||||
- Epic: https://github.com/neondatabase/neon/issues/9378
|
||||
- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002
|
||||
- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378
|
||||
|
||||
Design and implementation by:
|
||||
- Vlad Lazar <vlad@neon.tech>
|
||||
- Christian Schwarz <christian@neon.tech>
|
||||
|
||||
## Background & Motivation
|
||||
|
||||
The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps:
|
||||
- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`).
|
||||
- Pass these values to walredo to reconstruct the page images.
|
||||
|
||||
The read path used to be single-key but has been made multi-key some time ago.
|
||||
([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link))
|
||||
However, for simplicity, most of this doc will explain things in terms of a single key being requested.
|
||||
|
||||
The `Value` retrieval step above can be broken down into the following functions:
|
||||
- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction.
|
||||
- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk.
|
||||
The main job here is to coalesce the small value reads into larger filesystem-level read operations.
|
||||
This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.)
|
||||
Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done.
|
||||
- **Perform the read IO** using `tokio-epoll-uring`.
|
||||
|
||||
Before this project, above functions were sequentially interleaved, meaning:
|
||||
1. we would advance traversal, ...
|
||||
2. discover, that we need to read a value, ...
|
||||
3. read it from disk using `tokio-epoll-uring`, ...
|
||||
4. goto 1 unless we're done.
|
||||
|
||||
This meant that if N `Value`s need to be read to reconstruct a page,
|
||||
the time we spend waiting for disk will be we `random_read_io_latency * O(number_of_values)`.
|
||||
|
||||
## Design
|
||||
|
||||
The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before.
|
||||
But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution.
|
||||
After the last read from the last layer is submitted, we wait for the IOs to complete.
|
||||
|
||||
Assuming the filesystem / disk is able to actually process the submitted IOs without queuing,
|
||||
we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`.
|
||||
|
||||
Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe.
|
||||
Traversal will stall on on-demand layer download if a layer is not yet resident.
|
||||
It cannot proceed without the layer being resident beccause its next step depends on the contents of the layer index.
|
||||
|
||||
### Avoiding Waiting For IO During Traversal
|
||||
|
||||
The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized.
|
||||
|
||||
Before this project, traversal needed to perform IOs for the following:
|
||||
1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks.
|
||||
2. When visiting a delta layer, reading the data block that contains a `Value` for a requested key,
|
||||
to determine whether the `Value::will_init` the page and therefore traversal can stop for this key.
|
||||
|
||||
The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%.
|
||||
(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.)
|
||||
|
||||
The solution for (2) is source `will_init` from the disk btree index keys, which fortunately
|
||||
already encode this bit of information since the introduction of the current storage/layer format.
|
||||
|
||||
### Concurrent IOs, Submission & Completion
|
||||
|
||||
To separate IO submission from waiting for its completion,
|
||||
we introduce the notion of an `IoConcurrency` struct through which IOs are issued.
|
||||
|
||||
An IO is an opaque future that
|
||||
- captures the `tx` side of a `oneshot` channel
|
||||
- performs the read IO by calling `VirtualFile::read_exact_at().await`
|
||||
- sending the result into the `tx`
|
||||
|
||||
Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct.
|
||||
|
||||
The traversal code that submits the IO stores the the corresponding `oneshot::Receiver`
|
||||
in the `VectoredValueReconstructState`, in the the place where we previously stored
|
||||
the sequentially read `img` and `records` fields.
|
||||
|
||||
When we're done with traversal, we wait for all submitted IOs:
|
||||
for each key, there is a future that awaits all the `oneshot::Receiver`s
|
||||
for that key, and then calls into walredo to reconstruct the page image.
|
||||
Walredo is now invoked concurrently for each value instead of sequentially.
|
||||
Walredo itself remains unchanged.
|
||||
|
||||
The spawned IO futures are driven to completion by a sidecar tokio task that
|
||||
is separate from the task that performs all the layer visiting and spawning of IOs.
|
||||
That tasks receives the IO futures via an unbounded mpsc channel and
|
||||
drives them to completion inside a `FuturedUnordered`.
|
||||
|
||||
### Error handling, Panics, Cancellation-Safety
|
||||
|
||||
There are two error classes during reconstruct data retrieval:
|
||||
* traversal errors: index lookup, move to next layer, and the like
|
||||
* value read IO errors
|
||||
|
||||
A traversal error fails the entire `get_vectored` request, as before this PR.
|
||||
A value read error only fails reconstruction of that value.
|
||||
|
||||
Panics and dropping of the `get_vectored` future before it completes
|
||||
leaves the sidecar task running and does not cancel submitted IOs
|
||||
(see next section for details on sidecar task lifecycle).
|
||||
All of this is safe, but, today's preference in the team is to close out
|
||||
all resource usage explicitly if possible, rather than cancelling + forgetting
|
||||
about it on drop. So, there is warning if we drop a
|
||||
`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs.
|
||||
|
||||
### Sidecar Task Lifecycle
|
||||
|
||||
The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct.
|
||||
The `IoConcurrency` object acts as a handle through which IO futures are submitted.
|
||||
|
||||
The spawned tokio task holds the `Timeline::gate` open.
|
||||
It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped.
|
||||
|
||||
Once the `IoConcurrency` struct is dropped, no new IO futures can come in
|
||||
but already submitted IO futures will be driven to completion regardless.
|
||||
We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe.
|
||||
But the underlying kernel and hardware resources are not magically freed up by that.
|
||||
So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete.
|
||||
Under normal conditions, this should be in the low hundreds of microseconds.
|
||||
|
||||
It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of
|
||||
tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack.
|
||||
The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to
|
||||
the (short-lived) functions/scope where we issue the IOs.
|
||||
We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)).
|
||||
For now, we just add another argument to the relevant code paths.
|
||||
|
||||
### Feature Gating
|
||||
|
||||
The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`.
|
||||
|
||||
The behavior from before this project is available through `IoConcurrency::Sequential`,
|
||||
which awaits the IO futures in place, without "spawning" or "submitting" them anywhere.
|
||||
|
||||
The `get_vectored_concurrent_io` pageserver config variable determines the runtime value,
|
||||
**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object.
|
||||
|
||||
### Alternatives Explored & Caveats Encountered
|
||||
|
||||
A few words on the rationale behind having a sidecar *task* and what
|
||||
alternatives were considered but abandoned.
|
||||
|
||||
#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work
|
||||
|
||||
We explored to not have a sidecar task, and instead have a `FuturesUnordered` per
|
||||
`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the
|
||||
first time after traversal is complete (i.e., at `collect_pending_ios`).
|
||||
|
||||
The obvious disadvantage, but not showstopper, is that we wouldn't be submitting
|
||||
IOs until traversal is complete.
|
||||
|
||||
The showstopper however, is that deadlocks happen if we don't drive the
|
||||
IO futures to completion independently of the traversal task.
|
||||
The reason is that both the IO futures and the traversal task may hold _some_,
|
||||
_and_ try to acquire _more_, shared limited resources.
|
||||
For example, both the travseral task and IO future may try to acquire
|
||||
* a `VirtualFile` file descriptor cache slot async mutex (observed during impl)
|
||||
* a `tokio-epoll-uring` submission slot (observed during impl)
|
||||
* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future)
|
||||
|
||||
#### Why We Don't Do `tokio::task`-per-IO-future
|
||||
|
||||
Another option is to spawn a short-lived `tokio::task` for each IO future.
|
||||
We implemented and benchmarked it during development, but found little
|
||||
throughput improvement and moderate mean & tail latency degradation.
|
||||
Concerns about pressure on the tokio scheduler led us to abandon this variant.
|
||||
|
||||
## Future Work
|
||||
|
||||
In addition to what is listed here, also check the "Punted" list in the epic:
|
||||
https://github.com/neondatabase/neon/issues/9378
|
||||
|
||||
### Enable `Timeline::get`
|
||||
|
||||
The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`.
|
||||
The impact is that roughly the following parts of pageserver do not benefit yet:
|
||||
- parts of basebackup
|
||||
- reads performed by the ingest path
|
||||
- most internal operations that read metadata keys (e.g. `collect_keyspace`!)
|
||||
|
||||
The solution is to propagate `IoConcurrency` via `RequestContext`:https://github.com/neondatabase/neon/issues/10460
|
||||
|
||||
The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext).
|
||||
|
||||
Also, propagation via `RequestContext` makes makes it harder to tell during development whether a given
|
||||
piece of code uses concurrent vs sequential mode: one has to recurisvely walk up the call tree to find the
|
||||
place that puts the `IoConcurrency` into the `RequestContext`.
|
||||
We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some
|
||||
observability to weed out places that fail to enrich with a properly spanwed `IoConcurrency::spawn_from_conf`.
|
||||
|
||||
### Concurrent On-Demand Downloads enabled by Detached Indices
|
||||
|
||||
As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index.
|
||||
Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695)
|
||||
we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example:
|
||||
- Move the `Layer::get_or_maybe_download().await` inside the IO futures.
|
||||
This goes in the opposite direction of the next "future work" item below, but it's easy to do.
|
||||
- Serve the IO future directly from object storage and dispatch the layer download
|
||||
to some other actor, e.g., an actor that is responsible for both downloads & eviction.
|
||||
|
||||
### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion
|
||||
|
||||
Instead of `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API
|
||||
that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission,
|
||||
and then wait for completion.
|
||||
|
||||
The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`.
|
||||
|
||||
A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full).
|
||||
While avoiding spending of CPU cycles on processing of completions while we're still traversing.
|
||||
|
||||
The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing.
|
||||
So, the submission part of the split API needs to process completions if squeue is full.
|
||||
|
||||
In any way, this split API is precondition for the bigger issue with the design presented here,
|
||||
which we dicsuss in the next section.
|
||||
|
||||
### Opaque Futures Are Brittle
|
||||
|
||||
The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating.
|
||||
However, we take on **brittleness** because callers must guarantee that the submitted futures are independent.
|
||||
By our experience, it is non-trivial to identify or rule out the interdependencies.
|
||||
See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details.
|
||||
|
||||
The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer")
|
||||
and get back a means to wait for completion.
|
||||
The subsystem can thereby reason by its own how operations may be related;
|
||||
unlike today, where the submitted opaque future can do just about anything.
|
||||
@@ -343,7 +343,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
|
||||
const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
|
||||
fn token() -> String {
|
||||
let claims = endpoint_storage::Claims {
|
||||
let claims = endpoint_storage::claims::EndpointStorageClaims {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: TIMELINE_ID,
|
||||
endpoint_id: ENDPOINT_ID.into(),
|
||||
@@ -489,16 +489,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
}
|
||||
|
||||
fn delete_prefix_token(uri: &str) -> String {
|
||||
use serde::Serialize;
|
||||
let parts = uri.split("/").collect::<Vec<&str>>();
|
||||
#[derive(Serialize)]
|
||||
struct PrefixClaims {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
endpoint_id: Option<endpoint_storage::EndpointId>,
|
||||
exp: u64,
|
||||
}
|
||||
let claims = PrefixClaims {
|
||||
let claims = endpoint_storage::claims::DeletePrefixClaims {
|
||||
tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
|
||||
timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
|
||||
endpoint_id: parts.get(3).map(ToString::to_string),
|
||||
|
||||
52
endpoint_storage/src/claims.rs
Normal file
52
endpoint_storage/src/claims.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt::Display;
|
||||
use utils::id::{EndpointId, TenantId, TimelineId};
|
||||
|
||||
/// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct EndpointStorageClaims {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub endpoint_id: EndpointId,
|
||||
pub exp: u64,
|
||||
}
|
||||
|
||||
/// Claims to remove tenant, timeline, or endpoint data. Used by control plane
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct DeletePrefixClaims {
|
||||
pub tenant_id: TenantId,
|
||||
/// None when tenant is deleted (endpoint_id is also None in this case)
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
/// None when timeline is deleted
|
||||
pub endpoint_id: Option<EndpointId>,
|
||||
pub exp: u64,
|
||||
}
|
||||
|
||||
impl Display for EndpointStorageClaims {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})",
|
||||
self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for DeletePrefixClaims {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})",
|
||||
self.tenant_id,
|
||||
self.timeline_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string()),
|
||||
self.endpoint_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string()),
|
||||
self.exp
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
pub mod claims;
|
||||
use crate::claims::{DeletePrefixClaims, EndpointStorageClaims};
|
||||
use anyhow::Result;
|
||||
use axum::extract::{FromRequestParts, Path};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
@@ -13,7 +15,7 @@ use std::result::Result as StdResult;
|
||||
use std::sync::Arc;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::{EndpointId, TenantId, TimelineId};
|
||||
|
||||
// simplified version of utils::auth::JwtAuth
|
||||
pub struct JwtAuth {
|
||||
@@ -79,26 +81,6 @@ pub struct Storage {
|
||||
pub max_upload_file_limit: usize,
|
||||
}
|
||||
|
||||
pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc
|
||||
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct Claims {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub endpoint_id: EndpointId,
|
||||
pub exp: u64,
|
||||
}
|
||||
|
||||
impl Display for Claims {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
|
||||
self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct KeyRequest {
|
||||
tenant_id: TenantId,
|
||||
@@ -107,6 +89,13 @@ struct KeyRequest {
|
||||
path: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
struct PrefixKeyRequest {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
endpoint_id: Option<EndpointId>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct S3Path {
|
||||
pub path: RemotePath,
|
||||
@@ -165,7 +154,7 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
|
||||
.extract::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid token"))?;
|
||||
let claims: Claims = state
|
||||
let claims: EndpointStorageClaims = state
|
||||
.auth
|
||||
.decode(bearer.token())
|
||||
.map_err(|e| bad_request(e, "decoding token"))?;
|
||||
@@ -178,7 +167,7 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
|
||||
path.endpoint_id.clone()
|
||||
};
|
||||
|
||||
let route = Claims {
|
||||
let route = EndpointStorageClaims {
|
||||
tenant_id: path.tenant_id,
|
||||
timeline_id: path.timeline_id,
|
||||
endpoint_id,
|
||||
@@ -193,38 +182,13 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, PartialEq)]
|
||||
pub struct PrefixKeyPath {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub endpoint_id: Option<EndpointId>,
|
||||
}
|
||||
|
||||
impl Display for PrefixKeyPath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
|
||||
self.tenant_id,
|
||||
self.timeline_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string()),
|
||||
self.endpoint_id
|
||||
.as_ref()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or("".to_string())
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PrefixS3Path {
|
||||
pub path: RemotePath,
|
||||
}
|
||||
|
||||
impl From<&PrefixKeyPath> for PrefixS3Path {
|
||||
fn from(path: &PrefixKeyPath) -> Self {
|
||||
impl From<&DeletePrefixClaims> for PrefixS3Path {
|
||||
fn from(path: &DeletePrefixClaims) -> Self {
|
||||
let timeline_id = path
|
||||
.timeline_id
|
||||
.as_ref()
|
||||
@@ -250,21 +214,27 @@ impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
|
||||
state: &Arc<Storage>,
|
||||
) -> Result<Self, Self::Rejection> {
|
||||
let Path(path) = parts
|
||||
.extract::<Path<PrefixKeyPath>>()
|
||||
.extract::<Path<PrefixKeyRequest>>()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid route"))?;
|
||||
let TypedHeader(Authorization(bearer)) = parts
|
||||
.extract::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
.map_err(|e| bad_request(e, "invalid token"))?;
|
||||
let claims: PrefixKeyPath = state
|
||||
let claims: DeletePrefixClaims = state
|
||||
.auth
|
||||
.decode(bearer.token())
|
||||
.map_err(|e| bad_request(e, "invalid token"))?;
|
||||
if path != claims {
|
||||
return Err(unauthorized(path, claims));
|
||||
let route = DeletePrefixClaims {
|
||||
tenant_id: path.tenant_id,
|
||||
timeline_id: path.timeline_id,
|
||||
endpoint_id: path.endpoint_id,
|
||||
exp: claims.exp,
|
||||
};
|
||||
if route != claims {
|
||||
return Err(unauthorized(route, claims));
|
||||
}
|
||||
Ok((&path).into())
|
||||
Ok((&route).into())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,7 +267,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn s3_path() {
|
||||
let auth = Claims {
|
||||
let auth = EndpointStorageClaims {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: TIMELINE_ID,
|
||||
endpoint_id: ENDPOINT_ID.into(),
|
||||
@@ -327,10 +297,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn prefix_s3_path() {
|
||||
let mut path = PrefixKeyPath {
|
||||
let mut path = DeletePrefixClaims {
|
||||
tenant_id: TENANT_ID,
|
||||
timeline_id: None,
|
||||
endpoint_id: None,
|
||||
exp: 0,
|
||||
};
|
||||
let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,16 +1,58 @@
|
||||
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
|
||||
use std::str::FromStr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::privilege::Privilege;
|
||||
use crate::responses::ComputeCtlConfig;
|
||||
use crate::spec::{ComputeSpec, ExtVersion, PgIdent};
|
||||
|
||||
/// The value to place in the [`ComputeClaims::audience`] claim.
|
||||
pub static COMPUTE_AUDIENCE: &str = "compute";
|
||||
|
||||
/// Available scopes for a compute's JWT.
|
||||
#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeClaimsScope {
|
||||
/// An admin-scoped token allows access to all of `compute_ctl`'s authorized
|
||||
/// facilities.
|
||||
Admin,
|
||||
}
|
||||
|
||||
impl FromStr for ComputeClaimsScope {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"admin" => Ok(ComputeClaimsScope::Admin),
|
||||
_ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// When making requests to the `compute_ctl` external HTTP server, the client
|
||||
/// must specify a set of claims in `Authorization` header JWTs such that
|
||||
/// `compute_ctl` can authorize the request.
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
#[serde(rename = "snake_case")]
|
||||
pub struct ComputeClaims {
|
||||
pub compute_id: String,
|
||||
/// The compute ID that will validate the token. The only case in which this
|
||||
/// can be [`None`] is if [`Self::scope`] is
|
||||
/// [`ComputeClaimsScope::Admin`].
|
||||
pub compute_id: Option<String>,
|
||||
|
||||
/// The scope of what the token authorizes.
|
||||
pub scope: Option<ComputeClaimsScope>,
|
||||
|
||||
/// The recipient the token is intended for.
|
||||
///
|
||||
/// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for
|
||||
/// more information.
|
||||
///
|
||||
/// TODO: Remove the [`Option`] wrapper when control plane learns to send
|
||||
/// the claim.
|
||||
#[serde(rename = "aud")]
|
||||
pub audience: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
/// Request of the /configure API
|
||||
|
||||
@@ -46,6 +46,30 @@ pub struct ExtensionInstallResponse {
|
||||
pub version: ExtVersion,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcPrewarmState {
|
||||
#[default]
|
||||
NotPrewarmed,
|
||||
Prewarming,
|
||||
Completed,
|
||||
Failed {
|
||||
error: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default, Debug, Clone)]
|
||||
#[serde(tag = "status", rename_all = "snake_case")]
|
||||
pub enum LfcOffloadState {
|
||||
#[default]
|
||||
NotOffloaded,
|
||||
Offloading,
|
||||
Completed,
|
||||
Failed {
|
||||
error: String,
|
||||
},
|
||||
}
|
||||
|
||||
/// Response of the /status API
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
|
||||
@@ -172,6 +172,15 @@ pub struct ComputeSpec {
|
||||
/// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
|
||||
/// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
|
||||
pub logs_export_host: Option<String>,
|
||||
|
||||
/// Address of endpoint storage service
|
||||
pub endpoint_storage_addr: Option<String>,
|
||||
/// JWT for authorizing requests to endpoint storage service
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
|
||||
/// If true, download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
#[serde(default)]
|
||||
pub prewarm_lfc_on_startup: bool,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -84,6 +84,11 @@
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "prewarm_lfc_on_startup",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
|
||||
|
||||
@@ -16,6 +16,7 @@ pub struct Collector {
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
|
||||
// SAFETY: libc::sysconf is safe, it merely returns a value.
|
||||
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
|
||||
if long == -1 {
|
||||
panic!("sysconf(_SC_CLK_TCK) failed");
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
tracing.workspace = true
|
||||
|
||||
rand.workspace = true # for tests
|
||||
zerocopy = "0.8"
|
||||
@@ -1,377 +0,0 @@
|
||||
mod lock_and_version;
|
||||
mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ResultOrRestart;
|
||||
use crate::algorithm::node_ptr::{MAX_PREFIX_LEN, NodePtr};
|
||||
use crate::algorithm::node_ref::ChildOrValue;
|
||||
use crate::algorithm::node_ref::{NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Allocator, Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
pub fn new_root<V: Value>(allocator: &Allocator) -> RootPtr<V> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, K: Key, V: Value, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
allocator: &Allocator,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) where
|
||||
F: FnOnce(Option<&V>) -> Option<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
if let Ok(()) = update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
allocator,
|
||||
epoch_pin,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(root: RootPtr<V>, epoch_pin: &'e EpochPin) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0);
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> ResultOrRestart<Option<V>> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(ChildOrValue::Value(vptr)) => {
|
||||
// safety: It's OK to follow the pointer because we checked the version.
|
||||
let v = unsafe { (*vptr).clone() };
|
||||
Ok(Some(v))
|
||||
}
|
||||
Some(ChildOrValue::Child(v)) => lookup_recurse(&key[1..], v, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
pub(crate) fn update_recurse<'e, V: Value, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
allocator: &Allocator,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> ResultOrRestart<()>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> Option<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
if let Some(new_value) = value_fn(None) {
|
||||
insert_split_prefix(
|
||||
key,
|
||||
new_value,
|
||||
&mut wnode,
|
||||
&mut wparent,
|
||||
parent_key,
|
||||
allocator,
|
||||
);
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len as usize..];
|
||||
let level = level + prefix_match_len as usize;
|
||||
|
||||
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
if let Some(new_value) = value_fn(None) {
|
||||
insert_and_grow(key, new_value, &wnode, &mut wparent, parent_key, allocator);
|
||||
wnode.write_unlock_obsolete();
|
||||
wparent.write_unlock();
|
||||
} else {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
if let Some(new_value) = value_fn(None) {
|
||||
insert_to_node(&mut wnode, key, new_value, allocator);
|
||||
}
|
||||
wnode.write_unlock();
|
||||
}
|
||||
return Ok(());
|
||||
} else {
|
||||
let next_node = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
match next_node {
|
||||
ChildOrValue::Value(existing_value_ptr) => {
|
||||
assert!(key.len() == 1);
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value
|
||||
let vmut = unsafe { existing_value_ptr.cast_mut().as_mut() }.unwrap();
|
||||
if let Some(new_value) = value_fn(Some(vmut)) {
|
||||
*vmut = new_value;
|
||||
} else {
|
||||
// TODO: Treat this as deletion?
|
||||
}
|
||||
wnode.write_unlock();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
ChildOrValue::Child(next_child) => {
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
allocator,
|
||||
epoch_pin,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
) -> ResultOrRestart<()> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
for key_byte in 0..u8::MAX {
|
||||
match rnode.find_child_or_value_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(ChildOrValue::Child(child_ref)) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
eprintln!(
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
);
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1)?;
|
||||
}
|
||||
Some(ChildOrValue::Value(val)) => {
|
||||
eprintln!("{} {:?}, {}: {:?}", indent, path, key_byte, unsafe {
|
||||
val.as_ref().unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<'a, V: Value>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
allocator: &Allocator,
|
||||
) {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(&key[common_prefix_len + 1..], value, allocator);
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
let mut prefix_node = node_ref::new_internal(&key[..common_prefix_len], allocator);
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_child(old_prefix[common_prefix_len], old_node.as_ptr());
|
||||
prefix_node.insert_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
}
|
||||
|
||||
fn insert_to_node<V: Value>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &Allocator,
|
||||
) {
|
||||
if wnode.is_leaf() {
|
||||
wnode.insert_value(key[0], value);
|
||||
} else {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, allocator);
|
||||
wnode.insert_child(key[0], value_child);
|
||||
}
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<V: Value>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: &WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
allocator: &Allocator,
|
||||
) {
|
||||
let mut bigger_node = wnode.grow(allocator);
|
||||
|
||||
if wnode.is_leaf() {
|
||||
bigger_node.insert_value(key[0], value);
|
||||
} else {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, allocator);
|
||||
bigger_node.insert_child(key[0], value_child);
|
||||
}
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If key is long, we may need to allocate
|
||||
// new internal nodes to hold it too
|
||||
fn allocate_node_for_value<V: Value>(key: &[u8], value: V, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
|
||||
let mut leaf_node = node_ref::new_leaf(&key[prefix_off..key.len() - 1], allocator);
|
||||
leaf_node.insert_value(*key.last().unwrap(), value);
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
);
|
||||
internal_node.insert_child(*remain_prefix.last().unwrap(), node.into_ptr());
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
node.into_ptr()
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
@@ -1,85 +0,0 @@
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
pub(crate) struct AtomicLockAndVersion {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn new() -> AtomicLockAndVersion {
|
||||
AtomicLockAndVersion {
|
||||
inner: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) type ResultOrRestart<T> = Result<T, ()>;
|
||||
|
||||
const fn restart<T>() -> ResultOrRestart<T> {
|
||||
Err(())
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<u64> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return restart();
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> ResultOrRestart<()> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return restart();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return restart();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while (version & 2) == 2 {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
fn set_locked_bit(version: u64) -> u64 {
|
||||
return version + 2;
|
||||
}
|
||||
|
||||
fn is_obsolete(version: u64) -> bool {
|
||||
return (version & 1) == 1;
|
||||
}
|
||||
@@ -1,983 +0,0 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
|
||||
use super::lock_and_version::AtomicLockAndVersion;
|
||||
|
||||
use crate::Allocator;
|
||||
use crate::Value;
|
||||
|
||||
pub(crate) const MAX_PREFIX_LEN: usize = 8;
|
||||
|
||||
enum NodeTag {
|
||||
Internal4,
|
||||
Internal16,
|
||||
Internal48,
|
||||
Internal256,
|
||||
Leaf4,
|
||||
Leaf16,
|
||||
Leaf48,
|
||||
Leaf256,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeBase {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
}
|
||||
|
||||
pub(crate) struct NodePtr<V> {
|
||||
ptr: *mut NodeBase,
|
||||
|
||||
phantom_value: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<V> std::fmt::Debug for NodePtr<V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "0x{}", self.ptr.addr())
|
||||
}
|
||||
}
|
||||
|
||||
impl<V> Copy for NodePtr<V> {}
|
||||
impl<V> Clone for NodePtr<V> {
|
||||
fn clone(&self) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: self.ptr,
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum NodeVariant<'a, V> {
|
||||
Internal4(&'a NodeInternal4<V>),
|
||||
Internal16(&'a NodeInternal16<V>),
|
||||
Internal48(&'a NodeInternal48<V>),
|
||||
Internal256(&'a NodeInternal256<V>),
|
||||
Leaf4(&'a NodeLeaf4<V>),
|
||||
Leaf16(&'a NodeLeaf16<V>),
|
||||
Leaf48(&'a NodeLeaf48<V>),
|
||||
Leaf256(&'a NodeLeaf256<V>),
|
||||
}
|
||||
|
||||
enum NodeVariantMut<'a, V> {
|
||||
Internal4(&'a mut NodeInternal4<V>),
|
||||
Internal16(&'a mut NodeInternal16<V>),
|
||||
Internal48(&'a mut NodeInternal48<V>),
|
||||
Internal256(&'a mut NodeInternal256<V>),
|
||||
Leaf4(&'a mut NodeLeaf4<V>),
|
||||
Leaf16(&'a mut NodeLeaf16<V>),
|
||||
Leaf48(&'a mut NodeLeaf48<V>),
|
||||
Leaf256(&'a mut NodeLeaf256<V>),
|
||||
}
|
||||
|
||||
pub(crate) enum ChildOrValuePtr<V> {
|
||||
Child(NodePtr<V>),
|
||||
Value(*const V),
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeInternal4<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
num_children: u8,
|
||||
|
||||
child_keys: [u8; 4],
|
||||
child_ptrs: [NodePtr<V>; 4],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeInternal16<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_children: u8,
|
||||
child_keys: [u8; 16],
|
||||
child_ptrs: [NodePtr<V>; 16],
|
||||
}
|
||||
|
||||
const INVALID_CHILD_INDEX: u8 = u8::MAX;
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeInternal48<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_children: u8,
|
||||
child_indexes: [u8; 256],
|
||||
child_ptrs: [NodePtr<V>; 48],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
pub(crate) struct NodeInternal256<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_children: u16,
|
||||
child_ptrs: [NodePtr<V>; 256],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeLeaf4<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_values: u8,
|
||||
child_keys: [u8; 4],
|
||||
child_values: [Option<V>; 4],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeLeaf16<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_values: u8,
|
||||
child_keys: [u8; 16],
|
||||
child_values: [Option<V>; 16],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeLeaf48<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_values: u8,
|
||||
child_indexes: [u8; 256],
|
||||
child_values: [Option<V>; 48],
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
struct NodeLeaf256<V> {
|
||||
tag: NodeTag,
|
||||
lock_and_version: AtomicLockAndVersion,
|
||||
|
||||
prefix: [u8; MAX_PREFIX_LEN],
|
||||
prefix_len: u8,
|
||||
|
||||
num_values: u16,
|
||||
child_values: [Option<V>; 256],
|
||||
}
|
||||
|
||||
impl<V> NodePtr<V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(_) => false,
|
||||
NodeVariant::Internal16(_) => false,
|
||||
NodeVariant::Internal48(_) => false,
|
||||
NodeVariant::Internal256(_) => false,
|
||||
NodeVariant::Leaf4(_) => true,
|
||||
NodeVariant::Leaf16(_) => true,
|
||||
NodeVariant::Leaf48(_) => true,
|
||||
NodeVariant::Leaf256(_) => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => &n.lock_and_version,
|
||||
NodeVariant::Internal16(n) => &n.lock_and_version,
|
||||
NodeVariant::Internal48(n) => &n.lock_and_version,
|
||||
NodeVariant::Internal256(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf4(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf16(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf48(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf256(n) => &n.lock_and_version,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_null(&self) -> bool {
|
||||
self.ptr.is_null()
|
||||
}
|
||||
|
||||
pub(crate) const fn null() -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: std::ptr::null_mut(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
fn variant(&self) -> NodeVariant<V> {
|
||||
unsafe {
|
||||
match (*self.ptr).tag {
|
||||
NodeTag::Internal4 => NodeVariant::Internal4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Internal16 => NodeVariant::Internal16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Internal48 => NodeVariant::Internal48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Internal256 => NodeVariant::Internal256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf4 => NodeVariant::Leaf4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf16 => NodeVariant::Leaf16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf48 => NodeVariant::Leaf48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf256 => NodeVariant::Leaf256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_ref(),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn variant_mut(&mut self) -> NodeVariantMut<V> {
|
||||
unsafe {
|
||||
match (*self.ptr).tag {
|
||||
NodeTag::Internal4 => NodeVariantMut::Internal4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Internal16 => NodeVariantMut::Internal16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Internal48 => NodeVariantMut::Internal48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Internal256 => NodeVariantMut::Internal256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf4 => NodeVariantMut::Leaf4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf16 => NodeVariantMut::Leaf16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf48 => NodeVariantMut::Leaf48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf256 => NodeVariantMut::Leaf256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_mut(),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodePtr<V> {
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
let node_prefix = self.get_prefix();
|
||||
assert!(node_prefix.len() <= key.len()); // because we only use fixed-size keys
|
||||
if &key[0..node_prefix.len()] != node_prefix {
|
||||
None
|
||||
} else {
|
||||
Some(node_prefix.len())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.get_prefix(),
|
||||
NodeVariant::Internal16(n) => n.get_prefix(),
|
||||
NodeVariant::Internal48(n) => n.get_prefix(),
|
||||
NodeVariant::Internal256(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf4(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf16(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf48(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf256(n) => n.get_prefix(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.is_full(),
|
||||
NodeVariant::Internal16(n) => n.is_full(),
|
||||
NodeVariant::Internal48(n) => n.is_full(),
|
||||
NodeVariant::Internal256(n) => n.is_full(),
|
||||
NodeVariant::Leaf4(n) => n.is_full(),
|
||||
NodeVariant::Leaf16(n) => n.is_full(),
|
||||
NodeVariant::Leaf48(n) => n.is_full(),
|
||||
NodeVariant::Leaf256(n) => n.is_full(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_value(&self, key_byte: u8) -> Option<ChildOrValuePtr<V>> {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
|
||||
NodeVariant::Internal16(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
|
||||
NodeVariant::Internal48(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
|
||||
NodeVariant::Internal256(n) => {
|
||||
n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c))
|
||||
}
|
||||
NodeVariant::Leaf4(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
NodeVariant::Leaf16(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
NodeVariant::Leaf48(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
NodeVariant::Leaf256(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Internal16(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Internal48(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Internal256(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf4(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf16(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf48(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf256(n) => n.truncate_prefix(new_prefix_len),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.grow(allocator),
|
||||
NodeVariant::Internal16(n) => n.grow(allocator),
|
||||
NodeVariant::Internal48(n) => n.grow(allocator),
|
||||
NodeVariant::Internal256(_) => panic!("cannot grow Internal256 node"),
|
||||
NodeVariant::Leaf4(n) => n.grow(allocator),
|
||||
NodeVariant::Leaf16(n) => n.grow(allocator),
|
||||
NodeVariant::Leaf48(n) => n.grow(allocator),
|
||||
NodeVariant::Leaf256(_) => panic!("cannot grow Leaf256 node"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Internal16(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Internal48(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Internal256(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Leaf4(_)
|
||||
| NodeVariantMut::Leaf16(_)
|
||||
| NodeVariantMut::Leaf48(_)
|
||||
| NodeVariantMut::Leaf256(_) => panic!("insert_child called on leaf node"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Internal16(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Internal48(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Internal256(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Leaf4(_)
|
||||
| NodeVariantMut::Leaf16(_)
|
||||
| NodeVariantMut::Leaf48(_)
|
||||
| NodeVariantMut::Leaf256(_) => panic!("replace_child called on leaf node"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(_)
|
||||
| NodeVariantMut::Internal16(_)
|
||||
| NodeVariantMut::Internal48(_)
|
||||
| NodeVariantMut::Internal256(_) => panic!("insert_value called on internal node"),
|
||||
NodeVariantMut::Leaf4(n) => n.insert_value(key_byte, value),
|
||||
NodeVariantMut::Leaf16(n) => n.insert_value(key_byte, value),
|
||||
NodeVariantMut::Leaf48(n) => n.insert_value(key_byte, value),
|
||||
NodeVariantMut::Leaf256(n) => n.insert_value(key_byte, value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(allocator: &Allocator) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: allocator.alloc(NodeInternal256::<V>::new()).as_ptr().cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node = allocator.alloc(NodeInternal4 {
|
||||
tag: NodeTag::Internal4,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: [8; MAX_PREFIX_LEN],
|
||||
prefix_len: prefix.len() as u8,
|
||||
num_children: 0,
|
||||
|
||||
child_keys: [0; 4],
|
||||
child_ptrs: [const { NodePtr::null() }; 4],
|
||||
});
|
||||
node.prefix[0..prefix.len()].copy_from_slice(prefix);
|
||||
|
||||
node.as_ptr().into()
|
||||
}
|
||||
|
||||
pub fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node = allocator.alloc(NodeLeaf4 {
|
||||
tag: NodeTag::Leaf4,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: [8; MAX_PREFIX_LEN],
|
||||
prefix_len: prefix.len() as u8,
|
||||
num_values: 0,
|
||||
|
||||
child_keys: [0; 4],
|
||||
child_values: [const { None }; 4],
|
||||
});
|
||||
node.prefix[0..prefix.len()].copy_from_slice(prefix);
|
||||
|
||||
node.as_ptr().into()
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal4<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key: u8) -> Option<NodePtr<V>> {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key {
|
||||
return Some(self.child_ptrs[i]);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key_byte {
|
||||
self.child_ptrs[i] = replacement;
|
||||
return;
|
||||
}
|
||||
}
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 4
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 4);
|
||||
|
||||
let idx = self.num_children as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_ptrs[idx] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node16 = allocator.alloc(NodeInternal16 {
|
||||
tag: NodeTag::Internal16,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_children: self.num_children,
|
||||
|
||||
child_keys: [0; 16],
|
||||
child_ptrs: [const { NodePtr::null() }; 16],
|
||||
});
|
||||
for i in 0..self.num_children as usize {
|
||||
node16.child_keys[i] = self.child_keys[i];
|
||||
node16.child_ptrs[i] = self.child_ptrs[i];
|
||||
}
|
||||
|
||||
node16.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal16<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key_byte {
|
||||
return Some(self.child_ptrs[i]);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key_byte {
|
||||
self.child_ptrs[i] = replacement;
|
||||
return;
|
||||
}
|
||||
}
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 16
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 16);
|
||||
|
||||
let idx = self.num_children as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_ptrs[idx] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node48 = allocator.alloc(NodeInternal48 {
|
||||
tag: NodeTag::Internal48,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_children: self.num_children,
|
||||
|
||||
child_indexes: [INVALID_CHILD_INDEX; 256],
|
||||
child_ptrs: [const { NodePtr::null() }; 48],
|
||||
});
|
||||
for i in 0..self.num_children as usize {
|
||||
let idx = self.child_keys[i] as usize;
|
||||
node48.child_indexes[idx] = i as u8;
|
||||
node48.child_ptrs[i] = self.child_ptrs[i];
|
||||
}
|
||||
|
||||
node48.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal48<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
|
||||
let idx = self.child_indexes[key_byte as usize];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
Some(self.child_ptrs[idx as usize])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
let idx = self.child_indexes[key_byte as usize];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
self.child_ptrs[idx as usize] = replacement
|
||||
} else {
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 48
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 48);
|
||||
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
|
||||
let idx = self.num_children;
|
||||
self.child_indexes[key_byte as usize] = idx;
|
||||
self.child_ptrs[idx as usize] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node256 = allocator.alloc(NodeInternal256 {
|
||||
tag: NodeTag::Internal256,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_children: self.num_children as u16,
|
||||
|
||||
child_ptrs: [const { NodePtr::null() }; 256],
|
||||
});
|
||||
for i in 0..256 {
|
||||
let idx = self.child_indexes[i];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
node256.child_ptrs[i] = self.child_ptrs[idx as usize];
|
||||
}
|
||||
}
|
||||
node256.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal256<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
|
||||
let idx = key_byte as usize;
|
||||
if !self.child_ptrs[idx].is_null() {
|
||||
Some(self.child_ptrs[idx])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
let idx = key_byte as usize;
|
||||
if !self.child_ptrs[idx].is_null() {
|
||||
self.child_ptrs[idx] = replacement
|
||||
} else {
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 256
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 256);
|
||||
assert!(self.child_ptrs[key_byte as usize].is_null());
|
||||
self.child_ptrs[key_byte as usize] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf4<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value<'a: 'b, 'b>(&'a self, key: u8) -> Option<&'b V> {
|
||||
for i in 0..self.num_values {
|
||||
if self.child_keys[i as usize] == key {
|
||||
assert!(self.child_values[i as usize].is_some());
|
||||
return self.child_values[i as usize].as_ref();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 4
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 16);
|
||||
|
||||
let idx = self.num_values as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_values[idx] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node16 = allocator.alloc(NodeLeaf16 {
|
||||
tag: NodeTag::Leaf16,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_values: self.num_values,
|
||||
|
||||
child_keys: [0; 16],
|
||||
child_values: [const { None }; 16],
|
||||
});
|
||||
for i in 0..self.num_values as usize {
|
||||
node16.child_keys[i] = self.child_keys[i];
|
||||
node16.child_values[i] = self.child_values[i].clone();
|
||||
}
|
||||
node16.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf16<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value(&self, key: u8) -> Option<&V> {
|
||||
for i in 0..self.num_values {
|
||||
if self.child_keys[i as usize] == key {
|
||||
assert!(self.child_values[i as usize].is_some());
|
||||
return self.child_values[i as usize].as_ref();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 16
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 16);
|
||||
|
||||
let idx = self.num_values as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_values[idx] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node48 = allocator.alloc(NodeLeaf48 {
|
||||
tag: NodeTag::Leaf48,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_values: self.num_values,
|
||||
|
||||
child_indexes: [INVALID_CHILD_INDEX; 256],
|
||||
child_values: [const { None }; 48],
|
||||
});
|
||||
for i in 0..self.num_values {
|
||||
let idx = self.child_keys[i as usize];
|
||||
node48.child_indexes[idx as usize] = i;
|
||||
node48.child_values[i as usize] = self.child_values[i as usize].clone();
|
||||
}
|
||||
node48.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf48<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value(&self, key: u8) -> Option<&V> {
|
||||
let idx = self.child_indexes[key as usize];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
assert!(self.child_values[idx as usize].is_some());
|
||||
self.child_values[idx as usize].as_ref()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 48
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 48);
|
||||
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
|
||||
let idx = self.num_values;
|
||||
self.child_indexes[key_byte as usize] = idx;
|
||||
self.child_values[idx as usize] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node256 = allocator.alloc(NodeLeaf256 {
|
||||
tag: NodeTag::Leaf256,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_values: self.num_values as u16,
|
||||
|
||||
child_values: [const { None }; 256],
|
||||
});
|
||||
for i in 0..256 {
|
||||
let idx = self.child_indexes[i];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
node256.child_values[i] = self.child_values[idx as usize].clone();
|
||||
}
|
||||
}
|
||||
node256.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf256<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value(&self, key: u8) -> Option<&V> {
|
||||
let idx = key as usize;
|
||||
self.child_values[idx].as_ref()
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 256
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 256);
|
||||
assert!(self.child_values[key_byte as usize].is_none());
|
||||
self.child_values[key_byte as usize] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal256<V> {
|
||||
pub(crate) fn new() -> NodeInternal256<V> {
|
||||
NodeInternal256 {
|
||||
tag: NodeTag::Internal256,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: [0; MAX_PREFIX_LEN],
|
||||
prefix_len: 0,
|
||||
num_children: 0,
|
||||
|
||||
child_ptrs: [const { NodePtr::null() }; 256],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeInternal4<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal4<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<V: Value> From<*mut NodeInternal16<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal16<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeInternal48<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal48<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeInternal256<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal256<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeLeaf4<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf4<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<V: Value> From<*mut NodeLeaf16<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf16<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeLeaf48<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf48<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeLeaf256<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf256<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,202 +0,0 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::lock_and_version::ResultOrRestart;
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::ChildOrValuePtr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::{Allocator, Value};
|
||||
|
||||
pub struct NodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin>,
|
||||
}
|
||||
|
||||
impl<'e, V> Debug for NodeRef<'e, V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V: Value> NodeRef<'e, V> {
|
||||
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
|
||||
NodeRef {
|
||||
ptr: root_ptr,
|
||||
phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<ReadLockedNodeRef<'e, V>> {
|
||||
let version = self.lockword().read_lock_or_restart()?;
|
||||
Ok(ReadLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
version,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
self.ptr.lockword()
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct ReadLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
version: u64,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin>,
|
||||
}
|
||||
|
||||
pub(crate) enum ChildOrValue<'e, V> {
|
||||
Child(NodeRef<'e, V>),
|
||||
Value(*const V),
|
||||
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
self.ptr.is_full()
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
/// Note: because we're only holding a read lock, the prefix can change concurrently.
|
||||
/// You must be prepared to restart, if read_unlock() returns error later.
|
||||
///
|
||||
/// Returns the length of the prefix, or None if it's not a match
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
self.ptr.prefix_matches(key)
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_value_or_restart(
|
||||
&self,
|
||||
key_byte: u8,
|
||||
) -> ResultOrRestart<Option<ChildOrValue<'e, V>>> {
|
||||
let child_or_value = self.ptr.find_child_or_value(key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some(ChildOrValuePtr::Value(vptr)) => Ok(Some(ChildOrValue::Value(vptr))),
|
||||
Some(ChildOrValuePtr::Child(child_ptr)) => Ok(Some(ChildOrValue::Child(NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
}))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
self,
|
||||
) -> ResultOrRestart<WriteLockedNodeRef<'e, V>> {
|
||||
self.ptr
|
||||
.lockword()
|
||||
.upgrade_to_write_lock_or_restart(self.version)?;
|
||||
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(self) -> ResultOrRestart<()> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct WriteLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
phantom: PhantomData<&'e EpochPin>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
self.ptr.is_leaf()
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(mut self) {
|
||||
self.ptr.lockword().write_unlock();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(mut self) {
|
||||
self.ptr.lockword().write_unlock_obsolete();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
self.ptr.truncate_prefix(new_prefix_len)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
self.ptr.insert_value(key_byte, value)
|
||||
}
|
||||
|
||||
pub(crate) fn grow(&self, allocator: &Allocator) -> NewNodeRef<V> {
|
||||
let new_node = self.ptr.grow(allocator);
|
||||
NewNodeRef { ptr: new_node }
|
||||
}
|
||||
|
||||
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
self.ptr.replace_child(key_byte, replacement);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.lockword().write_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct NewNodeRef<V> {
|
||||
ptr: NodePtr<V>,
|
||||
}
|
||||
|
||||
impl<V: Value> NewNodeRef<V> {
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
self.ptr.insert_value(key_byte, value)
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
ptr
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
|
||||
NewNodeRef {
|
||||
ptr: node_ptr::new_internal(prefix, allocator),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
|
||||
NewNodeRef {
|
||||
ptr: node_ptr::new_leaf(prefix, allocator),
|
||||
}
|
||||
}
|
||||
@@ -1,107 +0,0 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
pub struct Allocator {
|
||||
area: *mut MaybeUninit<u8>,
|
||||
allocated: AtomicUsize,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
// FIXME: I don't know if these are really safe...
|
||||
unsafe impl Send for Allocator {}
|
||||
unsafe impl Sync for Allocator {}
|
||||
|
||||
#[repr(transparent)]
|
||||
pub struct AllocatedBox<'a, T> {
|
||||
inner: NonNull<T>,
|
||||
|
||||
_phantom: PhantomData<&'a Allocator>,
|
||||
}
|
||||
|
||||
// FIXME: I don't know if these are really safe...
|
||||
unsafe impl<'a, T> Send for AllocatedBox<'a, T> {}
|
||||
unsafe impl<'a, T> Sync for AllocatedBox<'a, T> {}
|
||||
|
||||
impl<T> Deref for AllocatedBox<'_, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
unsafe { self.inner.as_ref() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for AllocatedBox<'_, T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
unsafe { self.inner.as_mut() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> AsMut<T> for AllocatedBox<'_, T> {
|
||||
fn as_mut(&mut self) -> &mut T {
|
||||
unsafe { self.inner.as_mut() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> AllocatedBox<'_, T> {
|
||||
pub fn as_ptr(&self) -> *mut T {
|
||||
self.inner.as_ptr()
|
||||
}
|
||||
}
|
||||
|
||||
const MAXALIGN: usize = std::mem::align_of::<usize>();
|
||||
|
||||
impl Allocator {
|
||||
pub fn new_uninit(area: &'static mut [MaybeUninit<u8>]) -> Allocator {
|
||||
let ptr = area.as_mut_ptr();
|
||||
let size = area.len();
|
||||
Self::new_from_ptr(ptr, size)
|
||||
}
|
||||
|
||||
pub fn new(area: &'static mut [u8]) -> Allocator {
|
||||
let ptr: *mut MaybeUninit<u8> = area.as_mut_ptr().cast();
|
||||
let size = area.len();
|
||||
Self::new_from_ptr(ptr, size)
|
||||
}
|
||||
|
||||
pub fn new_from_ptr(ptr: *mut MaybeUninit<u8>, size: usize) -> Allocator {
|
||||
let padding = ptr.align_offset(MAXALIGN);
|
||||
|
||||
Allocator {
|
||||
area: ptr,
|
||||
allocated: AtomicUsize::new(padding),
|
||||
size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn alloc<'a, T: Sized>(&'a self, value: T) -> AllocatedBox<'a, T> {
|
||||
let sz = std::mem::size_of::<T>();
|
||||
|
||||
// pad all allocations to MAXALIGN boundaries
|
||||
assert!(std::mem::align_of::<T>() <= MAXALIGN);
|
||||
let sz = sz.next_multiple_of(MAXALIGN);
|
||||
|
||||
let offset = self.allocated.fetch_add(sz, Ordering::Relaxed);
|
||||
|
||||
if offset + sz > self.size {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let inner = unsafe {
|
||||
let inner = self.area.offset(offset as isize).cast::<T>();
|
||||
*inner = value;
|
||||
NonNull::new_unchecked(inner)
|
||||
};
|
||||
|
||||
AllocatedBox {
|
||||
inner,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn _dealloc_node<T>(&self, _node: AllocatedBox<T>) {
|
||||
// doesn't free it immediately.
|
||||
}
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
//!
|
||||
//! FIXME: not implemented yet. (We haven't implemented removing any nodes from the ART
|
||||
//! tree, which is why we get away without this now)
|
||||
|
||||
pub(crate) struct EpochPin {}
|
||||
|
||||
pub(crate) fn pin_epoch() -> EpochPin {
|
||||
EpochPin {}
|
||||
}
|
||||
|
||||
/*
|
||||
struct CollectorGlobal {
|
||||
epoch: AtomicU64,
|
||||
|
||||
participants: CachePadded<AtomicU64>, // make it an array
|
||||
}
|
||||
|
||||
|
||||
struct CollectorQueue {
|
||||
|
||||
}
|
||||
*/
|
||||
@@ -1,301 +0,0 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Multi-value leaves. The values are stored directly in one of the four different leaf node
|
||||
//! types.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//! create create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//! for use and Optimisic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//! feature, which still nightly-only experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
|
||||
use allocator::AllocatedBox;
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub use allocator::Allocator;
|
||||
|
||||
/// Fixed-length key type.
|
||||
///
|
||||
pub trait Key: Clone + Debug {
|
||||
const KEY_LEN: usize;
|
||||
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
pub trait Value: Clone {}
|
||||
|
||||
struct Tree<K: Key, V: Value> {
|
||||
root: RootPtr<V>,
|
||||
|
||||
writer_attached: AtomicBool,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
/// Struct created at postmaster startup
|
||||
pub struct TreeInitStruct<'t, K: Key, V: Value> {
|
||||
tree: AllocatedBox<'t, Tree<K, V>>,
|
||||
|
||||
allocator: &'t Allocator,
|
||||
}
|
||||
|
||||
/// The worker process has a reference to this. The write operations are only safe
|
||||
/// from the worker process
|
||||
pub struct TreeWriteAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: AllocatedBox<'t, Tree<K, V>>,
|
||||
|
||||
allocator: &'t Allocator,
|
||||
}
|
||||
|
||||
/// The backends have a reference to this. It cannot be used to modify the tree
|
||||
pub struct TreeReadAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: AllocatedBox<'t, Tree<K, V>>,
|
||||
}
|
||||
|
||||
impl<'a, 't: 'a, K: Key, V: Value> TreeInitStruct<'t, K, V> {
|
||||
pub fn new(allocator: &'t Allocator) -> TreeInitStruct<'t, K, V> {
|
||||
let tree = allocator.alloc(Tree {
|
||||
root: algorithm::new_root(allocator),
|
||||
writer_attached: AtomicBool::new(false),
|
||||
phantom_key: PhantomData,
|
||||
});
|
||||
|
||||
TreeInitStruct { tree, allocator }
|
||||
}
|
||||
|
||||
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V> {
|
||||
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
|
||||
if previously_attached {
|
||||
panic!("writer already attached");
|
||||
}
|
||||
TreeWriteAccess {
|
||||
tree: self.tree,
|
||||
allocator: self.allocator,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
|
||||
TreeReadAccess { tree: self.tree }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key + Clone, V: Value> TreeWriteAccess<'t, K, V> {
|
||||
pub fn start_write(&'t self) -> TreeWriteGuard<'t, K, V> {
|
||||
// TODO: grab epoch guard
|
||||
TreeWriteGuard {
|
||||
allocator: self.allocator,
|
||||
tree: &self.tree,
|
||||
epoch_pin: epoch::pin_epoch(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: epoch::pin_epoch(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key + Clone, V: Value> TreeReadAccess<'t, K, V> {
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: epoch::pin_epoch(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeReadGuard<'t, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t AllocatedBox<'t, Tree<K, V>>,
|
||||
|
||||
epoch_pin: EpochPin,
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeReadGuard<'t, K, V> {
|
||||
pub fn get(&self, key: &K) -> Option<V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeWriteGuard<'t, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t AllocatedBox<'t, Tree<K, V>>,
|
||||
allocator: &'t Allocator,
|
||||
|
||||
epoch_pin: EpochPin,
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeWriteGuard<'t, K, V> {
|
||||
pub fn insert(&mut self, key: &K, value: V) {
|
||||
self.update_with_fn(key, |_| Some(value))
|
||||
}
|
||||
|
||||
pub fn update_with_fn<F>(&mut self, key: &K, value_fn: F)
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> Option<V>,
|
||||
{
|
||||
algorithm::update_fn(
|
||||
key,
|
||||
value_fn,
|
||||
self.tree.root,
|
||||
self.allocator,
|
||||
&self.epoch_pin,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get(&mut self, key: &K) -> Option<V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value + Debug> TreeWriteGuard<'t, K, V> {
|
||||
pub fn dump(&mut self) {
|
||||
algorithm::dump_tree(self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
@@ -1,90 +0,0 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use crate::Allocator;
|
||||
use crate::TreeInitStruct;
|
||||
|
||||
use crate::{Key, Value};
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl Key for TestKey {
|
||||
const KEY_LEN: usize = TEST_KEY_LEN;
|
||||
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for usize {}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let area = Box::leak(Box::new_uninit_slice(MEM_SIZE));
|
||||
|
||||
let allocator = Box::leak(Box::new(Allocator::new_uninit(area)));
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, usize>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let mut w = tree_writer.start_write();
|
||||
w.insert(&(*k).into(), idx);
|
||||
eprintln!("INSERTED {:?}", Into::<TestKey>::into(*k));
|
||||
}
|
||||
|
||||
//tree_writer.start_read().dump();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let r = tree_writer.start_read();
|
||||
let value = r.get(&(*k).into());
|
||||
assert_eq!(value, Some(idx));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut thread_rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.get(&key).is_some() {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
@@ -182,6 +182,7 @@ pub struct ConfigToml {
|
||||
pub tracing: Option<Tracing>,
|
||||
pub enable_tls_page_service_api: bool,
|
||||
pub dev_mode: bool,
|
||||
pub timeline_import_config: TimelineImportConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -300,6 +301,12 @@ impl From<OtelExporterProtocol> for tracing_utils::Protocol {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct TimelineImportConfig {
|
||||
pub import_job_concurrency: NonZeroUsize,
|
||||
pub import_job_soft_size_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
pub mod statvfs {
|
||||
pub mod mock {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -659,6 +666,10 @@ impl Default for ConfigToml {
|
||||
tracing: None,
|
||||
enable_tls_page_service_api: false,
|
||||
dev_mode: false,
|
||||
timeline_import_config: TimelineImportConfig {
|
||||
import_job_concurrency: NonZeroUsize::new(128).unwrap(),
|
||||
import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1832,6 +1832,7 @@ pub mod virtual_file {
|
||||
Eq,
|
||||
Hash,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
@@ -1843,10 +1844,8 @@ pub mod virtual_file {
|
||||
/// Uses buffered IO.
|
||||
Buffered,
|
||||
/// Uses direct IO for reads only.
|
||||
#[cfg(target_os = "linux")]
|
||||
Direct,
|
||||
/// Use direct IO for reads and writes.
|
||||
#[cfg(target_os = "linux")]
|
||||
DirectRw,
|
||||
}
|
||||
|
||||
@@ -1854,26 +1853,13 @@ pub mod virtual_file {
|
||||
pub fn preferred() -> Self {
|
||||
// The default behavior when running Rust unit tests without any further
|
||||
// flags is to use the newest behavior (DirectRw).
|
||||
// The CI uses the following environment variable to unit tests for all
|
||||
// different modes.
|
||||
// The CI uses the environment variable to unit tests for all different modes.
|
||||
// NB: the Python regression & perf tests have their own defaults management
|
||||
// that writes pageserver.toml; they do not use this variable.
|
||||
if cfg!(test) {
|
||||
static CACHED: LazyLock<IoMode> = LazyLock::new(|| {
|
||||
utils::env::var_serde_json_string(
|
||||
"NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
|
||||
)
|
||||
.unwrap_or(
|
||||
#[cfg(target_os = "linux")]
|
||||
IoMode::DirectRw,
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
IoMode::Buffered,
|
||||
)
|
||||
});
|
||||
*CACHED
|
||||
} else {
|
||||
IoMode::Buffered
|
||||
}
|
||||
static ENV_OVERRIDE: LazyLock<Option<IoMode>> = LazyLock::new(|| {
|
||||
utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE")
|
||||
});
|
||||
ENV_OVERRIDE.unwrap_or(IoMode::DirectRw)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1883,9 +1869,7 @@ pub mod virtual_file {
|
||||
fn try_from(value: u8) -> Result<Self, Self::Error> {
|
||||
Ok(match value {
|
||||
v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
|
||||
#[cfg(target_os = "linux")]
|
||||
v if v == (IoMode::Direct as u8) => IoMode::Direct,
|
||||
#[cfg(target_os = "linux")]
|
||||
v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw,
|
||||
x => return Err(x),
|
||||
})
|
||||
|
||||
@@ -841,6 +841,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
let expected_end = match &end {
|
||||
ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
|
||||
// The timeline doesn't exist and we have been requested to not auto-create it.
|
||||
// Compute requests for timelines that haven't been created yet
|
||||
// might reach us before the storcon request to create those timelines.
|
||||
TimelineNoCreate => true,
|
||||
CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
|
||||
if is_expected_io_error(io_error) =>
|
||||
{
|
||||
@@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd {
|
||||
Terminate,
|
||||
#[error("EOF on COPY stream")]
|
||||
EOF,
|
||||
#[error("timeline not found, and allow_timeline_creation is false")]
|
||||
TimelineNoCreate,
|
||||
/// The connection was lost
|
||||
#[error("connection error: {0}")]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
|
||||
@@ -299,11 +299,13 @@ pub struct PullTimelineRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub http_hosts: Vec<String>,
|
||||
pub ignore_tombstone: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct PullTimelineResponse {
|
||||
// Donor safekeeper host
|
||||
pub safekeeper_host: String,
|
||||
/// Donor safekeeper host.
|
||||
/// None if no pull happened because the timeline already exists.
|
||||
pub safekeeper_host: Option<String>,
|
||||
// TODO: add more fields?
|
||||
}
|
||||
|
||||
@@ -295,6 +295,9 @@ pub struct TenantId(Id);
|
||||
|
||||
id_newtype!(TenantId);
|
||||
|
||||
/// If needed, reuse small string from proxy/src/types.rc
|
||||
pub type EndpointId = String;
|
||||
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct TenantTimelineId {
|
||||
|
||||
@@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats {
|
||||
}
|
||||
|
||||
impl RateLimit {
|
||||
pub fn new(interval: Duration) -> Self {
|
||||
pub const fn new(interval: Duration) -> Self {
|
||||
Self {
|
||||
last: None,
|
||||
interval,
|
||||
|
||||
@@ -42,14 +42,12 @@ nix.workspace = true
|
||||
num_cpus.workspace = true
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
peekable.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
pprof.workspace = true
|
||||
prost.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -62,7 +60,6 @@ serde_path_to_error.workspace = true
|
||||
serde_with.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tonic.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
@@ -79,7 +76,6 @@ url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_data_api.workspace = true
|
||||
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
|
||||
pageserver_compaction.workspace = true
|
||||
pem.workspace = true
|
||||
|
||||
@@ -14,6 +14,7 @@ use pageserver_api::key::Key;
|
||||
use pageserver_api::models::virtual_file::IoMode;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
];
|
||||
let exploded_parameters = {
|
||||
let mut out = Vec::new();
|
||||
for io_mode in [
|
||||
IoMode::Buffered,
|
||||
#[cfg(target_os = "linux")]
|
||||
IoMode::Direct,
|
||||
#[cfg(target_os = "linux")]
|
||||
IoMode::DirectRw,
|
||||
] {
|
||||
for io_mode in IoMode::iter() {
|
||||
for param in expect.clone() {
|
||||
let HandPickedParameters {
|
||||
volume_mib,
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
[package]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
bytes.workspace = true
|
||||
http.workspace = true
|
||||
thiserror.workspace = true
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
pageserver_data_api.workspace = true
|
||||
@@ -1,221 +0,0 @@
|
||||
//! Pageserver Data API client
|
||||
//!
|
||||
//! - Manage connections to pageserver
|
||||
//! - Send requests to correct shards
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use bytes::Bytes;
|
||||
use http;
|
||||
use thiserror::Error;
|
||||
use tonic;
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
use tonic::transport::Channel;
|
||||
|
||||
use pageserver_data_api::model::*;
|
||||
use pageserver_data_api::proto;
|
||||
|
||||
type Shardno = u16;
|
||||
|
||||
use pageserver_data_api::client::PageServiceClient;
|
||||
|
||||
type MyPageServiceClient = pageserver_data_api::client::PageServiceClient<
|
||||
tonic::service::interceptor::InterceptedService<tonic::transport::Channel, AuthInterceptor>,
|
||||
>;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverClientError {
|
||||
#[error("could not connect to service: {0}")]
|
||||
ConnectError(#[from] tonic::transport::Error),
|
||||
#[error("could not perform request: {0}`")]
|
||||
RequestError(#[from] tonic::Status),
|
||||
|
||||
#[error("could not perform request: {0}`")]
|
||||
InvalidUri(#[from] http::uri::InvalidUri),
|
||||
}
|
||||
|
||||
pub struct PageserverClient {
|
||||
_tenant_id: String,
|
||||
_timeline_id: String,
|
||||
|
||||
_auth_token: Option<String>,
|
||||
|
||||
shard_map: HashMap<Shardno, String>,
|
||||
|
||||
channels: RwLock<HashMap<Shardno, Channel>>,
|
||||
|
||||
auth_interceptor: AuthInterceptor,
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
/// TODO: this doesn't currently react to changes in the shard map.
|
||||
pub fn new(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
auth_token: &Option<String>,
|
||||
shard_map: HashMap<Shardno, String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
_tenant_id: tenant_id.to_string(),
|
||||
_timeline_id: timeline_id.to_string(),
|
||||
_auth_token: auth_token.clone(),
|
||||
shard_map,
|
||||
channels: RwLock::new(HashMap::new()),
|
||||
auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_ref()),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn process_rel_exists_request(
|
||||
&self,
|
||||
request: &RelExistsRequest,
|
||||
) -> Result<bool, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::RelExistsRequest::from(request);
|
||||
let response = client.rel_exists(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.get_ref().exists)
|
||||
}
|
||||
|
||||
pub async fn process_rel_size_request(
|
||||
&self,
|
||||
request: &RelSizeRequest,
|
||||
) -> Result<u32, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::RelSizeRequest::from(request);
|
||||
let response = client.rel_size(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.get_ref().num_blocks)
|
||||
}
|
||||
|
||||
pub async fn get_page(&self, request: &GetPageRequest) -> Result<Bytes, PageserverClientError> {
|
||||
// FIXME: calculate the shard number correctly
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::GetPageRequest::from(request);
|
||||
let response = client.get_page(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.into_inner().page_image)
|
||||
}
|
||||
|
||||
/// Process a request to get the size of a database.
|
||||
pub async fn process_dbsize_request(
|
||||
&self,
|
||||
request: &DbSizeRequest,
|
||||
) -> Result<u64, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::DbSizeRequest::from(request);
|
||||
let response = client.db_size(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.get_ref().num_bytes)
|
||||
}
|
||||
|
||||
/// Process a request to get the size of a database.
|
||||
pub async fn get_base_backup(
|
||||
&self,
|
||||
request: &GetBaseBackupRequest,
|
||||
gzip: bool,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
|
||||
PageserverClientError,
|
||||
> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
if gzip {
|
||||
client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip);
|
||||
}
|
||||
|
||||
let request = proto::GetBaseBackupRequest::from(request);
|
||||
let response = client.get_base_backup(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Get a client for given shard
|
||||
///
|
||||
/// This implements very basic caching. If we already have a client for the given shard,
|
||||
/// reuse it. If not, create a new client and put it to the cache.
|
||||
async fn get_client(
|
||||
&self,
|
||||
shard_no: u16,
|
||||
) -> Result<MyPageServiceClient, PageserverClientError> {
|
||||
let reused_channel: Option<Channel> = {
|
||||
let channels = self.channels.read().unwrap();
|
||||
|
||||
channels.get(&shard_no).cloned()
|
||||
};
|
||||
|
||||
let channel = if let Some(reused_channel) = reused_channel {
|
||||
reused_channel
|
||||
} else {
|
||||
let endpoint: tonic::transport::Endpoint = self
|
||||
.shard_map
|
||||
.get(&shard_no)
|
||||
.expect("no url for shard {shard_no}")
|
||||
.parse()?;
|
||||
let channel = endpoint.connect().await?;
|
||||
|
||||
// Insert it to the cache so that it can be reused on subsequent calls. It's possible
|
||||
// that another thread did the same concurrently, in which case we will overwrite the
|
||||
// client in the cache.
|
||||
{
|
||||
let mut channels = self.channels.write().unwrap();
|
||||
channels.insert(shard_no, channel.clone());
|
||||
}
|
||||
channel
|
||||
};
|
||||
|
||||
let client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.clone());
|
||||
Ok(client)
|
||||
}
|
||||
}
|
||||
|
||||
/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
|
||||
#[derive(Clone)]
|
||||
struct AuthInterceptor {
|
||||
tenant_id: AsciiMetadataValue,
|
||||
timeline_id: AsciiMetadataValue,
|
||||
|
||||
auth_token: Option<AsciiMetadataValue>,
|
||||
}
|
||||
|
||||
impl AuthInterceptor {
|
||||
fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&String>) -> Self {
|
||||
Self {
|
||||
tenant_id: tenant_id.parse().expect("could not parse tenant id"),
|
||||
timeline_id: timeline_id.parse().expect("could not parse timeline id"),
|
||||
auth_token: auth_token.map(|x| x.parse().expect("could not parse auth token")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
|
||||
req.metadata_mut()
|
||||
.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
if let Some(auth_token) = &self.auth_token {
|
||||
req.metadata_mut()
|
||||
.insert("neon-auth-token", auth_token.clone());
|
||||
}
|
||||
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::virtual_file::api::IoMode;
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver_api::key::Key;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
@@ -27,6 +28,7 @@ pub(crate) enum LayerCmd {
|
||||
path: PathBuf,
|
||||
tenant: String,
|
||||
timeline: String,
|
||||
key: Option<Key>,
|
||||
},
|
||||
/// Dump all information of a layer file
|
||||
DumpLayer {
|
||||
@@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
path,
|
||||
tenant,
|
||||
timeline,
|
||||
key,
|
||||
} => {
|
||||
let timeline_path = path
|
||||
.join(TENANTS_SEGMENT_NAME)
|
||||
@@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
.join(TIMELINES_SEGMENT_NAME)
|
||||
.join(timeline);
|
||||
let mut idx = 0;
|
||||
let mut to_print = Vec::default();
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
let layer = layer?;
|
||||
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
if let Some(key) = key {
|
||||
if layer_file.key_range.start <= *key && *key < layer_file.key_range.end {
|
||||
to_print.push((idx, layer_file));
|
||||
}
|
||||
} else {
|
||||
to_print.push((idx, layer_file));
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if key.is_some() {
|
||||
to_print
|
||||
.sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end));
|
||||
}
|
||||
|
||||
for (idx, layer_file) in to_print {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::DumpLayer {
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
[package]
|
||||
name = "pageserver_data_api"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
|
||||
# For Lsn.
|
||||
#
|
||||
# TODO: move Lsn to separate crate? This draws in a lot more dependencies
|
||||
utils.workspace = true
|
||||
|
||||
prost.workspace = true
|
||||
thiserror.workspace = true
|
||||
tonic.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build.workspace = true
|
||||
@@ -1,8 +0,0 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Generate rust code from .proto protobuf.
|
||||
tonic_build::configure()
|
||||
.bytes(&["."])
|
||||
.compile_protos(&["proto/page_service.proto"], &["proto"])
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,84 +0,0 @@
|
||||
// Page service presented by pageservers, for computes
|
||||
//
|
||||
// Each request must come with the following metadata:
|
||||
// - neon-tenant-id
|
||||
// - neon-timeline-id
|
||||
// - neon-auth-token (if auth is enabled)
|
||||
//
|
||||
// TODO: what else? Priority? OpenTelemetry tracing?
|
||||
//
|
||||
|
||||
syntax = "proto3";
|
||||
package page_service;
|
||||
|
||||
service PageService {
|
||||
rpc RelExists(RelExistsRequest) returns (RelExistsResponse);
|
||||
|
||||
// Returns size of a relation, as # of blocks
|
||||
rpc RelSize (RelSizeRequest) returns (RelSizeResponse);
|
||||
|
||||
rpc GetPage (GetPageRequest) returns (GetPageResponse);
|
||||
|
||||
// Returns total size of a database, as # of bytes
|
||||
rpc DbSize (DbSizeRequest) returns (DbSizeResponse);
|
||||
|
||||
rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
|
||||
}
|
||||
|
||||
message RequestCommon {
|
||||
uint64 request_lsn = 1;
|
||||
uint64 not_modified_since_lsn = 2;
|
||||
}
|
||||
|
||||
message RelTag {
|
||||
uint32 spc_oid = 1;
|
||||
uint32 db_oid = 2;
|
||||
uint32 rel_number = 3;
|
||||
uint32 fork_number = 4;
|
||||
}
|
||||
|
||||
message RelExistsRequest {
|
||||
RequestCommon common = 1;
|
||||
RelTag rel = 2;
|
||||
}
|
||||
|
||||
message RelExistsResponse {
|
||||
bool exists = 1;
|
||||
}
|
||||
|
||||
message RelSizeRequest {
|
||||
RequestCommon common = 1;
|
||||
RelTag rel = 2;
|
||||
}
|
||||
|
||||
message RelSizeResponse {
|
||||
uint32 num_blocks = 1;
|
||||
}
|
||||
|
||||
message GetPageRequest {
|
||||
RequestCommon common = 1;
|
||||
RelTag rel = 2;
|
||||
uint32 block_number = 3;
|
||||
}
|
||||
|
||||
message GetPageResponse {
|
||||
bytes page_image = 1;
|
||||
}
|
||||
|
||||
message DbSizeRequest {
|
||||
RequestCommon common = 1;
|
||||
uint32 db_oid = 2;
|
||||
}
|
||||
|
||||
message DbSizeResponse {
|
||||
uint64 num_bytes = 1;
|
||||
}
|
||||
|
||||
message GetBaseBackupRequest {
|
||||
RequestCommon common = 1;
|
||||
bool replica = 2;
|
||||
}
|
||||
|
||||
message GetBaseBackupResponseChunk {
|
||||
bytes chunk = 1;
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
//! This crate has two modules related to the Pageserver Data API:
|
||||
//!
|
||||
//! proto: code auto-generated from the protobuf definition
|
||||
//! model: slightly more ergonomic structs representing the same API
|
||||
//!
|
||||
//! See protobuf spec under the protos/ subdirectory.
|
||||
//!
|
||||
//! This crate is used by both the client and the server. Try to keep it slim.
|
||||
//!
|
||||
pub mod model;
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod proto {
|
||||
tonic::include_proto!("page_service");
|
||||
}
|
||||
|
||||
pub use proto::page_service_client as client;
|
||||
@@ -1,239 +0,0 @@
|
||||
//! Structs representing the API
|
||||
//!
|
||||
//! These mirror the pageserver APIs and the structs automatically generated
|
||||
//! from the protobuf specification. The differences are:
|
||||
//!
|
||||
//! - Types that are in fact required by the API are not Options. The protobuf "required"
|
||||
//! attribute is deprecated and 'prost' marks a lot of members as optional because of that.
|
||||
//! (See https://github.com/tokio-rs/prost/issues/800 for a gripe on this)
|
||||
//!
|
||||
//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::proto;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RequestCommon {
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
|
||||
pub struct RelTag {
|
||||
pub spc_oid: u32,
|
||||
pub db_oid: u32,
|
||||
pub rel_number: u32,
|
||||
pub fork_number: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RelExistsRequest {
|
||||
pub common: RequestCommon,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RelSizeRequest {
|
||||
pub common: RequestCommon,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RelSizeResponse {
|
||||
pub num_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GetPageRequest {
|
||||
pub common: RequestCommon,
|
||||
pub rel: RelTag,
|
||||
pub block_number: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GetPageResponse {
|
||||
pub page_image: std::vec::Vec<u8>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DbSizeRequest {
|
||||
pub common: RequestCommon,
|
||||
pub db_oid: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DbSizeResponse {
|
||||
pub num_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GetBaseBackupRequest {
|
||||
pub common: RequestCommon,
|
||||
pub replica: bool,
|
||||
}
|
||||
|
||||
//--- Conversions to/from the generated proto types
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ProtocolError {
|
||||
#[error("the value for field `{0}` is invalid")]
|
||||
InvalidValue(&'static str),
|
||||
#[error("the required field `{0}` is missing ")]
|
||||
Missing(&'static str),
|
||||
}
|
||||
|
||||
impl From<ProtocolError> for tonic::Status {
|
||||
fn from(e: ProtocolError) -> Self {
|
||||
match e {
|
||||
ProtocolError::InvalidValue(_field) => tonic::Status::invalid_argument(e.to_string()),
|
||||
ProtocolError::Missing(_field) => tonic::Status::invalid_argument(e.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RelTag> for proto::RelTag {
|
||||
fn from(value: &RelTag) -> proto::RelTag {
|
||||
proto::RelTag {
|
||||
spc_oid: value.spc_oid,
|
||||
db_oid: value.db_oid,
|
||||
rel_number: value.rel_number,
|
||||
fork_number: value.fork_number as u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::RelTag> for RelTag {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::RelTag) -> Result<RelTag, ProtocolError> {
|
||||
Ok(RelTag {
|
||||
spc_oid: value.spc_oid,
|
||||
db_oid: value.db_oid,
|
||||
rel_number: value.rel_number,
|
||||
fork_number: value
|
||||
.fork_number
|
||||
.try_into()
|
||||
.or(Err(ProtocolError::InvalidValue("fork_number")))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RequestCommon> for proto::RequestCommon {
|
||||
fn from(value: &RequestCommon) -> proto::RequestCommon {
|
||||
proto::RequestCommon {
|
||||
request_lsn: value.request_lsn.into(),
|
||||
not_modified_since_lsn: value.not_modified_since_lsn.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<&proto::RequestCommon> for RequestCommon {
|
||||
fn from(value: &proto::RequestCommon) -> RequestCommon {
|
||||
RequestCommon {
|
||||
request_lsn: value.request_lsn.into(),
|
||||
not_modified_since_lsn: value.not_modified_since_lsn.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RelExistsRequest> for proto::RelExistsRequest {
|
||||
fn from(value: &RelExistsRequest) -> proto::RelExistsRequest {
|
||||
proto::RelExistsRequest {
|
||||
common: Some((&value.common).into()),
|
||||
rel: Some((&value.rel).into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::RelExistsRequest> for RelExistsRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::RelExistsRequest) -> Result<RelExistsRequest, ProtocolError> {
|
||||
Ok(RelExistsRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RelSizeRequest> for proto::RelSizeRequest {
|
||||
fn from(value: &RelSizeRequest) -> proto::RelSizeRequest {
|
||||
proto::RelSizeRequest {
|
||||
common: Some((&value.common).into()),
|
||||
rel: Some((&value.rel).into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::RelSizeRequest) -> Result<RelSizeRequest, ProtocolError> {
|
||||
Ok(RelSizeRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&GetPageRequest> for proto::GetPageRequest {
|
||||
fn from(value: &GetPageRequest) -> proto::GetPageRequest {
|
||||
proto::GetPageRequest {
|
||||
common: Some((&value.common).into()),
|
||||
rel: Some((&value.rel).into()),
|
||||
block_number: value.block_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::GetPageRequest> for GetPageRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::GetPageRequest) -> Result<GetPageRequest, ProtocolError> {
|
||||
Ok(GetPageRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
|
||||
block_number: value.block_number,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&DbSizeRequest> for proto::DbSizeRequest {
|
||||
fn from(value: &DbSizeRequest) -> proto::DbSizeRequest {
|
||||
proto::DbSizeRequest {
|
||||
common: Some((&value.common).into()),
|
||||
db_oid: value.db_oid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::DbSizeRequest) -> Result<DbSizeRequest, ProtocolError> {
|
||||
Ok(DbSizeRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
db_oid: value.db_oid,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
fn from(value: &GetBaseBackupRequest) -> proto::GetBaseBackupRequest {
|
||||
proto::GetBaseBackupRequest {
|
||||
common: Some((&value.common).into()),
|
||||
replica: value.replica,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(
|
||||
value: &proto::GetBaseBackupRequest,
|
||||
) -> Result<GetBaseBackupRequest, ProtocolError> {
|
||||
Ok(GetBaseBackupRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
replica: value.replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -23,8 +23,6 @@ tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
pageserver_client.workspace = true
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_data_api.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -9,9 +9,6 @@ use anyhow::Context;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
use pageserver_client_grpc;
|
||||
use pageserver_data_api::model::{GetBaseBackupRequest, RequestCommon};
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
@@ -25,8 +22,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
/// basebackup@LatestLSN
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
@@ -57,7 +52,7 @@ impl LiveStats {
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Range<Lsn>,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
@@ -110,7 +105,7 @@ async fn main_impl(
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
// TODO: support lsn_range != latest LSN
|
||||
lsn_range: info.last_record_lsn..(info.last_record_lsn + 1),
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
})
|
||||
}
|
||||
});
|
||||
@@ -154,27 +149,14 @@ async fn main_impl(
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
|
||||
let client_task = if args.grpc {
|
||||
tokio::spawn(client_grpc(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
))
|
||||
} else {
|
||||
tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
))
|
||||
};
|
||||
tasks.push(client_task);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
@@ -183,7 +165,7 @@ async fn main_impl(
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = rng.gen_range(target.lsn_range.clone());
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
@@ -233,7 +215,7 @@ async fn main_impl(
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Lsn,
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
@@ -258,7 +240,7 @@ async fn client(
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn: Some(lsn),
|
||||
lsn,
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
@@ -288,71 +270,3 @@ async fn client(
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client_grpc(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
|
||||
let client = pageserver_client_grpc::PageserverClient::new(
|
||||
&timeline.tenant_id.to_string(),
|
||||
&timeline.timeline_id.to_string(),
|
||||
&None,
|
||||
shard_map,
|
||||
);
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
|
||||
//tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
|
||||
info!("starting get_base_backup");
|
||||
let mut basebackup_stream = client
|
||||
.get_base_backup(
|
||||
&GetBaseBackupRequest {
|
||||
common: RequestCommon {
|
||||
request_lsn: lsn,
|
||||
not_modified_since_lsn: lsn,
|
||||
},
|
||||
replica: false,
|
||||
},
|
||||
gzip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
|
||||
info!("starting receive");
|
||||
use futures::StreamExt;
|
||||
let mut size = 0;
|
||||
let mut nchunks = 0;
|
||||
while let Some(chunk) = basebackup_stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("error during basebackup"))
|
||||
.unwrap();
|
||||
size += chunk.chunk.len();
|
||||
nchunks += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
"basebackup size is {} bytes, avg chunk size {} bytes",
|
||||
size,
|
||||
size as f32 / nchunks as f32
|
||||
);
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -8,8 +8,6 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::StreamExt;
|
||||
use futures::stream::FuturesOrdered;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
@@ -27,8 +25,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
@@ -299,29 +295,7 @@ async fn main_impl(
|
||||
.unwrap();
|
||||
|
||||
Box::pin(async move {
|
||||
if args.grpc {
|
||||
client_grpc(
|
||||
args,
|
||||
worker_id,
|
||||
ss,
|
||||
cancel,
|
||||
rps_period,
|
||||
ranges,
|
||||
weights,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
client_libpq(
|
||||
args,
|
||||
worker_id,
|
||||
ss,
|
||||
cancel,
|
||||
rps_period,
|
||||
ranges,
|
||||
weights,
|
||||
)
|
||||
.await
|
||||
}
|
||||
client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
|
||||
})
|
||||
};
|
||||
|
||||
@@ -460,100 +434,3 @@ async fn client_libpq(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn client_grpc(
|
||||
args: &Args,
|
||||
worker_id: WorkerId,
|
||||
shared_state: Arc<SharedState>,
|
||||
cancel: CancellationToken,
|
||||
rps_period: Option<Duration>,
|
||||
ranges: Vec<KeyRange>,
|
||||
weights: rand::distributions::weighted::WeightedIndex<i128>,
|
||||
) {
|
||||
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
|
||||
let client = pageserver_client_grpc::PageserverClient::new(
|
||||
&worker_id.timeline.tenant_id.to_string(),
|
||||
&worker_id.timeline.timeline_id.to_string(),
|
||||
&None,
|
||||
shard_map,
|
||||
);
|
||||
let client = Arc::new(client);
|
||||
|
||||
shared_state.start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
let mut inflight = FuturesOrdered::new();
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
let periods_passed_until_now =
|
||||
usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
|
||||
|
||||
if periods_passed_until_now > ticks_processed {
|
||||
shared_state
|
||||
.live_stats
|
||||
.missed((periods_passed_until_now - ticks_processed) as u64);
|
||||
}
|
||||
ticks_processed = periods_passed_until_now;
|
||||
}
|
||||
|
||||
while inflight.len() < args.queue_depth.get() {
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
pageserver_data_api::model::GetPageRequest {
|
||||
common: pageserver_data_api::model::RequestCommon {
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since_lsn: r.timeline_lsn,
|
||||
},
|
||||
rel: pageserver_data_api::model::RelTag {
|
||||
spc_oid: rel_tag.spcnode,
|
||||
db_oid: rel_tag.dbnode,
|
||||
rel_number: rel_tag.relnode,
|
||||
fork_number: rel_tag.forknum,
|
||||
},
|
||||
block_number: block_no,
|
||||
}
|
||||
};
|
||||
let client_clone = client.clone();
|
||||
let getpage_fut = async move {
|
||||
let result = client_clone.get_page(&req).await;
|
||||
(start, result)
|
||||
};
|
||||
inflight.push_back(getpage_fut);
|
||||
}
|
||||
|
||||
let (start, result) = inflight.next().await.unwrap();
|
||||
result.expect("getpage request should succeed");
|
||||
let end = Instant::now();
|
||||
shared_state.live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
STATS.with(|stats| {
|
||||
stats
|
||||
.borrow()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.observe(end.duration_since(start))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
if let Some(period) = &rps_period {
|
||||
let next_at = client_start
|
||||
+ Duration::from_micros(
|
||||
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
|
||||
);
|
||||
tokio::time::sleep_until(next_at.into()).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,14 +151,10 @@ where
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
),
|
||||
};
|
||||
let res = basebackup
|
||||
basebackup
|
||||
.send_tarball()
|
||||
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
|
||||
.await;
|
||||
|
||||
info!("basebackup done!");
|
||||
|
||||
res
|
||||
.await
|
||||
}
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
|
||||
@@ -16,7 +16,6 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
|
||||
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
|
||||
use metrics::set_build_info_metric;
|
||||
use nix::sys::socket::{setsockopt, sockopt};
|
||||
use pageserver::compute_service;
|
||||
use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
|
||||
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
|
||||
use pageserver::deletion_queue::DeletionQueue;
|
||||
@@ -28,7 +27,7 @@ use pageserver::task_mgr::{
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, task_mgr, virtual_file,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -505,7 +504,7 @@ fn start_pageserver(
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
remote_storage.clone(),
|
||||
StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?,
|
||||
StorageControllerUpcallClient::new(conf, &shutdown_pageserver),
|
||||
conf,
|
||||
);
|
||||
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
|
||||
@@ -746,7 +745,7 @@ fn start_pageserver(
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
|
||||
let compute_service = compute_service::spawn(
|
||||
let page_service = page_service::spawn(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
@@ -783,7 +782,7 @@ fn start_pageserver(
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
https_endpoint_listener,
|
||||
compute_service,
|
||||
page_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
|
||||
@@ -1,286 +0,0 @@
|
||||
//!
|
||||
//! The Compute Service listens for compute connections, and serves requests like
|
||||
//! the GetPage@LSN requests.
|
||||
//!
|
||||
//! We support two protocols:
|
||||
//!
|
||||
//! 1. Legacy, connection-oriented libpq based protocol. That's
|
||||
//! handled by the code in page_service.rs.
|
||||
//!
|
||||
//! 2. gRPC based protocol. See compute_service_grpc.rs.
|
||||
//!
|
||||
//! To make the transition smooth, without having to open up new firewall ports
|
||||
//! etc, both protocols are served on the same port. When a new TCP connection
|
||||
//! is accepted, we peek at the first few bytes incoming from the client to
|
||||
//! determine which protocol it speaks.
|
||||
//!
|
||||
//! TODO: This gets easier once we drop the legacy protocol support. Or if we
|
||||
//! open a separate port for them.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::FutureExt;
|
||||
use pageserver_api::config::PageServicePipeliningConfig;
|
||||
use postgres_backend::AuthType;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
|
||||
use crate::compute_service_grpc::launch_compute_service_grpc_server;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_service::libpq_page_service_conn_main;
|
||||
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub type ConnectionHandlerResult = anyhow::Result<()>;
|
||||
|
||||
pub struct Connections {
|
||||
cancel: CancellationToken,
|
||||
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
|
||||
gate: Gate,
|
||||
}
|
||||
|
||||
impl Connections {
|
||||
pub(crate) async fn shutdown(self) {
|
||||
let Self {
|
||||
cancel,
|
||||
mut tasks,
|
||||
gate,
|
||||
} = self;
|
||||
cancel.cancel();
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
Self::handle_connection_completion(res);
|
||||
}
|
||||
gate.close().await;
|
||||
}
|
||||
|
||||
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
|
||||
match res {
|
||||
Ok(Ok(())) => {}
|
||||
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
|
||||
Err(e) => error!("page_service connection task panicked: {:?}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Listener {
|
||||
cancel: CancellationToken,
|
||||
/// Cancel the listener task through `listen_cancel` to shut down the listener
|
||||
/// and get a handle on the existing connections.
|
||||
task: JoinHandle<Connections>,
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
pg_auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
// create a separate sub-contexts for each connection, with their
|
||||
// own download behavior. This context is used only to listen and
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"compute connection listener",
|
||||
compute_connection_listener_main(
|
||||
conf,
|
||||
tenant_manager,
|
||||
pg_auth,
|
||||
perf_trace_dispatch,
|
||||
tcp_listener,
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
)
|
||||
.map(anyhow::Ok),
|
||||
));
|
||||
|
||||
Listener { cancel, task }
|
||||
}
|
||||
|
||||
impl Listener {
|
||||
pub async fn stop_accepting(self) -> Connections {
|
||||
self.cancel.cancel();
|
||||
self.task
|
||||
.await
|
||||
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
|
||||
}
|
||||
}
|
||||
|
||||
/// Listener loop. Listens for connections, and launches a new handler
|
||||
/// task for each.
|
||||
///
|
||||
/// Returns Ok(()) upon cancellation via `cancel`, returning the set of
|
||||
/// open connections.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn compute_connection_listener_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
listener: tokio::net::TcpListener,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
let connections_cancel = CancellationToken::new();
|
||||
let connections_gate = Gate::default();
|
||||
let mut connection_handler_tasks = tokio::task::JoinSet::default();
|
||||
|
||||
// The connection handling task passes the gRPC protocol
|
||||
// connections to this channel. The tonic gRPC server reads the
|
||||
// channel and takes over the connections from there.
|
||||
let (grpc_connections_tx, grpc_connections_rx) = tokio::sync::mpsc::channel(1000);
|
||||
|
||||
// Set up the gRPC service
|
||||
launch_compute_service_grpc_server(
|
||||
grpc_connections_rx,
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
auth.clone(),
|
||||
auth_type,
|
||||
connections_cancel.clone(),
|
||||
&listener_ctx,
|
||||
);
|
||||
|
||||
// Main listener loop
|
||||
loop {
|
||||
let gate_guard = match connections_gate.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => break,
|
||||
};
|
||||
|
||||
let accepted = tokio::select! {
|
||||
biased;
|
||||
_ = listener_cancel.cancelled() => break,
|
||||
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
|
||||
let res = next.expect("we dont poll while empty");
|
||||
Connections::handle_connection_completion(res);
|
||||
continue;
|
||||
}
|
||||
accepted = listener.accept() => accepted,
|
||||
};
|
||||
|
||||
match accepted {
|
||||
Ok((socket, peer_addr)) => {
|
||||
// Connection established. Spawn a new task to handle it.
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let local_auth = auth.clone();
|
||||
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
|
||||
.task_kind(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch.clone())
|
||||
.detached_child();
|
||||
|
||||
connection_handler_tasks.spawn(page_service_conn_main(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
grpc_connections_tx.clone(),
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
// accept() failed. Log the error, and loop back to retry on next connection.
|
||||
error!("accept() failed: {:?}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("page_service listener loop terminated");
|
||||
|
||||
Connections {
|
||||
cancel: connections_cancel,
|
||||
tasks: connection_handler_tasks,
|
||||
gate: connections_gate,
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a new incoming connection.
|
||||
///
|
||||
/// This peeks at the first few incoming bytes and dispatches the connection
|
||||
/// to the legacy libpq handler or the new gRPC handler accordingly.
|
||||
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
grpc_connections_tx: tokio::sync::mpsc::Sender<tokio::io::Result<tokio::net::TcpStream>>,
|
||||
) -> ConnectionHandlerResult {
|
||||
let mut buf: [u8; 4] = [0; 4];
|
||||
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
// Peek
|
||||
socket.peek(&mut buf).await?;
|
||||
|
||||
let mut grpc = false;
|
||||
if buf[0] == 0x16 {
|
||||
// looks like a TLS handshake. Assume gRPC.
|
||||
// XXX: Starting with v17, PostgreSQL also supports "direct TLS mode". But
|
||||
// the compute doesn't use it.
|
||||
grpc = true;
|
||||
}
|
||||
|
||||
if buf[0] == b'G' || buf[0] == b'P' {
|
||||
// Looks like 'GET' or 'POST'
|
||||
// or 'PRI', indicating gRPC over HTTP/2 with prior knowledge
|
||||
grpc = true;
|
||||
}
|
||||
|
||||
// Dispatch
|
||||
if grpc {
|
||||
grpc_connections_tx.send(Ok(socket)).await?;
|
||||
info!("connection sent to channel");
|
||||
Ok(())
|
||||
} else {
|
||||
libpq_page_service_conn_main(
|
||||
conf,
|
||||
tenant_manager,
|
||||
auth,
|
||||
socket,
|
||||
auth_type,
|
||||
tls_config,
|
||||
pipelining_config,
|
||||
connection_ctx,
|
||||
cancel,
|
||||
gate_guard,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
@@ -1,746 +0,0 @@
|
||||
//!
|
||||
//! Compute <-> Pageserver API handler. This is for the new gRPC-based protocol
|
||||
//!
|
||||
//! TODO:
|
||||
//!
|
||||
//! - Many of the API endpoints are still missing
|
||||
//!
|
||||
//! - This is very much not optimized.
|
||||
//!
|
||||
//! - Much of the code was copy-pasted from page_service.rs. Like the code to get the
|
||||
//! Timeline object, and the JWT auth. Could refactor and share.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::TenantManager;
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::mgr::ShardResolveResult;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::WaitLsnTimeout;
|
||||
use tokio::io::{AsyncWriteExt, ReadHalf, SimplexStream};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::codec::{Decoder, FramedRead};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
use pageserver_data_api::model;
|
||||
use pageserver_data_api::proto::page_service_server::PageService;
|
||||
use pageserver_data_api::proto::page_service_server::PageServiceServer;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::BytesMut;
|
||||
use jsonwebtoken::TokenData;
|
||||
use tracing::Instrument;
|
||||
use tracing::{debug, error};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::simple_rcu::RcuReadGuard;
|
||||
|
||||
use crate::tenant::PageReconstructError;
|
||||
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
use tonic;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::service::interceptor::InterceptedService;
|
||||
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
|
||||
use postgres_backend::AuthType;
|
||||
|
||||
pub use pageserver_data_api::proto;
|
||||
|
||||
pub(super) fn launch_compute_service_grpc_server(
|
||||
tcp_connections_rx: tokio::sync::mpsc::Receiver<tokio::io::Result<tokio::net::TcpStream>>,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
auth_type: AuthType,
|
||||
connections_cancel: CancellationToken,
|
||||
listener_ctx: &RequestContext,
|
||||
) {
|
||||
// Set up the gRPC service
|
||||
let service_ctx = RequestContextBuilder::from(listener_ctx)
|
||||
.task_kind(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.attached_child();
|
||||
let service = crate::compute_service_grpc::PageServiceService {
|
||||
conf,
|
||||
tenant_mgr: tenant_manager.clone(),
|
||||
ctx: Arc::new(service_ctx),
|
||||
};
|
||||
let authenticator = PageServiceAuthenticator {
|
||||
auth: auth.clone(),
|
||||
auth_type,
|
||||
};
|
||||
|
||||
let server = InterceptedService::new(
|
||||
PageServiceServer::new(service).send_compressed(CompressionEncoding::Gzip),
|
||||
authenticator,
|
||||
);
|
||||
|
||||
let cc = connections_cancel.clone();
|
||||
tokio::spawn(async move {
|
||||
tonic::transport::Server::builder()
|
||||
.add_service(server)
|
||||
.serve_with_incoming_shutdown(
|
||||
tokio_stream::wrappers::ReceiverStream::new(tcp_connections_rx),
|
||||
cc.cancelled(),
|
||||
)
|
||||
.await
|
||||
});
|
||||
}
|
||||
|
||||
struct PageServiceService {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_mgr: Arc<TenantManager>,
|
||||
ctx: Arc<RequestContext>,
|
||||
}
|
||||
|
||||
/// An error happened in a get() operation.
|
||||
impl From<PageReconstructError> for tonic::Status {
|
||||
fn from(e: PageReconstructError) -> Self {
|
||||
match e {
|
||||
PageReconstructError::Other(err) => tonic::Status::unknown(err.to_string()),
|
||||
PageReconstructError::AncestorLsnTimeout(_) => {
|
||||
tonic::Status::unavailable(e.to_string())
|
||||
}
|
||||
PageReconstructError::Cancelled => tonic::Status::aborted(e.to_string()),
|
||||
PageReconstructError::WalRedo(_) => tonic::Status::internal(e.to_string()),
|
||||
PageReconstructError::MissingKey(_) => tonic::Status::internal(e.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_reltag(value: &model::RelTag) -> pageserver_api::reltag::RelTag {
|
||||
pageserver_api::reltag::RelTag {
|
||||
spcnode: value.spc_oid,
|
||||
dbnode: value.db_oid,
|
||||
relnode: value.rel_number,
|
||||
forknum: value.fork_number,
|
||||
}
|
||||
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl PageService for PageServiceService {
|
||||
type GetBaseBackupStream = GetBaseBackupStream;
|
||||
|
||||
async fn rel_exists(
|
||||
&self,
|
||||
request: tonic::Request<proto::RelExistsRequest>,
|
||||
) -> std::result::Result<tonic::Response<proto::RelExistsResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::RelExistsRequest = request.get_ref().try_into()?;
|
||||
|
||||
let rel = convert_reltag(&req.rel);
|
||||
let span = tracing::info_span!("rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let exists = timeline
|
||||
.get_rel_exists(rel, Version::Lsn(lsn), &ctx)
|
||||
.await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::RelExistsResponse { exists }))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Returns size of a relation, as # of blocks
|
||||
async fn rel_size(
|
||||
&self,
|
||||
request: tonic::Request<proto::RelSizeRequest>,
|
||||
) -> std::result::Result<tonic::Response<proto::RelSizeResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::RelSizeRequest = request.get_ref().try_into()?;
|
||||
let rel = convert_reltag(&req.rel);
|
||||
|
||||
let span = tracing::info_span!("rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let num_blocks = timeline.get_rel_size(rel, Version::Lsn(lsn), &ctx).await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::RelSizeResponse { num_blocks }))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn get_page(
|
||||
&self,
|
||||
request: tonic::Request<proto::GetPageRequest>,
|
||||
) -> std::result::Result<tonic::Response<proto::GetPageResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::GetPageRequest = request.get_ref().try_into()?;
|
||||
|
||||
// Calculate shard number.
|
||||
//
|
||||
// FIXME: this should probably be part of the data_api crate.
|
||||
let rel = convert_reltag(&req.rel);
|
||||
let key = rel_block_to_key(rel, req.block_number);
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Page(key)).await?;
|
||||
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let shard_id = timeline.tenant_shard_id.shard_number;
|
||||
let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, shard_id = %shard_id, timeline_id = %ttid.timeline_id, rel = %rel, block_number = %req.block_number, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let gate_guard = match timeline.gate.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(tonic::Status::unavailable("timeline is shutting down"));
|
||||
}
|
||||
};
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(self.conf, gate_guard);
|
||||
|
||||
let page_image = timeline
|
||||
.get_rel_page_at_lsn(
|
||||
rel,
|
||||
req.block_number,
|
||||
Version::Lsn(lsn),
|
||||
&ctx,
|
||||
io_concurrency,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::GetPageResponse {
|
||||
page_image: page_image,
|
||||
}))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn db_size(
|
||||
&self,
|
||||
request: tonic::Request<proto::DbSizeRequest>,
|
||||
) -> Result<tonic::Response<proto::DbSizeResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::DbSizeRequest = request.get_ref().try_into()?;
|
||||
|
||||
let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let total_blocks = timeline
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::Lsn(lsn), &ctx)
|
||||
.await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::DbSizeResponse {
|
||||
num_bytes: total_blocks as u64 * BLCKSZ as u64,
|
||||
}))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn get_base_backup(
|
||||
&self,
|
||||
request: tonic::Request<proto::GetBaseBackupRequest>,
|
||||
) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::GetBaseBackupRequest = request.get_ref().try_into()?;
|
||||
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, req_lsn = %req.common.request_lsn);
|
||||
|
||||
tracing::info!("starting basebackup");
|
||||
|
||||
#[allow(dead_code)]
|
||||
enum TestMode {
|
||||
/// Create real basebackup, in streaming fashion
|
||||
Streaming,
|
||||
/// Create real basebackup, but fully materialize it in the 'simplex' pipe buffer first
|
||||
Materialize,
|
||||
/// Create a dummy all-zeros basebackup, in streaming fashion
|
||||
DummyStreaming,
|
||||
/// Create a dummy all-zeros basebackup, but fully materialize it first
|
||||
DummyMaterialize,
|
||||
}
|
||||
let mode = TestMode::Streaming;
|
||||
|
||||
let buf_size = match mode {
|
||||
TestMode::Streaming | TestMode::DummyStreaming => 64 * 1024,
|
||||
TestMode::Materialize | TestMode::DummyMaterialize => 64 * 1024 * 1024,
|
||||
};
|
||||
|
||||
let (simplex_read, mut simplex_write) = tokio::io::simplex(buf_size);
|
||||
|
||||
let basebackup_task = match mode {
|
||||
TestMode::DummyStreaming => {
|
||||
tokio::spawn(
|
||||
async move {
|
||||
// hold onto the guard for as long as the basebackup runs
|
||||
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
|
||||
|
||||
let zerosbuf: [u8; 1024] = [0; 1024];
|
||||
let nbytes = 16900000;
|
||||
let mut bytes_written = 0;
|
||||
while bytes_written < nbytes {
|
||||
let s = std::cmp::min(1024, nbytes - bytes_written);
|
||||
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
|
||||
bytes_written += s;
|
||||
}
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.context("shutdown of basebackup pipe")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.instrument(span),
|
||||
)
|
||||
}
|
||||
TestMode::DummyMaterialize => {
|
||||
let zerosbuf: [u8; 1024] = [0; 1024];
|
||||
let nbytes = 16900000;
|
||||
let mut bytes_written = 0;
|
||||
while bytes_written < nbytes {
|
||||
let s = std::cmp::min(1024, nbytes - bytes_written);
|
||||
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
|
||||
bytes_written += s;
|
||||
}
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.expect("shutdown of basebackup pipe");
|
||||
tracing::info!("basebackup (dummy) materialized");
|
||||
let result = Ok(());
|
||||
|
||||
tokio::spawn(std::future::ready(result))
|
||||
}
|
||||
TestMode::Materialize => {
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
Some(lsn),
|
||||
None,
|
||||
false,
|
||||
req.replica,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.expect("shutdown of basebackup pipe");
|
||||
tracing::info!("basebackup materialized");
|
||||
|
||||
// Launch a task that writes the basebackup tarball to the simplex pipe
|
||||
tokio::spawn(std::future::ready(result))
|
||||
}
|
||||
TestMode::Streaming => {
|
||||
tokio::spawn(
|
||||
async move {
|
||||
// hold onto the guard for as long as the basebackup runs
|
||||
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
|
||||
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
Some(lsn),
|
||||
None,
|
||||
false,
|
||||
req.replica,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.context("shutdown of basebackup pipe")?;
|
||||
result
|
||||
}
|
||||
.instrument(span),
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let response = new_basebackup_response_stream(simplex_read, basebackup_task);
|
||||
|
||||
Ok(tonic::Response::new(response))
|
||||
}
|
||||
}
|
||||
|
||||
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
|
||||
/// NB: and also different from page_service::ACTIVE_TENANT_TIMEOUT
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
|
||||
impl PageServiceService {
|
||||
async fn get_timeline(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> Result<Arc<Timeline>, tonic::Status> {
|
||||
let timeout = ACTIVE_TENANT_TIMEOUT;
|
||||
let wait_start = Instant::now();
|
||||
let deadline = wait_start + timeout;
|
||||
|
||||
let tenant_shard = loop {
|
||||
let resolved = self
|
||||
.tenant_mgr
|
||||
.resolve_attached_shard(&ttid.tenant_id, shard_selector);
|
||||
|
||||
match resolved {
|
||||
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
|
||||
ShardResolveResult::NotFound => {
|
||||
return Err(tonic::Status::not_found("tenant not found"));
|
||||
}
|
||||
ShardResolveResult::InProgress(barrier) => {
|
||||
// We can't authoritatively answer right now: wait for InProgress state
|
||||
// to end, then try again
|
||||
tokio::select! {
|
||||
_ = barrier.wait() => {
|
||||
// The barrier completed: proceed around the loop to try looking up again
|
||||
},
|
||||
_ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
|
||||
return Err(tonic::Status::unavailable("tenant is in InProgress state"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!("Waiting for tenant to enter active state...");
|
||||
tenant_shard
|
||||
.wait_to_become_active(deadline.duration_since(Instant::now()))
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tonic::Status::unavailable(format!("tenant is not in active state: {e}"))
|
||||
})?;
|
||||
|
||||
let timeline = tenant_shard
|
||||
.get_timeline(ttid.timeline_id, true)
|
||||
.map_err(|e| tonic::Status::unavailable(format!("could not get timeline: {e}")))?;
|
||||
|
||||
// FIXME: need to do something with the 'gate' here?
|
||||
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// Extract TenantTimelineId from the request metadata
|
||||
///
|
||||
/// Note: the interceptor has already authenticated the request
|
||||
///
|
||||
/// TOOD: Could we use "binary" metadata for these, for efficiency? gRPC has such a concept
|
||||
fn extract_ttid(
|
||||
&self,
|
||||
metadata: &tonic::metadata::MetadataMap,
|
||||
) -> Result<TenantTimelineId, tonic::Status> {
|
||||
let tenant_id = metadata
|
||||
.get("neon-tenant-id")
|
||||
.ok_or(tonic::Status::invalid_argument(
|
||||
"neon-tenant-id metadata missing",
|
||||
))?;
|
||||
let tenant_id = tenant_id.to_str().map_err(|_| {
|
||||
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
|
||||
})?;
|
||||
let tenant_id = TenantId::from_str(tenant_id)
|
||||
.map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;
|
||||
|
||||
let timeline_id =
|
||||
metadata
|
||||
.get("neon-timeline-id")
|
||||
.ok_or(tonic::Status::invalid_argument(
|
||||
"neon-timeline-id metadata missing",
|
||||
))?;
|
||||
let timeline_id = timeline_id.to_str().map_err(|_| {
|
||||
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-timeline-id metadata")
|
||||
})?;
|
||||
let timeline_id = TimelineId::from_str(timeline_id)
|
||||
.map_err(|_| tonic::Status::invalid_argument("invalid neon-timelineid metadata"))?;
|
||||
|
||||
Ok(TenantTimelineId::new(tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
// XXX: copied from PageServerHandler
|
||||
async fn wait_or_get_last_lsn(
|
||||
timeline: &Timeline,
|
||||
request_lsn: Lsn,
|
||||
not_modified_since: Lsn,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Lsn, tonic::Status> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
// Sanity check the request
|
||||
if request_lsn < not_modified_since {
|
||||
return Err(tonic::Status::invalid_argument(format!(
|
||||
"invalid request with request LSN {} and not_modified_since {}",
|
||||
request_lsn, not_modified_since,
|
||||
)));
|
||||
}
|
||||
|
||||
// Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
|
||||
if request_lsn == Lsn::INVALID {
|
||||
return Err(tonic::Status::invalid_argument("invalid LSN(0) in request"));
|
||||
}
|
||||
|
||||
// Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
|
||||
//
|
||||
// We may have older data available, but we make a best effort to detect this case and return an error,
|
||||
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
|
||||
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
|
||||
let gc_info = &timeline.gc_info.read().unwrap();
|
||||
if !gc_info.lsn_covered_by_lease(request_lsn) {
|
||||
return Err(tonic::Status::not_found(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
request_lsn, **latest_gc_cutoff_lsn
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||
if not_modified_since > last_record_lsn {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
not_modified_since,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|_| {
|
||||
tonic::Status::unavailable("not_modified_since LSN not arrived yet")
|
||||
})?;
|
||||
// Since we waited for 'not_modified_since' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the last-record LSN can
|
||||
// advance immediately after we return anyway)
|
||||
Ok(not_modified_since)
|
||||
} else {
|
||||
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
|
||||
// here instead. That would give the same result, since we know that there
|
||||
// haven't been any modifications since 'not_modified_since'. Using an older
|
||||
// LSN might be faster, because that could allow skipping recent layers when
|
||||
// finding the page. However, we have historically used 'last_record_lsn', so
|
||||
// stick to that for now.
|
||||
Ok(std::cmp::min(last_record_lsn, request_lsn))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PageServiceAuthenticator {
|
||||
pub auth: Option<Arc<SwappableJwtAuth>>,
|
||||
pub auth_type: AuthType,
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for PageServiceAuthenticator {
|
||||
fn call(
|
||||
&mut self,
|
||||
req: tonic::Request<()>,
|
||||
) -> std::result::Result<tonic::Request<()>, tonic::Status> {
|
||||
// Check the tenant_id in any case
|
||||
let tenant_id =
|
||||
req.metadata()
|
||||
.get("neon-tenant-id")
|
||||
.ok_or(tonic::Status::invalid_argument(
|
||||
"neon-tenant-id metadata missing",
|
||||
))?;
|
||||
let tenant_id = tenant_id.to_str().map_err(|_| {
|
||||
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
|
||||
})?;
|
||||
let tenant_id = TenantId::from_str(tenant_id)
|
||||
.map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;
|
||||
|
||||
// when accessing management api supply None as an argument
|
||||
// when using to authorize tenant pass corresponding tenant id
|
||||
let auth = if let Some(auth) = &self.auth {
|
||||
auth
|
||||
} else {
|
||||
// auth is set to Trust, nothing to check so just return ok
|
||||
return Ok(req);
|
||||
};
|
||||
|
||||
let jwt = req
|
||||
.metadata()
|
||||
.get("neon-auth-token")
|
||||
.ok_or(tonic::Status::unauthenticated("no neon-auth-token"))?;
|
||||
let jwt = jwt.to_str().map_err(|_| {
|
||||
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-auth-token metadata")
|
||||
})?;
|
||||
|
||||
let jwtdata: TokenData<utils::auth::Claims> = auth
|
||||
.decode(jwt)
|
||||
.map_err(|err| tonic::Status::unauthenticated(format!("invalid JWT token: {}", err)))?;
|
||||
let claims = jwtdata.claims;
|
||||
|
||||
if matches!(claims.scope, utils::auth::Scope::Tenant) && claims.tenant_id.is_none() {
|
||||
return Err(tonic::Status::unauthenticated(
|
||||
"jwt token scope is Tenant, but tenant id is missing",
|
||||
));
|
||||
}
|
||||
|
||||
debug!(
|
||||
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
|
||||
claims.scope, claims.tenant_id,
|
||||
);
|
||||
|
||||
// The token is valid. Check if it's allowed to access the tenant ID
|
||||
// given in the request.
|
||||
|
||||
check_permission(&claims, Some(tenant_id))
|
||||
.map_err(|err| tonic::Status::permission_denied(err.to_string()))?;
|
||||
|
||||
// All checks out
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
|
||||
/// Stream of GetBaseBackupResponseChunk messages.
|
||||
///
|
||||
/// The first part of the Chain chunks the tarball. The second part checks the return value
|
||||
/// of the send_basebackup_tarball Future that created the tarball.
|
||||
|
||||
type GetBaseBackupStream = futures::stream::Chain<BasebackupChunkedStream, CheckResultStream>;
|
||||
|
||||
fn new_basebackup_response_stream(
|
||||
simplex_read: ReadHalf<SimplexStream>,
|
||||
basebackup_task: JoinHandle<Result<(), BasebackupError>>,
|
||||
) -> GetBaseBackupStream {
|
||||
let framed = FramedRead::new(simplex_read, GetBaseBackupResponseDecoder {});
|
||||
|
||||
framed.chain(CheckResultStream { basebackup_task })
|
||||
}
|
||||
|
||||
/// Stream that uses GetBaseBackupResponseDecoder
|
||||
type BasebackupChunkedStream =
|
||||
tokio_util::codec::FramedRead<ReadHalf<SimplexStream>, GetBaseBackupResponseDecoder>;
|
||||
|
||||
struct GetBaseBackupResponseDecoder;
|
||||
impl Decoder for GetBaseBackupResponseDecoder {
|
||||
type Item = proto::GetBaseBackupResponseChunk;
|
||||
type Error = tonic::Status;
|
||||
|
||||
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
|
||||
if src.len() < 64 * 1024 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let item = proto::GetBaseBackupResponseChunk {
|
||||
chunk: bytes::Bytes::from(std::mem::take(src)),
|
||||
};
|
||||
|
||||
Ok(Some(item))
|
||||
}
|
||||
|
||||
fn decode_eof(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
|
||||
if src.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let item = proto::GetBaseBackupResponseChunk {
|
||||
chunk: bytes::Bytes::from(std::mem::take(src)),
|
||||
};
|
||||
|
||||
Ok(Some(item))
|
||||
}
|
||||
}
|
||||
|
||||
struct CheckResultStream {
|
||||
basebackup_task: tokio::task::JoinHandle<Result<(), BasebackupError>>,
|
||||
}
|
||||
impl futures::Stream for CheckResultStream {
|
||||
type Item = Result<proto::GetBaseBackupResponseChunk, tonic::Status>;
|
||||
|
||||
fn poll_next(
|
||||
mut self: Pin<&mut Self>,
|
||||
ctx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Option<Self::Item>> {
|
||||
let task = Pin::new(&mut self.basebackup_task);
|
||||
match task.poll(ctx) {
|
||||
Poll::Pending => Poll::Pending,
|
||||
Poll::Ready(Ok(Ok(()))) => Poll::Ready(None),
|
||||
Poll::Ready(Ok(Err(basebackup_err))) => {
|
||||
error!(error=%basebackup_err, "error getting basebackup");
|
||||
Poll::Ready(Some(Err(tonic::Status::internal(
|
||||
"could not get basebackup",
|
||||
))))
|
||||
}
|
||||
Poll::Ready(Err(join_err)) => {
|
||||
error!(error=%join_err, "JoinError getting basebackup");
|
||||
Poll::Ready(Some(Err(tonic::Status::internal(
|
||||
"could not get basebackup",
|
||||
))))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -150,7 +150,7 @@ pub struct PageServerConf {
|
||||
/// not terrible.
|
||||
pub background_task_maximum_delay: Duration,
|
||||
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_api: Url,
|
||||
|
||||
/// JWT token for use with the control plane API.
|
||||
pub control_plane_api_token: Option<SecretString>,
|
||||
@@ -230,6 +230,8 @@ pub struct PageServerConf {
|
||||
/// such as authentication requirements for HTTP and PostgreSQL APIs.
|
||||
/// This is insecure and should only be used in development environments.
|
||||
pub dev_mode: bool,
|
||||
|
||||
pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -404,6 +406,7 @@ impl PageServerConf {
|
||||
tracing,
|
||||
enable_tls_page_service_api,
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -438,7 +441,8 @@ impl PageServerConf {
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
control_plane_api: control_plane_api
|
||||
.ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?,
|
||||
control_plane_emergency_mode,
|
||||
heatmap_upload_concurrency,
|
||||
secondary_download_concurrency,
|
||||
@@ -456,6 +460,7 @@ impl PageServerConf {
|
||||
tracing,
|
||||
enable_tls_page_service_api,
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
@@ -573,6 +578,7 @@ impl PageServerConf {
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
load_previous_heatmap: Some(true),
|
||||
generate_unarchival_heatmap: Some(true),
|
||||
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
|
||||
..Default::default()
|
||||
};
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
@@ -641,9 +647,12 @@ mod tests {
|
||||
use super::PageServerConf;
|
||||
|
||||
#[test]
|
||||
fn test_empty_config_toml_is_valid() {
|
||||
// we use Default impl of everything in this situation
|
||||
fn test_minimal_config_toml_is_valid() {
|
||||
// The minimal valid config for running a pageserver:
|
||||
// - control_plane_api is mandatory, as pageservers cannot run in isolation
|
||||
// - we use Default impl of everything else in this situation
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("empty config is valid");
|
||||
|
||||
@@ -30,9 +30,6 @@ pub(super) enum Name {
|
||||
/// Tenant remote size
|
||||
#[serde(rename = "remote_storage_size")]
|
||||
RemoteSize,
|
||||
/// Tenant resident size
|
||||
#[serde(rename = "resident_size")]
|
||||
ResidentSize,
|
||||
/// Tenant synthetic size
|
||||
#[serde(rename = "synthetic_storage_size")]
|
||||
SyntheticSize,
|
||||
@@ -187,18 +184,6 @@ impl MetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
|
||||
///
|
||||
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
|
||||
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: Name::ResidentSize,
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
|
||||
@@ -261,10 +246,7 @@ where
|
||||
let mut tenants = std::pin::pin!(tenants);
|
||||
|
||||
while let Some((tenant_id, tenant)) = tenants.next().await {
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
let timelines = tenant.list_timelines();
|
||||
let timelines_len = timelines.len();
|
||||
for timeline in timelines {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
@@ -287,16 +269,9 @@ where
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
tenant_resident_size += timeline.resident_physical_size();
|
||||
}
|
||||
|
||||
if timelines_len == 0 {
|
||||
// Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
|
||||
tenant_resident_size = 1;
|
||||
}
|
||||
|
||||
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
|
||||
let snap = TenantSnapshot::collect(&tenant);
|
||||
snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
|
||||
}
|
||||
|
||||
@@ -305,19 +280,14 @@ where
|
||||
|
||||
/// In-between abstraction to allow testing metrics without actual Tenants.
|
||||
struct TenantSnapshot {
|
||||
resident_size: u64,
|
||||
remote_size: u64,
|
||||
synthetic_size: u64,
|
||||
}
|
||||
|
||||
impl TenantSnapshot {
|
||||
/// Collect tenant status to have metrics created out of it.
|
||||
///
|
||||
/// `resident_size` is calculated of the timelines we had access to for other metrics, so we
|
||||
/// cannot just list timelines here.
|
||||
fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
|
||||
fn collect(t: &Arc<crate::tenant::TenantShard>) -> Self {
|
||||
TenantSnapshot {
|
||||
resident_size,
|
||||
remote_size: t.remote_size(),
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
@@ -334,8 +304,6 @@ impl TenantSnapshot {
|
||||
) {
|
||||
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
|
||||
|
||||
let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
|
||||
|
||||
let synthetic_size = {
|
||||
let factory = MetricsKey::synthetic_size(tenant_id);
|
||||
let mut synthetic_size = self.synthetic_size;
|
||||
@@ -355,11 +323,7 @@ impl TenantSnapshot {
|
||||
}
|
||||
};
|
||||
|
||||
metrics.extend(
|
||||
[Some(remote_size), Some(resident_size), synthetic_size]
|
||||
.into_iter()
|
||||
.flatten(),
|
||||
);
|
||||
metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let ts = TenantSnapshot {
|
||||
resident_size: 1000,
|
||||
remote_size: 1000,
|
||||
// not yet calculated
|
||||
synthetic_size: 0,
|
||||
@@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
|
||||
MetricsKey::resident_size(tenant_id).at(now, 1000),
|
||||
MetricsKey::synthetic_size(tenant_id).at(now, 1000),
|
||||
]
|
||||
);
|
||||
@@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let ts = TenantSnapshot {
|
||||
resident_size: 1000,
|
||||
remote_size: 1000,
|
||||
// not yet calculated
|
||||
synthetic_size: 0,
|
||||
@@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
|
||||
MetricsKey::resident_size(tenant_id).at(now, 1000),
|
||||
// no synthetic size here
|
||||
]
|
||||
);
|
||||
@@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old(
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [RawMetric; 6] {
|
||||
) -> [RawMetric; 5] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until_old_format(before, now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
|
||||
]
|
||||
}
|
||||
@@ -312,13 +307,12 @@ pub(crate) const fn metric_examples(
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [NewRawMetric; 6] {
|
||||
) -> [NewRawMetric; 5] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
|
||||
MetricsKey::resident_size(tenant_id).at(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at(now, 1),
|
||||
]
|
||||
}
|
||||
|
||||
@@ -521,10 +521,6 @@ mod tests {
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
|
||||
@@ -564,7 +560,7 @@ mod tests {
|
||||
assert_eq!(upgraded_samples, new_samples);
|
||||
}
|
||||
|
||||
fn metric_samples_old() -> [RawMetric; 6] {
|
||||
fn metric_samples_old() -> [RawMetric; 5] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
@@ -576,7 +572,7 @@ mod tests {
|
||||
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
|
||||
}
|
||||
|
||||
fn metric_samples() -> [NewRawMetric; 6] {
|
||||
fn metric_samples() -> [NewRawMetric; 5] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
|
||||
@@ -58,14 +58,8 @@ pub trait StorageControllerUpcallApi {
|
||||
impl StorageControllerUpcallClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Option<Self>, reqwest::Error> {
|
||||
let mut url = match conf.control_plane_api.as_ref() {
|
||||
Some(u) => u.clone(),
|
||||
None => return Ok(None),
|
||||
};
|
||||
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self {
|
||||
let mut url = conf.control_plane_api.clone();
|
||||
|
||||
if let Ok(mut segs) = url.path_segments_mut() {
|
||||
// This ensures that `url` ends with a slash if it doesn't already.
|
||||
@@ -85,15 +79,17 @@ impl StorageControllerUpcallClient {
|
||||
}
|
||||
|
||||
for cert in &conf.ssl_ca_certs {
|
||||
client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
|
||||
client = client.add_root_certificate(
|
||||
Certificate::from_der(cert.contents()).expect("Invalid certificate in config"),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Some(Self {
|
||||
http_client: client.build()?,
|
||||
Self {
|
||||
http_client: client.build().expect("Failed to construct HTTP client"),
|
||||
base_url: url,
|
||||
node_id: conf.id,
|
||||
cancel: cancel.clone(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
|
||||
@@ -585,7 +585,7 @@ impl DeletionQueue {
|
||||
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
|
||||
pub fn new<C>(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
controller_upcall_client: Option<C>,
|
||||
controller_upcall_client: C,
|
||||
conf: &'static PageServerConf,
|
||||
) -> (Self, DeletionQueueWorkers<C>)
|
||||
where
|
||||
@@ -701,7 +701,7 @@ mod test {
|
||||
async fn restart(&mut self) {
|
||||
let (deletion_queue, workers) = DeletionQueue::new(
|
||||
self.storage.clone(),
|
||||
Some(self.mock_control_plane.clone()),
|
||||
self.mock_control_plane.clone(),
|
||||
self.harness.conf,
|
||||
);
|
||||
|
||||
@@ -821,11 +821,8 @@ mod test {
|
||||
|
||||
let mock_control_plane = MockStorageController::new();
|
||||
|
||||
let (deletion_queue, worker) = DeletionQueue::new(
|
||||
storage.clone(),
|
||||
Some(mock_control_plane.clone()),
|
||||
harness.conf,
|
||||
);
|
||||
let (deletion_queue, worker) =
|
||||
DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf);
|
||||
|
||||
let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ where
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
|
||||
// Client for calling into control plane API for validation of deletes
|
||||
controller_upcall_client: Option<C>,
|
||||
controller_upcall_client: C,
|
||||
|
||||
// DeletionLists which are waiting generation validation. Not safe to
|
||||
// execute until [`validate`] has processed them.
|
||||
@@ -86,7 +86,7 @@ where
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
controller_upcall_client: Option<C>,
|
||||
controller_upcall_client: C,
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
@@ -137,20 +137,16 @@ where
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
|
||||
match controller_upcall_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
// The only way a validation call returns an error is when the cancellation token fires
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
let tenants_valid = match self
|
||||
.controller_upcall_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
// The only way a validation call returns an error is when the cancellation token fires
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
} else {
|
||||
// Control plane API disabled. In legacy mode we consider everything valid.
|
||||
tenant_generations.keys().map(|k| (*k, true)).collect()
|
||||
};
|
||||
|
||||
let mut validated_sequence: Option<u64> = None;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user