mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
Compare commits
48 Commits
lfc_perfor
...
tristan957
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a3c5981106 | ||
|
|
0e0ad073bf | ||
|
|
6827f2f58c | ||
|
|
c82e363ed9 | ||
|
|
50dc2fae77 | ||
|
|
62ac5b94b3 | ||
|
|
f0e7b3e0ef | ||
|
|
c6ff18affc | ||
|
|
16ca74a3f4 | ||
|
|
cb67f9a651 | ||
|
|
baf425a2cd | ||
|
|
0b243242df | ||
|
|
6131d86ec9 | ||
|
|
4b9087651c | ||
|
|
79699aebc8 | ||
|
|
22290eb7ba | ||
|
|
bbc35e10b8 | ||
|
|
ae2c3ac12f | ||
|
|
16d594b7b3 | ||
|
|
f999632327 | ||
|
|
5bd850d15a | ||
|
|
1b789e8d7c | ||
|
|
bec7427d9e | ||
|
|
e2db76b9be | ||
|
|
6b4b8e0d8b | ||
|
|
1d68577fbd | ||
|
|
60f63c076f | ||
|
|
8da4ec9740 | ||
|
|
b48404952d | ||
|
|
1d06172d59 | ||
|
|
a08c1a23eb | ||
|
|
a2adc7dbd3 | ||
|
|
768a580373 | ||
|
|
09247de8d5 | ||
|
|
0b35929211 | ||
|
|
b3db7f66ac | ||
|
|
498d852bde | ||
|
|
7f8b1d79c0 | ||
|
|
d15f2ff57a | ||
|
|
3593356c10 | ||
|
|
9e8ab2ab4f | ||
|
|
c1ff7db187 | ||
|
|
6d6b83e737 | ||
|
|
0482690534 | ||
|
|
a750026c2e | ||
|
|
998d2c2ce9 | ||
|
|
b1fa68f659 | ||
|
|
84bc3380cc |
5
.github/actionlint.yml
vendored
5
.github/actionlint.yml
vendored
@@ -33,9 +33,14 @@ config-variables:
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_CICD_CHANNEL_ID
|
||||
- SLACK_COMPUTE_CHANNEL_ID
|
||||
- SLACK_ON_CALL_DEVPROD_STREAM
|
||||
- SLACK_ON_CALL_QA_STAGING_STREAM
|
||||
- SLACK_ON_CALL_STORAGE_STAGING_STREAM
|
||||
- SLACK_ONCALL_COMPUTE_GROUP
|
||||
- SLACK_ONCALL_PROXY_GROUP
|
||||
- SLACK_ONCALL_STORAGE_GROUP
|
||||
- SLACK_PROXY_CHANNEL_ID
|
||||
- SLACK_RUST_CHANNEL_ID
|
||||
- SLACK_STORAGE_CHANNEL_ID
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
2
.github/scripts/lint-release-pr.sh
vendored
2
.github/scripts/lint-release-pr.sh
vendored
@@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}"
|
||||
LAST_COMMIT=$(git rev-parse HEAD)
|
||||
|
||||
MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}")
|
||||
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$"
|
||||
EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$"
|
||||
|
||||
if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
|
||||
report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD HH:MM UTC'
|
||||
|
||||
103
.github/workflows/_create-release-pr.yml
vendored
103
.github/workflows/_create-release-pr.yml
vendored
@@ -1,103 +0,0 @@
|
||||
name: Create Release PR
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
component-name:
|
||||
description: 'Component name'
|
||||
required: true
|
||||
type: string
|
||||
source-branch:
|
||||
description: 'Source branch'
|
||||
required: true
|
||||
type: string
|
||||
secrets:
|
||||
ci-access-token:
|
||||
description: 'CI access token'
|
||||
required: true
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
create-release-branch:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
ref: ${{ inputs.source-branch }}
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set variables
|
||||
id: vars
|
||||
env:
|
||||
COMPONENT_NAME: ${{ inputs.component-name }}
|
||||
RELEASE_BRANCH: >-
|
||||
${{
|
||||
false
|
||||
|| inputs.component-name == 'Storage' && 'release'
|
||||
|| inputs.component-name == 'Proxy' && 'release-proxy'
|
||||
|| inputs.component-name == 'Compute' && 'release-compute'
|
||||
}}
|
||||
run: |
|
||||
now_date=$(date -u +'%Y-%m-%d')
|
||||
now_time=$(date -u +'%H-%M-%Z')
|
||||
{
|
||||
echo "title=${COMPONENT_NAME} release ${now_date}"
|
||||
echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
|
||||
echo "release-branch=${RELEASE_BRANCH}"
|
||||
} | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
- name: Configure git
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
- name: Create RC branch
|
||||
env:
|
||||
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
|
||||
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
|
||||
TITLE: ${{ steps.vars.outputs.title }}
|
||||
run: |
|
||||
git switch -c "${RC_BRANCH}"
|
||||
|
||||
# Manually create a merge commit on the current branch, keeping the
|
||||
# tree and setting the parents to the current HEAD and the HEAD of the
|
||||
# release branch. This commit is what we'll fast-forward the release
|
||||
# branch to when merging the release branch.
|
||||
# For details on why, look at
|
||||
# https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs
|
||||
current_tree=$(git rev-parse 'HEAD^{tree}')
|
||||
release_head=$(git rev-parse "origin/${RELEASE_BRANCH}")
|
||||
current_head=$(git rev-parse HEAD)
|
||||
merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}")
|
||||
|
||||
# Fast-forward the current branch to the newly created merge_commit
|
||||
git merge --ff-only ${merge_commit}
|
||||
|
||||
git push origin "${RC_BRANCH}"
|
||||
|
||||
- name: Create a PR into ${{ steps.vars.outputs.release-branch }}
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.ci-access-token }}
|
||||
RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
|
||||
RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
|
||||
TITLE: ${{ steps.vars.outputs.title }}
|
||||
run: |
|
||||
gh pr create --title "${TITLE}" \
|
||||
--body "" \
|
||||
--head "${RC_BRANCH}" \
|
||||
--base "${RELEASE_BRANCH}"
|
||||
71
.github/workflows/benchmarking.yml
vendored
71
.github/workflows/benchmarking.yml
vendored
@@ -53,6 +53,77 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --init
|
||||
env:
|
||||
ORG_ID: org-solitary-dew-09443886
|
||||
LIMIT: 100
|
||||
SEARCH: "GITHUB_RUN_ID="
|
||||
BASE_URL: https://console-stage.neon.build/api/v2
|
||||
DRY_RUN: "false" # Set to "true" to just test out the workflow
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Cleanup inactive Neon projects left over from prior runs
|
||||
env:
|
||||
API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
NOW=$(date -u +%s)
|
||||
DAYS_AGO=$((NOW - 5 * 86400))
|
||||
|
||||
REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID"
|
||||
|
||||
echo "Requesting project list from:"
|
||||
echo "$REQUEST_URL"
|
||||
|
||||
response=$(curl -s -X GET "$REQUEST_URL" \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}" )
|
||||
|
||||
echo "Response:"
|
||||
echo "$response" | jq .
|
||||
|
||||
projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" '
|
||||
.projects[]
|
||||
| select(.compute_last_active_at != null)
|
||||
| select((.compute_last_active_at | fromdateiso8601) < $cutoff)
|
||||
| {id, name, compute_last_active_at}
|
||||
')
|
||||
|
||||
if [ -z "$projects_to_delete" ]; then
|
||||
echo "No projects eligible for deletion."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Projects that will be deleted:"
|
||||
echo "$projects_to_delete" | jq -r '.id'
|
||||
|
||||
if [ "$DRY_RUN" = "false" ]; then
|
||||
echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do
|
||||
echo "Deleting project: $project_id"
|
||||
curl -s -X DELETE "$BASE_URL/projects/$project_id" \
|
||||
--header "Accept: application/json" \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer ${API_KEY}"
|
||||
done
|
||||
else
|
||||
echo "Dry run enabled — no projects were deleted."
|
||||
fi
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
|
||||
41
.github/workflows/build_and_test.yml
vendored
41
.github/workflows/build_and_test.yml
vendored
@@ -69,7 +69,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Check for file changes
|
||||
uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
|
||||
uses: step-security/paths-filter@v3
|
||||
id: files-changed
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@@ -824,7 +824,7 @@ jobs:
|
||||
- pg: v17
|
||||
debian: bookworm
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.42.2
|
||||
VM_BUILDER_VERSION: v0.46.0
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
@@ -1434,10 +1434,10 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
notify-storage-release-deploy-failure:
|
||||
needs: [ deploy ]
|
||||
notify-release-deploy-failure:
|
||||
needs: [ meta, deploy ]
|
||||
# We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs.
|
||||
if: github.ref_name == 'release' && needs.deploy.result != 'success' && always()
|
||||
if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always()
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
@@ -1445,15 +1445,40 @@ jobs:
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Post release-deploy failure to team-storage slack channel
|
||||
- name: Post release-deploy failure to team slack channel
|
||||
uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
|
||||
env:
|
||||
TEAM_ONCALL: >-
|
||||
${{
|
||||
fromJSON(format('{
|
||||
"storage-release": "<!subteam^{0}|@oncall-storage>",
|
||||
"compute-release": "<!subteam^{1}|@oncall-compute>",
|
||||
"proxy-release": "<!subteam^{2}|@oncall-proxy>"
|
||||
}',
|
||||
vars.SLACK_ONCALL_STORAGE_GROUP,
|
||||
vars.SLACK_ONCALL_COMPUTE_GROUP,
|
||||
vars.SLACK_ONCALL_PROXY_GROUP
|
||||
))[needs.meta.outputs.run-kind]
|
||||
}}
|
||||
CHANNEL: >-
|
||||
${{
|
||||
fromJSON(format('{
|
||||
"storage-release": "{0}",
|
||||
"compute-release": "{1}",
|
||||
"proxy-release": "{2}"
|
||||
}',
|
||||
vars.SLACK_STORAGE_CHANNEL_ID,
|
||||
vars.SLACK_COMPUTE_CHANNEL_ID,
|
||||
vars.SLACK_PROXY_CHANNEL_ID
|
||||
))[needs.meta.outputs.run-kind]
|
||||
}}
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
|
||||
channel: ${{ env.CHANNEL }}
|
||||
text: |
|
||||
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
|
||||
2
.github/workflows/cloud-extensions.yml
vendored
2
.github/workflows/cloud-extensions.yml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ inputs.region_id }}
|
||||
region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ matrix.pg-version }}
|
||||
project_settings: ${{ steps.project-settings.outputs.settings }}
|
||||
# We need these settings to get the expected output results.
|
||||
|
||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Check for Postgres changes
|
||||
uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3
|
||||
uses: step-security/paths-filter@v3
|
||||
id: files_changed
|
||||
with:
|
||||
token: ${{ github.token }}
|
||||
|
||||
12
.github/workflows/release-compute.yml
vendored
Normal file
12
.github/workflows/release-compute.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Create compute release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 7 * * FRI'
|
||||
|
||||
jobs:
|
||||
create-release-pr:
|
||||
uses: ./.github/workflows/release.yml
|
||||
with:
|
||||
component: compute
|
||||
secrets: inherit
|
||||
12
.github/workflows/release-proxy.yml
vendored
Normal file
12
.github/workflows/release-proxy.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Create proxy release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * TUE'
|
||||
|
||||
jobs:
|
||||
create-release-pr:
|
||||
uses: ./.github/workflows/release.yml
|
||||
with:
|
||||
component: proxy
|
||||
secrets: inherit
|
||||
12
.github/workflows/release-storage.yml
vendored
Normal file
12
.github/workflows/release-storage.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
name: Create storage release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * FRI'
|
||||
|
||||
jobs:
|
||||
create-release-pr:
|
||||
uses: ./.github/workflows/release.yml
|
||||
with:
|
||||
component: storage
|
||||
secrets: inherit
|
||||
93
.github/workflows/release.yml
vendored
93
.github/workflows/release.yml
vendored
@@ -1,25 +1,34 @@
|
||||
name: Create Release Branch
|
||||
name: Create release PR
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * TUE' # Proxy release
|
||||
- cron: '0 6 * * FRI' # Storage release
|
||||
- cron: '0 7 * * FRI' # Compute release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create-storage-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Storage release PR'
|
||||
component:
|
||||
description: "Component to release"
|
||||
required: true
|
||||
type: choice
|
||||
options:
|
||||
- compute
|
||||
- proxy
|
||||
- storage
|
||||
cherry-pick:
|
||||
description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)"
|
||||
required: false
|
||||
create-proxy-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Proxy release PR'
|
||||
required: false
|
||||
create-compute-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Compute release PR'
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
workflow_call:
|
||||
inputs:
|
||||
component:
|
||||
description: "Component to release"
|
||||
required: true
|
||||
type: string
|
||||
cherry-pick:
|
||||
description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)"
|
||||
required: false
|
||||
type: string
|
||||
default: ''
|
||||
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
@@ -29,41 +38,31 @@ defaults:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
jobs:
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}
|
||||
create-release-pr:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
uses: ./.github/workflows/_create-release-pr.yml
|
||||
with:
|
||||
component-name: 'Storage'
|
||||
source-branch: ${{ github.ref_name }}
|
||||
secrets:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }}
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
- name: Configure git
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
uses: ./.github/workflows/_create-release-pr.yml
|
||||
with:
|
||||
component-name: 'Proxy'
|
||||
source-branch: ${{ github.ref_name }}
|
||||
secrets:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
create-compute-release-branch:
|
||||
if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }}
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
uses: ./.github/workflows/_create-release-pr.yml
|
||||
with:
|
||||
component-name: 'Compute'
|
||||
source-branch: ${{ github.ref_name }}
|
||||
secrets:
|
||||
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
- name: Create release PR
|
||||
uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677
|
||||
with:
|
||||
component: ${{ inputs.component }}
|
||||
cherry-pick: ${{ inputs.cherry-pick }}
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
1
Makefile
1
Makefile
@@ -239,6 +239,7 @@ walproposer-lib: neon-pg-ext-v17
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpq.so $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
|
||||
pg_strong_random.o
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
|
||||
|
||||
@@ -1085,6 +1085,23 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions pgrx14"
|
||||
#
|
||||
# Version 14 (cargo-pgrx 0.14.x) is now required by a few extensions.
|
||||
# This layer should be used as a base for new pgrx extensions,
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "pg-onnx-build" and "pgrag-build"
|
||||
@@ -1100,11 +1117,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
echo "#nothing to test here" > neon-test.sh
|
||||
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
|
||||
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \
|
||||
echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pgrag-build
|
||||
FROM rust-extensions-build-pgrx14 AS pgrag-build
|
||||
COPY --from=pgrag-src /ext-src/ /ext-src/
|
||||
|
||||
# Install build-time dependencies
|
||||
@@ -1124,19 +1141,19 @@ RUN . venv/bin/activate && \
|
||||
|
||||
WORKDIR /ext-src/pgrag-src
|
||||
RUN cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control
|
||||
|
||||
RUN cd exts/rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control
|
||||
|
||||
RUN cd exts/rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
@@ -1319,6 +1336,39 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src/pg_session_jwt-src
|
||||
RUN cargo pgrx install --release
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-anon-pg-build"
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg_anon-src
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
WORKDIR /ext-src
|
||||
COPY compute/patches/anon_v2.patch .
|
||||
|
||||
# This is an experimental extension that never made it to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
patch -p1 < /ext-src/anon_v2.patch
|
||||
|
||||
FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg_anon-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src
|
||||
RUN cd pg_anon-src && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
|
||||
chmod -R a+r ../pg_anon-src && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control;
|
||||
|
||||
########################################################################################
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "wal2json-build"
|
||||
@@ -1615,6 +1665,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
@@ -23,6 +23,8 @@
|
||||
import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
|
||||
import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
|
||||
import 'sql_exporter/getpage_sync_requests_total.libsonnet',
|
||||
import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet',
|
||||
import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
|
||||
import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'compute_getpage_max_inflight_stuck_time_ms',
|
||||
type: 'gauge',
|
||||
help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for',
|
||||
values: [
|
||||
'compute_getpage_max_inflight_stuck_time_ms',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
metric_name: 'compute_getpage_stuck_requests_total',
|
||||
type: 'counter',
|
||||
help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout',
|
||||
values: [
|
||||
'compute_getpage_stuck_requests_total',
|
||||
],
|
||||
query_ref: 'neon_perf_counters',
|
||||
}
|
||||
@@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
|
||||
getpage_wait_seconds_sum numeric,
|
||||
getpage_prefetch_requests_total numeric,
|
||||
getpage_sync_requests_total numeric,
|
||||
compute_getpage_stuck_requests_total numeric,
|
||||
compute_getpage_max_inflight_stuck_time_ms numeric,
|
||||
getpage_prefetch_misses_total numeric,
|
||||
getpage_prefetch_discards_total numeric,
|
||||
getpage_prefetches_buffered numeric,
|
||||
|
||||
129
compute/patches/anon_v2.patch
Normal file
129
compute/patches/anon_v2.patch
Normal file
@@ -0,0 +1,129 @@
|
||||
diff --git a/sql/anon.sql b/sql/anon.sql
|
||||
index 0cdc769..f6cc950 100644
|
||||
--- a/sql/anon.sql
|
||||
+++ b/sql/anon.sql
|
||||
@@ -1141,3 +1141,8 @@ $$
|
||||
-- TODO : https://en.wikipedia.org/wiki/L-diversity
|
||||
|
||||
-- TODO : https://en.wikipedia.org/wiki/T-closeness
|
||||
+
|
||||
+-- NEON Patches
|
||||
+
|
||||
+GRANT ALL ON SCHEMA anon to neon_superuser;
|
||||
+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
|
||||
diff --git a/sql/init.sql b/sql/init.sql
|
||||
index 7da6553..9b6164b 100644
|
||||
--- a/sql/init.sql
|
||||
+++ b/sql/init.sql
|
||||
@@ -74,50 +74,49 @@ $$
|
||||
|
||||
SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED';
|
||||
|
||||
--- load fake data from a given path
|
||||
-CREATE OR REPLACE FUNCTION anon.init(
|
||||
- datapath TEXT
|
||||
-)
|
||||
+CREATE OR REPLACE FUNCTION anon.load_fake_data()
|
||||
RETURNS BOOLEAN
|
||||
AS $$
|
||||
DECLARE
|
||||
- datapath_check TEXT;
|
||||
success BOOLEAN;
|
||||
+ sharedir TEXT;
|
||||
+ datapath TEXT;
|
||||
BEGIN
|
||||
|
||||
- IF anon.is_initialized() THEN
|
||||
- RAISE NOTICE 'The anon extension is already initialized.';
|
||||
- RETURN TRUE;
|
||||
- END IF;
|
||||
+ datapath := '/extension/anon/';
|
||||
+ -- find the local extension directory
|
||||
+ SELECT setting INTO sharedir
|
||||
+ FROM pg_catalog.pg_config
|
||||
+ WHERE name = 'SHAREDIR';
|
||||
|
||||
SELECT bool_or(results) INTO success
|
||||
FROM unnest(array[
|
||||
- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'),
|
||||
- anon.load_csv('anon.identifier',datapath ||'/identifier.csv'),
|
||||
- anon.load_csv('anon.address',datapath ||'/address.csv'),
|
||||
- anon.load_csv('anon.city',datapath ||'/city.csv'),
|
||||
- anon.load_csv('anon.company',datapath ||'/company.csv'),
|
||||
- anon.load_csv('anon.country',datapath ||'/country.csv'),
|
||||
- anon.load_csv('anon.email', datapath ||'/email.csv'),
|
||||
- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'),
|
||||
- anon.load_csv('anon.iban',datapath ||'/iban.csv'),
|
||||
- anon.load_csv('anon.last_name',datapath ||'/last_name.csv'),
|
||||
- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'),
|
||||
- anon.load_csv('anon.siret',datapath ||'/siret.csv'),
|
||||
- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv')
|
||||
+ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'),
|
||||
+ anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'),
|
||||
+ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'),
|
||||
+ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'),
|
||||
+ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'),
|
||||
+ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'),
|
||||
+ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'),
|
||||
+ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'),
|
||||
+ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'),
|
||||
+ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'),
|
||||
+ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'),
|
||||
+ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'),
|
||||
+ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv')
|
||||
]) results;
|
||||
RETURN success;
|
||||
-
|
||||
END;
|
||||
$$
|
||||
- LANGUAGE PLPGSQL
|
||||
+ LANGUAGE plpgsql
|
||||
VOLATILE
|
||||
RETURNS NULL ON NULL INPUT
|
||||
- PARALLEL UNSAFE -- because load_csv is unsafe
|
||||
- SECURITY INVOKER
|
||||
+ PARALLEL UNSAFE -- because of the EXCEPTION
|
||||
+ SECURITY DEFINER
|
||||
SET search_path=''
|
||||
;
|
||||
-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED';
|
||||
+
|
||||
+SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED';
|
||||
|
||||
-- People tend to forget the anon.init() step
|
||||
-- This is a friendly notice for them
|
||||
@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED';
|
||||
CREATE OR REPLACE FUNCTION anon.load(TEXT)
|
||||
RETURNS BOOLEAN AS
|
||||
$$
|
||||
- SELECT anon.init($1);
|
||||
+ SELECT anon.init();
|
||||
$$
|
||||
LANGUAGE SQL
|
||||
VOLATILE
|
||||
@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED';
|
||||
CREATE OR REPLACE FUNCTION anon.init()
|
||||
RETURNS BOOLEAN
|
||||
AS $$
|
||||
- WITH conf AS (
|
||||
- -- find the local extension directory
|
||||
- SELECT setting AS sharedir
|
||||
- FROM pg_catalog.pg_config
|
||||
- WHERE name = 'SHAREDIR'
|
||||
- )
|
||||
- SELECT anon.init(conf.sharedir || '/extension/anon/')
|
||||
- FROM conf;
|
||||
+BEGIN
|
||||
+ IF anon.is_initialized() THEN
|
||||
+ RAISE NOTICE 'The anon extension is already initialized.';
|
||||
+ RETURN TRUE;
|
||||
+ END IF;
|
||||
+
|
||||
+ RETURN anon.load_fake_data();
|
||||
+END;
|
||||
$$
|
||||
- LANGUAGE SQL
|
||||
+ LANGUAGE plpgsql
|
||||
VOLATILE
|
||||
PARALLEL UNSAFE -- because init is unsafe
|
||||
SECURITY INVOKER
|
||||
@@ -22,7 +22,7 @@ commands:
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -22,7 +22,7 @@ commands:
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -218,7 +218,9 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.safekeepers")
|
||||
.find("neon.safekeeper_connstrings")
|
||||
// TODO(tristan957): Remove the compatibility code here.
|
||||
.or(spec.cluster.settings.find("neon.safekeepers"))
|
||||
.ok_or("safekeeper connstrings should be provided")?
|
||||
.split(',')
|
||||
.map(|str| str.to_string())
|
||||
|
||||
@@ -75,7 +75,7 @@ pub fn write_postgres_conf(
|
||||
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
|
||||
writeln!(
|
||||
file,
|
||||
"neon.safekeepers={}",
|
||||
"neon.safekeeper_connstrings={}",
|
||||
escape_conf_value(&neon_safekeepers_value)
|
||||
)?;
|
||||
}
|
||||
|
||||
@@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
experimental,
|
||||
};
|
||||
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || {
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
let _enter = span.enter();
|
||||
monitor.run();
|
||||
})
|
||||
|
||||
@@ -30,7 +30,7 @@ mod pg_helpers_tests {
|
||||
r#"fsync = off
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
neon.safekeeper_connstrings = 'host=127.0.0.1 port=6502,host=127.0.0.1 port=6503,host=127.0.0.1 port=6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
shared_buffers = 32768
|
||||
|
||||
@@ -632,7 +632,7 @@ struct EndpointStartCmdArgs {
|
||||
|
||||
#[clap(
|
||||
long,
|
||||
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
|
||||
help = "Safekeepers membership generation to prefix neon.safekeeper_connstrings with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
|
||||
)]
|
||||
safekeepers_generation: Option<u32>,
|
||||
#[clap(
|
||||
|
||||
@@ -454,10 +454,10 @@ impl Endpoint {
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.get_compute_port()))
|
||||
.map(|sk| format!("host=localhost port={}", sk.get_compute_port()))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("neon.safekeepers", &safekeepers);
|
||||
conf.append("neon.safekeeper_connstrings", &safekeepers);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
@@ -623,7 +623,8 @@ impl Endpoint {
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
safekeeper_connstrings
|
||||
.push(format!("host=127.0.0.1 port={}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
Ok(safekeeper_connstrings)
|
||||
|
||||
@@ -112,7 +112,7 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
/// Initializes a safekeeper node by creating all necessary files,
|
||||
/// e.g. SSL certificates.
|
||||
/// e.g. SSL certificates and JWT token file.
|
||||
pub fn initialize(&self) -> anyhow::Result<()> {
|
||||
if self.env.generate_local_ssl_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
@@ -120,6 +120,17 @@ impl SafekeeperNode {
|
||||
&self.datadir_path().join("server.key"),
|
||||
)?;
|
||||
}
|
||||
|
||||
// Generate a token file for authentication with other safekeepers
|
||||
if self.conf.auth_enabled {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
|
||||
let token_path = self.datadir_path().join("peer_jwt_token");
|
||||
std::fs::write(token_path, token)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -218,14 +229,26 @@ impl SafekeeperNode {
|
||||
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
|
||||
}
|
||||
|
||||
if self.conf.auth_enabled {
|
||||
let token_path = self.datadir_path().join("peer_jwt_token");
|
||||
let token_path_str = token_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Token path {token_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned();
|
||||
args.extend(["--auth-token-path".to_owned(), token_path_str]);
|
||||
}
|
||||
|
||||
args.extend_from_slice(extra_opts);
|
||||
|
||||
let env_variables = Vec::new();
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
self.safekeeper_env_variables()?,
|
||||
env_variables,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
@@ -239,18 +262,6 @@ impl SafekeeperNode {
|
||||
.await
|
||||
}
|
||||
|
||||
fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
||||
// Generate a token to connect from safekeeper to peers
|
||||
if self.conf.auth_enabled {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
|
||||
@@ -104,6 +104,11 @@
|
||||
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeeper_connstrings",
|
||||
"value": "host=safekeeper1 port=5454,host=safekeeper2 port=5454,host=safekeeper3 port=5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "TIMELINE_ID",
|
||||
|
||||
@@ -3,3 +3,5 @@ pg_distrib_dir='/usr/local/'
|
||||
listen_pg_addr='0.0.0.0:6400'
|
||||
listen_http_addr='0.0.0.0:9898'
|
||||
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
||||
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
|
||||
control_plane_emergency_mode=true
|
||||
|
||||
@@ -24,7 +24,7 @@ because configs may be parsed and dumped into logs.
|
||||
#### Tokens generation and validation
|
||||
JWT tokens are signed using a private key.
|
||||
Compute/pageserver/safekeeper use the private key's public counterpart to validate JWT tokens.
|
||||
These components should not have access to the private key and may only get tokens from their configuration or external clients.
|
||||
These components should not have access to the private key and may only get tokens from their configuration or external clients.
|
||||
|
||||
The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
|
||||
There is currently no way to rotate the key without bringing down all components.
|
||||
@@ -117,8 +117,8 @@ pageserver uses JWT tokens for authentication, so the password is really a
|
||||
token.)
|
||||
|
||||
Compute connects to Safekeepers to write and commit data. The list of safekeeper
|
||||
addresses is given in the `neon.safekeepers` GUC. The connections to the
|
||||
safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
|
||||
addresses is given in the `neon.safekeeper_connstrings` GUC. The connections to
|
||||
the safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
|
||||
variable, if set.
|
||||
|
||||
The `compute_ctl` binary that runs before the PostgreSQL server, and launches
|
||||
|
||||
@@ -38,11 +38,6 @@ Currently, the following metrics are collected:
|
||||
Amount of WAL produced by a timeline, i.e. last_record_lsn
|
||||
This is an absolute, per-timeline metric.
|
||||
|
||||
- `resident_size`
|
||||
|
||||
Size of all the layer files in the tenant's directory on disk on the pageserver.
|
||||
This is an absolute, per-tenant metric.
|
||||
|
||||
- `remote_storage_size`
|
||||
|
||||
Size of the remote storage (S3) directory.
|
||||
|
||||
@@ -269,11 +269,13 @@ calls should be retried until they succeed.
|
||||
|
||||
When compute receives safekeepers list from control plane it needs to know the
|
||||
generation to check whether it should be updated (note that compute may get
|
||||
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
|
||||
GUC is just a comma separates list of `host:port`. Let's prefix it with
|
||||
`g#<generation>:` to this end, so it will look like
|
||||
safekeeper list from either cplane or safekeepers). Currently
|
||||
`neon.safekeeper_connstrings` GUC is just a comma-separated list of Postgres
|
||||
connection strings. Let's prefix it with `g#<generation>:` to this end, so it
|
||||
will look like:
|
||||
|
||||
```
|
||||
g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
|
||||
g#42:host=safekeeper-0.eu-central-1.aws.neon.tech port=6401,host=safekeeper-2.eu-central-1.aws.neon.tech port=6401,host=safekeeper-1.eu-central-1.aws.neon.tech port=6401
|
||||
```
|
||||
|
||||
To summarize, list of cplane changes:
|
||||
@@ -281,7 +283,7 @@ To summarize, list of cplane changes:
|
||||
- `/notify-safekeepers` endpoint.
|
||||
- Branch creation call may return list of safekeepers and when it is
|
||||
present cplane should adopt it instead of choosing on its own like it does currently.
|
||||
- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
|
||||
- `neon.safekeeper_connstrings` GUC should be prefixed with `g#<generation>:`.
|
||||
|
||||
### storage_controller implementation
|
||||
|
||||
@@ -455,16 +457,16 @@ So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops`
|
||||
joined with timeline configuration to get current conf (with generation `n`)
|
||||
for the safekeeper and does the jobs, infinitely retrying failures:
|
||||
1) If node is member (`include`):
|
||||
- Check if timeline exists on it, if not, call pull_timeline on it from
|
||||
- Check if timeline exists on it, if not, call pull_timeline on it from
|
||||
other members
|
||||
- Call switch configuration to the current
|
||||
2) If node is not member (`exclude`):
|
||||
- Call switch configuration to the current, 404 is ok.
|
||||
3) If timeline is deleted (`delete`), call delete.
|
||||
|
||||
In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
|
||||
In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
|
||||
timeline with generation <= `n` if `op_type` is not `delete`.
|
||||
In case 3 also remove `safekeeper_timeline_pending_ops`
|
||||
In case 3 also remove `safekeeper_timeline_pending_ops`
|
||||
entry + remove `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline.
|
||||
|
||||
Let's consider in details how APIs can be implemented from this angle.
|
||||
@@ -483,7 +485,7 @@ corruption. The following sequence works:
|
||||
changes once ingestion starts, insert must not overwrite it (as well as other
|
||||
fields like membership conf). On the contrary, start_lsn used in the next
|
||||
step must be set to the value in the db. cplane_notified_generation can be set
|
||||
to 1 (initial generation) in insert to avoid notifying cplane about initial
|
||||
to 1 (initial generation) in insert to avoid notifying cplane about initial
|
||||
conf as cplane will receive it in timeline creation request anyway.
|
||||
3) Issue timeline creation calls to at least majority of safekeepers. Using
|
||||
majority here is not necessary but handy because it guarantees that any live
|
||||
@@ -492,15 +494,15 @@ corruption. The following sequence works:
|
||||
create timeline special init case. Of course, if the timeline already exists the call is
|
||||
ignored.
|
||||
4) For minority of safekeepers which could have missed creation insert
|
||||
entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion
|
||||
because response to cplane is sent only after it has happened, and cplane
|
||||
entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion
|
||||
because response to cplane is sent only after it has happened, and cplane
|
||||
retries the call until 200 response.
|
||||
|
||||
There is a small question how request handler (timeline creation in this
|
||||
case) would interact with per sk reconciler. As always I prefer to do the
|
||||
simplest possible thing and here it seems to be just waking it up so it
|
||||
re-reads the db for work to do. Passing work in memory is faster, but
|
||||
that shouldn't matter, and path to scan db for work will exist anyway,
|
||||
that shouldn't matter, and path to scan db for work will exist anyway,
|
||||
simpler to reuse it.
|
||||
|
||||
For pg version / wal segment size: while we may persist them in `timelines`
|
||||
@@ -514,13 +516,13 @@ Timeline migration.
|
||||
as well as deliver this conf to current ones; poke per sk reconcilers to work
|
||||
on it. Also any conf change should also poke cplane notifier task(s).
|
||||
2) Once it becomes possible per alg description above, get out of joint conf
|
||||
with another CAS. Task should get wakeups from per sk reconcilers because
|
||||
with another CAS. Task should get wakeups from per sk reconcilers because
|
||||
conf switch is required for advancement; however retries should be sleep
|
||||
based as well as LSN advancement might be needed, though in happy path
|
||||
based as well as LSN advancement might be needed, though in happy path
|
||||
it isn't. To see whether further transition is possible on wakeup migration
|
||||
executor polls safekeepers per the algorithm. CAS creating new conf with only
|
||||
new members should again insert entries to `safekeeper_timeline_pending_ops`
|
||||
to switch them there, as well as `exclude` rows to remove timeline from
|
||||
to switch them there, as well as `exclude` rows to remove timeline from
|
||||
old members.
|
||||
|
||||
Timeline deletion: just set `deleted_at` on the timeline row and insert
|
||||
@@ -601,7 +603,7 @@ Let's have the following implementation bits for gradual rollout:
|
||||
(and returns them in response to cplane) only when it is set to
|
||||
true.
|
||||
- control_plane [see above](storage_controller-<->-control-plane interface-and-changes)
|
||||
prefixes `neon.safekeepers` GUC with generation number. When it is 0
|
||||
prefixes `neon.safekeeper_connstrings` GUC with generation number. When it is 0
|
||||
(or prefix not present at all), walproposer behaves as currently, committing on
|
||||
the provided safekeeper list -- generations are disabled.
|
||||
If it is non 0 it follows this RFC rules.
|
||||
|
||||
@@ -111,7 +111,7 @@ pub struct ComputeSpec {
|
||||
pub endpoint_id: Option<String>,
|
||||
|
||||
/// Safekeeper membership config generation. It is put in
|
||||
/// neon.safekeepers GUC and serves two purposes:
|
||||
/// neon.safekeeper_connstrings GUC and serves two purposes:
|
||||
/// 1) Non zero value forces walproposer to use membership configurations.
|
||||
/// 2) If walproposer wants to update list of safekeepers to connect to
|
||||
/// taking them from some safekeeper mconf, it should check what value
|
||||
|
||||
@@ -85,8 +85,8 @@
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
|
||||
"name": "neon.safekeeper_connstrings",
|
||||
"value": "host=127.0.0.1 port=6502,host=127.0.0.1 port=6503,host=127.0.0.1 port=6501",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -16,6 +16,7 @@ pub struct Collector {
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
|
||||
// SAFETY: libc::sysconf is safe, it merely returns a value.
|
||||
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
|
||||
if long == -1 {
|
||||
panic!("sysconf(_SC_CLK_TCK) failed");
|
||||
|
||||
@@ -841,6 +841,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
let expected_end = match &end {
|
||||
ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
|
||||
// The timeline doesn't exist and we have been requested to not auto-create it.
|
||||
// Compute requests for timelines that haven't been created yet
|
||||
// might reach us before the storcon request to create those timelines.
|
||||
TimelineNoCreate => true,
|
||||
CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
|
||||
if is_expected_io_error(io_error) =>
|
||||
{
|
||||
@@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd {
|
||||
Terminate,
|
||||
#[error("EOF on COPY stream")]
|
||||
EOF,
|
||||
#[error("timeline not found, and allow_timeline_creation is false")]
|
||||
TimelineNoCreate,
|
||||
/// The connection was lost
|
||||
#[error("connection error: {0}")]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
|
||||
@@ -303,7 +303,8 @@ pub struct PullTimelineRequest {
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct PullTimelineResponse {
|
||||
// Donor safekeeper host
|
||||
pub safekeeper_host: String,
|
||||
/// Donor safekeeper host.
|
||||
/// None if no pull happened because the timeline already exists.
|
||||
pub safekeeper_host: Option<String>,
|
||||
// TODO: add more fields?
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ fn main() -> anyhow::Result<()> {
|
||||
println!("cargo:rustc-link-lib=static=walproposer");
|
||||
println!("cargo:rustc-link-lib=static=pgport");
|
||||
println!("cargo:rustc-link-lib=static=pgcommon");
|
||||
println!("cargo:rustc-link-lib=pq");
|
||||
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
||||
|
||||
// Rebuild crate when libwalproposer.a changes
|
||||
|
||||
@@ -171,8 +171,8 @@ pub enum WaitResult {
|
||||
pub struct Config {
|
||||
/// Tenant and timeline id
|
||||
pub ttid: TenantTimelineId,
|
||||
/// List of safekeepers in format `host:port`
|
||||
pub safekeepers_list: Vec<String>,
|
||||
/// List of safekeeper connection strings
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
/// Safekeeper reconnect timeout in milliseconds
|
||||
pub safekeeper_reconnect_timeout: i32,
|
||||
/// Safekeeper connection timeout in milliseconds
|
||||
@@ -185,7 +185,7 @@ pub struct Config {
|
||||
/// WalProposer main struct. C methods are reexported as Rust functions.
|
||||
pub struct Wrapper {
|
||||
wp: *mut WalProposer,
|
||||
_safekeepers_list_vec: Vec<u8>,
|
||||
_safekeeper_connstrings_vec: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Wrapper {
|
||||
@@ -197,18 +197,19 @@ impl Wrapper {
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
|
||||
let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
|
||||
let mut safekeeper_connstrings_vec = CString::new(config.safekeeper_connstrings.join(","))
|
||||
.unwrap()
|
||||
.into_bytes_with_nul();
|
||||
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
|
||||
assert!(safekeeper_connstrings_vec.len() == safekeeper_connstrings_vec.capacity());
|
||||
let safekeeper_connstrings =
|
||||
safekeeper_connstrings_vec.as_mut_ptr() as *mut std::ffi::c_char;
|
||||
|
||||
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
||||
|
||||
let c_config = WalProposerConfig {
|
||||
neon_tenant,
|
||||
neon_timeline,
|
||||
safekeepers_list,
|
||||
safekeeper_connstrings,
|
||||
safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
|
||||
safekeeper_connection_timeout: config.safekeeper_connection_timeout,
|
||||
wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
|
||||
@@ -224,7 +225,7 @@ impl Wrapper {
|
||||
let wp = unsafe { WalProposerCreate(c_config, api) };
|
||||
Wrapper {
|
||||
wp,
|
||||
_safekeepers_list_vec: safekeepers_list_vec,
|
||||
_safekeeper_connstrings_vec: safekeeper_connstrings_vec,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -575,7 +576,7 @@ mod tests {
|
||||
});
|
||||
let config = crate::walproposer::Config {
|
||||
ttid,
|
||||
safekeepers_list: vec!["localhost:5000".to_string()],
|
||||
safekeeper_connstrings: vec!["host=localhost port=5000".to_string()],
|
||||
safekeeper_reconnect_timeout: 1000,
|
||||
safekeeper_connection_timeout: 10000,
|
||||
sync_safekeepers: true,
|
||||
|
||||
@@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::virtual_file::api::IoMode;
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver_api::key::Key;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
@@ -27,6 +28,7 @@ pub(crate) enum LayerCmd {
|
||||
path: PathBuf,
|
||||
tenant: String,
|
||||
timeline: String,
|
||||
key: Option<Key>,
|
||||
},
|
||||
/// Dump all information of a layer file
|
||||
DumpLayer {
|
||||
@@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
path,
|
||||
tenant,
|
||||
timeline,
|
||||
key,
|
||||
} => {
|
||||
let timeline_path = path
|
||||
.join(TENANTS_SEGMENT_NAME)
|
||||
@@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
.join(TIMELINES_SEGMENT_NAME)
|
||||
.join(timeline);
|
||||
let mut idx = 0;
|
||||
let mut to_print = Vec::default();
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
let layer = layer?;
|
||||
if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
if let Some(key) = key {
|
||||
if layer_file.key_range.start <= *key && *key < layer_file.key_range.end {
|
||||
to_print.push((idx, layer_file));
|
||||
}
|
||||
} else {
|
||||
to_print.push((idx, layer_file));
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if key.is_some() {
|
||||
to_print
|
||||
.sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end));
|
||||
}
|
||||
|
||||
for (idx, layer_file) in to_print {
|
||||
println!(
|
||||
"[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
|
||||
idx,
|
||||
layer_file.key_range.start,
|
||||
layer_file.key_range.end,
|
||||
layer_file.lsn_range.start,
|
||||
layer_file.lsn_range.end,
|
||||
layer_file.is_delta,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::DumpLayer {
|
||||
|
||||
@@ -504,7 +504,7 @@ fn start_pageserver(
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
remote_storage.clone(),
|
||||
StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?,
|
||||
StorageControllerUpcallClient::new(conf, &shutdown_pageserver),
|
||||
conf,
|
||||
);
|
||||
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
|
||||
|
||||
@@ -150,7 +150,7 @@ pub struct PageServerConf {
|
||||
/// not terrible.
|
||||
pub background_task_maximum_delay: Duration,
|
||||
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_api: Url,
|
||||
|
||||
/// JWT token for use with the control plane API.
|
||||
pub control_plane_api_token: Option<SecretString>,
|
||||
@@ -438,7 +438,8 @@ impl PageServerConf {
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
control_plane_api: control_plane_api
|
||||
.ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?,
|
||||
control_plane_emergency_mode,
|
||||
heatmap_upload_concurrency,
|
||||
secondary_download_concurrency,
|
||||
@@ -573,6 +574,7 @@ impl PageServerConf {
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
load_previous_heatmap: Some(true),
|
||||
generate_unarchival_heatmap: Some(true),
|
||||
control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
|
||||
..Default::default()
|
||||
};
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
@@ -641,9 +643,12 @@ mod tests {
|
||||
use super::PageServerConf;
|
||||
|
||||
#[test]
|
||||
fn test_empty_config_toml_is_valid() {
|
||||
// we use Default impl of everything in this situation
|
||||
fn test_minimal_config_toml_is_valid() {
|
||||
// The minimal valid config for running a pageserver:
|
||||
// - control_plane_api is mandatory, as pageservers cannot run in isolation
|
||||
// - we use Default impl of everything else in this situation
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("empty config is valid");
|
||||
|
||||
@@ -30,9 +30,6 @@ pub(super) enum Name {
|
||||
/// Tenant remote size
|
||||
#[serde(rename = "remote_storage_size")]
|
||||
RemoteSize,
|
||||
/// Tenant resident size
|
||||
#[serde(rename = "resident_size")]
|
||||
ResidentSize,
|
||||
/// Tenant synthetic size
|
||||
#[serde(rename = "synthetic_storage_size")]
|
||||
SyntheticSize,
|
||||
@@ -187,18 +184,6 @@ impl MetricsKey {
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
|
||||
///
|
||||
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
|
||||
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: Name::ResidentSize,
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
|
||||
@@ -261,10 +246,7 @@ where
|
||||
let mut tenants = std::pin::pin!(tenants);
|
||||
|
||||
while let Some((tenant_id, tenant)) = tenants.next().await {
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
let timelines = tenant.list_timelines();
|
||||
let timelines_len = timelines.len();
|
||||
for timeline in timelines {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
@@ -287,16 +269,9 @@ where
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
tenant_resident_size += timeline.resident_physical_size();
|
||||
}
|
||||
|
||||
if timelines_len == 0 {
|
||||
// Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
|
||||
tenant_resident_size = 1;
|
||||
}
|
||||
|
||||
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
|
||||
let snap = TenantSnapshot::collect(&tenant);
|
||||
snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
|
||||
}
|
||||
|
||||
@@ -305,19 +280,14 @@ where
|
||||
|
||||
/// In-between abstraction to allow testing metrics without actual Tenants.
|
||||
struct TenantSnapshot {
|
||||
resident_size: u64,
|
||||
remote_size: u64,
|
||||
synthetic_size: u64,
|
||||
}
|
||||
|
||||
impl TenantSnapshot {
|
||||
/// Collect tenant status to have metrics created out of it.
|
||||
///
|
||||
/// `resident_size` is calculated of the timelines we had access to for other metrics, so we
|
||||
/// cannot just list timelines here.
|
||||
fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
|
||||
fn collect(t: &Arc<crate::tenant::TenantShard>) -> Self {
|
||||
TenantSnapshot {
|
||||
resident_size,
|
||||
remote_size: t.remote_size(),
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
@@ -334,8 +304,6 @@ impl TenantSnapshot {
|
||||
) {
|
||||
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
|
||||
|
||||
let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
|
||||
|
||||
let synthetic_size = {
|
||||
let factory = MetricsKey::synthetic_size(tenant_id);
|
||||
let mut synthetic_size = self.synthetic_size;
|
||||
@@ -355,11 +323,7 @@ impl TenantSnapshot {
|
||||
}
|
||||
};
|
||||
|
||||
metrics.extend(
|
||||
[Some(remote_size), Some(resident_size), synthetic_size]
|
||||
.into_iter()
|
||||
.flatten(),
|
||||
);
|
||||
metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let ts = TenantSnapshot {
|
||||
resident_size: 1000,
|
||||
remote_size: 1000,
|
||||
// not yet calculated
|
||||
synthetic_size: 0,
|
||||
@@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
|
||||
MetricsKey::resident_size(tenant_id).at(now, 1000),
|
||||
MetricsKey::synthetic_size(tenant_id).at(now, 1000),
|
||||
]
|
||||
);
|
||||
@@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let ts = TenantSnapshot {
|
||||
resident_size: 1000,
|
||||
remote_size: 1000,
|
||||
// not yet calculated
|
||||
synthetic_size: 0,
|
||||
@@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
|
||||
MetricsKey::resident_size(tenant_id).at(now, 1000),
|
||||
// no synthetic size here
|
||||
]
|
||||
);
|
||||
@@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old(
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [RawMetric; 6] {
|
||||
) -> [RawMetric; 5] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until_old_format(before, now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
|
||||
]
|
||||
}
|
||||
@@ -312,13 +307,12 @@ pub(crate) const fn metric_examples(
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [NewRawMetric; 6] {
|
||||
) -> [NewRawMetric; 5] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
|
||||
MetricsKey::resident_size(tenant_id).at(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at(now, 1),
|
||||
]
|
||||
}
|
||||
|
||||
@@ -521,10 +521,6 @@ mod tests {
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
|
||||
@@ -564,7 +560,7 @@ mod tests {
|
||||
assert_eq!(upgraded_samples, new_samples);
|
||||
}
|
||||
|
||||
fn metric_samples_old() -> [RawMetric; 6] {
|
||||
fn metric_samples_old() -> [RawMetric; 5] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
@@ -576,7 +572,7 @@ mod tests {
|
||||
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
|
||||
}
|
||||
|
||||
fn metric_samples() -> [NewRawMetric; 6] {
|
||||
fn metric_samples() -> [NewRawMetric; 5] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
|
||||
@@ -58,14 +58,8 @@ pub trait StorageControllerUpcallApi {
|
||||
impl StorageControllerUpcallClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Option<Self>, reqwest::Error> {
|
||||
let mut url = match conf.control_plane_api.as_ref() {
|
||||
Some(u) => u.clone(),
|
||||
None => return Ok(None),
|
||||
};
|
||||
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self {
|
||||
let mut url = conf.control_plane_api.clone();
|
||||
|
||||
if let Ok(mut segs) = url.path_segments_mut() {
|
||||
// This ensures that `url` ends with a slash if it doesn't already.
|
||||
@@ -85,15 +79,17 @@ impl StorageControllerUpcallClient {
|
||||
}
|
||||
|
||||
for cert in &conf.ssl_ca_certs {
|
||||
client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
|
||||
client = client.add_root_certificate(
|
||||
Certificate::from_der(cert.contents()).expect("Invalid certificate in config"),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Some(Self {
|
||||
http_client: client.build()?,
|
||||
Self {
|
||||
http_client: client.build().expect("Failed to construct HTTP client"),
|
||||
base_url: url,
|
||||
node_id: conf.id,
|
||||
cancel: cancel.clone(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
|
||||
@@ -585,7 +585,7 @@ impl DeletionQueue {
|
||||
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
|
||||
pub fn new<C>(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
controller_upcall_client: Option<C>,
|
||||
controller_upcall_client: C,
|
||||
conf: &'static PageServerConf,
|
||||
) -> (Self, DeletionQueueWorkers<C>)
|
||||
where
|
||||
@@ -701,7 +701,7 @@ mod test {
|
||||
async fn restart(&mut self) {
|
||||
let (deletion_queue, workers) = DeletionQueue::new(
|
||||
self.storage.clone(),
|
||||
Some(self.mock_control_plane.clone()),
|
||||
self.mock_control_plane.clone(),
|
||||
self.harness.conf,
|
||||
);
|
||||
|
||||
@@ -821,11 +821,8 @@ mod test {
|
||||
|
||||
let mock_control_plane = MockStorageController::new();
|
||||
|
||||
let (deletion_queue, worker) = DeletionQueue::new(
|
||||
storage.clone(),
|
||||
Some(mock_control_plane.clone()),
|
||||
harness.conf,
|
||||
);
|
||||
let (deletion_queue, worker) =
|
||||
DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf);
|
||||
|
||||
let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ where
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
|
||||
// Client for calling into control plane API for validation of deletes
|
||||
controller_upcall_client: Option<C>,
|
||||
controller_upcall_client: C,
|
||||
|
||||
// DeletionLists which are waiting generation validation. Not safe to
|
||||
// execute until [`validate`] has processed them.
|
||||
@@ -86,7 +86,7 @@ where
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
controller_upcall_client: Option<C>,
|
||||
controller_upcall_client: C,
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
@@ -137,20 +137,16 @@ where
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
|
||||
match controller_upcall_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
// The only way a validation call returns an error is when the cancellation token fires
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
let tenants_valid = match self
|
||||
.controller_upcall_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
// The only way a validation call returns an error is when the cancellation token fires
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
} else {
|
||||
// Control plane API disabled. In legacy mode we consider everything valid.
|
||||
tenant_generations.keys().map(|k| (*k, true)).collect()
|
||||
};
|
||||
|
||||
let mut validated_sequence: Option<u64> = None;
|
||||
|
||||
@@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_ondemand_download_bytes_total",
|
||||
"Total bytes of layers on-demand downloaded",
|
||||
&["task_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_ondemand_download_count",
|
||||
"Total count of layers on-demand downloaded",
|
||||
&["task_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) mod wait_ondemand_download_time {
|
||||
use super::*;
|
||||
const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
|
||||
@@ -2180,6 +2198,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> {
|
||||
// If you want to change the categorization of a specific error, also change it in `log_query_error`.
|
||||
let metric = match res {
|
||||
Ok(_) => &self.parent.ok,
|
||||
Err(QueryError::Shutdown) => {
|
||||
// Do not observe ok/err for shutdown
|
||||
return;
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
|
||||
if is_expected_io_error(io_error) =>
|
||||
{
|
||||
|
||||
@@ -1035,10 +1035,25 @@ impl PageServerHandler {
|
||||
// avoid a somewhat costly Span::record() by constructing the entire span in one go.
|
||||
macro_rules! mkspan {
|
||||
(before shard routing) => {{
|
||||
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
|
||||
tracing::info_span!(
|
||||
parent: &parent_span,
|
||||
"handle_get_page_request",
|
||||
rel = %req.rel,
|
||||
blkno = %req.blkno,
|
||||
req_lsn = %req.hdr.request_lsn,
|
||||
not_modified_since_lsn = %req.hdr.not_modified_since
|
||||
)
|
||||
}};
|
||||
($shard_id:expr) => {{
|
||||
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
|
||||
tracing::info_span!(
|
||||
parent: &parent_span,
|
||||
"handle_get_page_request",
|
||||
rel = %req.rel,
|
||||
blkno = %req.blkno,
|
||||
req_lsn = %req.hdr.request_lsn,
|
||||
not_modified_since_lsn = %req.hdr.not_modified_since,
|
||||
shard_id = %$shard_id
|
||||
)
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -1102,6 +1117,7 @@ impl PageServerHandler {
|
||||
shard_id = %shard.get_shard_identity().shard_slug(),
|
||||
timeline_id = %timeline_id,
|
||||
lsn = %req.hdr.request_lsn,
|
||||
not_modified_since_lsn = %req.hdr.not_modified_since,
|
||||
request_id = %req.hdr.reqid,
|
||||
key = %key,
|
||||
)
|
||||
|
||||
@@ -1084,8 +1084,17 @@ impl Timeline {
|
||||
let mut result = HashMap::new();
|
||||
for (k, v) in kv {
|
||||
let v = v?;
|
||||
if v.is_empty() {
|
||||
// This is a tombstone -- we can skip it.
|
||||
// Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it is part of
|
||||
// the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone,
|
||||
// we also need to consider that. Such tombstones might be written on the detach ancestor code path to
|
||||
// avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.)
|
||||
continue;
|
||||
}
|
||||
let origin_id = k.field6 as RepOriginId;
|
||||
let origin_lsn = Lsn::des(&v).unwrap();
|
||||
let origin_lsn = Lsn::des(&v)
|
||||
.with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?;
|
||||
if origin_lsn != Lsn::INVALID {
|
||||
result.insert(origin_id, origin_lsn);
|
||||
}
|
||||
@@ -2578,6 +2587,11 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn put_for_unit_test(&mut self, key: Key, val: Value) {
|
||||
self.put(key, val);
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
if Self::is_data_key(&key) {
|
||||
self.put_data(key.to_compact(), val)
|
||||
|
||||
@@ -4254,9 +4254,7 @@ impl TenantShard {
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
) -> TenantShard {
|
||||
debug_assert!(
|
||||
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
|
||||
);
|
||||
assert!(!attached_conf.location.generation.is_none());
|
||||
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
@@ -5949,7 +5947,9 @@ mod tests {
|
||||
use itertools::Itertools;
|
||||
#[cfg(feature = "testing")]
|
||||
use models::CompactLsnRange;
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
|
||||
use pageserver_api::key::{
|
||||
AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key,
|
||||
};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::keyspace::KeySpaceRandomAccum;
|
||||
@@ -8185,6 +8185,54 @@ mod tests {
|
||||
assert_eq!(files.get("pg_logical/mappings/test2"), None);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_repl_origin_tombstones() {
|
||||
let harness = TenantHarness::create("test_repl_origin_tombstones")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let repl_lsn = Lsn(0x10);
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new()));
|
||||
modification.set_replorigin(1, repl_lsn).await.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// we can read everything from the storage
|
||||
let repl_origins = tline
|
||||
.get_replorigins(lsn, &ctx, io_concurrency.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(repl_origins.len(), 1);
|
||||
assert_eq!(repl_origins[&1], lsn);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification.put_for_unit_test(
|
||||
repl_origin_key(3),
|
||||
Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")),
|
||||
);
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
let result = tline
|
||||
.get_replorigins(lsn, &ctx, io_concurrency.clone())
|
||||
.await;
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_metadata_image_creation() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_metadata_image_creation").await?;
|
||||
|
||||
@@ -346,7 +346,8 @@ async fn init_load_generations(
|
||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||
);
|
||||
emergency_generations(tenant_confs)
|
||||
} else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? {
|
||||
} else {
|
||||
let client = StorageControllerUpcallClient::new(conf, cancel);
|
||||
info!("Calling {} API to re-attach tenants", client.base_url());
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
@@ -360,9 +361,6 @@ async fn init_load_generations(
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("Control plane API not configured, tenant generations are disabled");
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
|
||||
@@ -1153,17 +1151,8 @@ impl TenantManager {
|
||||
// Testing hack: if we are configured with no control plane, then drop the generation
|
||||
// from upserts. This enables creating generation-less tenants even though neon_local
|
||||
// always uses generations when calling the location conf API.
|
||||
let attached_conf = if cfg!(feature = "testing") {
|
||||
let mut conf = AttachedTenantConf::try_from(new_location_config)
|
||||
.map_err(UpsertLocationError::BadRequest)?;
|
||||
if self.conf.control_plane_api.is_none() {
|
||||
conf.location.generation = Generation::none();
|
||||
}
|
||||
conf
|
||||
} else {
|
||||
AttachedTenantConf::try_from(new_location_config)
|
||||
.map_err(UpsertLocationError::BadRequest)?
|
||||
};
|
||||
let attached_conf = AttachedTenantConf::try_from(new_location_config)
|
||||
.map_err(UpsertLocationError::BadRequest)?;
|
||||
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
|
||||
@@ -1441,14 +1441,6 @@ impl DeltaLayerInner {
|
||||
offset
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
self.iter_with_options(
|
||||
ctx,
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
)
|
||||
}
|
||||
|
||||
pub fn iter_with_options<'a>(
|
||||
&'a self,
|
||||
ctx: &'a RequestContext,
|
||||
@@ -1634,7 +1626,6 @@ pub(crate) mod test {
|
||||
use crate::tenant::disk_btree::tests::TestDisk;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
@@ -2311,8 +2302,7 @@ pub(crate) mod test {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
let mut iter = delta_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size);
|
||||
let mut num_items = 0;
|
||||
for _ in 0..3 {
|
||||
iter.next_batch().await.unwrap();
|
||||
@@ -2329,8 +2319,7 @@ pub(crate) mod test {
|
||||
iter.key_values_batch.clear();
|
||||
}
|
||||
// Test if the result is correct
|
||||
let mut iter = delta_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size);
|
||||
assert_delta_iter_equal(&mut iter, &test_deltas).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
let merge_iter = MergeIterator::create_for_testing(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
@@ -182,7 +182,7 @@ mod tests {
|
||||
result.extend(test_deltas1[90..100].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
let merge_iter = MergeIterator::create_for_testing(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
|
||||
@@ -684,14 +684,6 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
|
||||
self.iter_with_options(
|
||||
ctx,
|
||||
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
|
||||
1024, // The default value. Unit tests might use a different value
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn iter_with_options<'a>(
|
||||
&'a self,
|
||||
ctx: &'a RequestContext,
|
||||
@@ -1240,7 +1232,6 @@ mod test {
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::{TenantShard, Timeline};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -1507,8 +1498,7 @@ mod test {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
let mut iter = img_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size);
|
||||
let mut num_items = 0;
|
||||
for _ in 0..3 {
|
||||
iter.next_batch().await.unwrap();
|
||||
@@ -1525,8 +1515,7 @@ mod test {
|
||||
iter.key_values_batch.clear();
|
||||
}
|
||||
// Test if the result is correct
|
||||
let mut iter = img_layer.iter(&ctx);
|
||||
iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
|
||||
let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size);
|
||||
assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::sync::{Arc, Weak};
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
@@ -1255,6 +1256,14 @@ impl LayerInner {
|
||||
|
||||
self.access_stats.record_residence_event();
|
||||
|
||||
let task_kind: &'static str = ctx.task_kind().into();
|
||||
ONDEMAND_DOWNLOAD_BYTES
|
||||
.with_label_values(&[task_kind])
|
||||
.inc_by(self.desc.file_size);
|
||||
ONDEMAND_DOWNLOAD_COUNT
|
||||
.with_label_values(&[task_kind])
|
||||
.inc();
|
||||
|
||||
Ok(self.initialize_after_layer_is_on_disk(permit))
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> {
|
||||
}
|
||||
|
||||
impl<'a> LayerRef<'a> {
|
||||
#[allow(dead_code)]
|
||||
fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
|
||||
match self {
|
||||
Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
|
||||
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
|
||||
}
|
||||
}
|
||||
|
||||
fn iter_with_options(
|
||||
self,
|
||||
ctx: &'a RequestContext,
|
||||
@@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
|
||||
}
|
||||
|
||||
impl<'a> MergeIterator<'a> {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn create_for_testing(
|
||||
deltas: &[&'a DeltaLayerInner],
|
||||
images: &[&'a ImageLayerInner],
|
||||
ctx: &'a RequestContext,
|
||||
) -> Self {
|
||||
Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024)
|
||||
}
|
||||
|
||||
/// Create a new merge iterator with custom options.
|
||||
///
|
||||
/// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale
|
||||
/// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that
|
||||
/// the buffer does not take too much memory.
|
||||
///
|
||||
/// The default options for L0 compactions are:
|
||||
/// - max_read_size: 1024 * 8192 (8MB)
|
||||
/// - max_batch_size: 1024
|
||||
///
|
||||
/// The default options for gc-compaction are:
|
||||
/// - max_read_size: 128 * 8192 (1MB)
|
||||
/// - max_batch_size: 128
|
||||
pub fn create_with_options(
|
||||
deltas: &[&'a DeltaLayerInner],
|
||||
images: &[&'a ImageLayerInner],
|
||||
@@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create(
|
||||
deltas: &[&'a DeltaLayerInner],
|
||||
images: &[&'a ImageLayerInner],
|
||||
ctx: &'a RequestContext,
|
||||
) -> Self {
|
||||
Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024)
|
||||
}
|
||||
|
||||
pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
|
||||
while let Some(mut iter) = self.heap.peek_mut() {
|
||||
if !iter.is_loaded() {
|
||||
@@ -477,7 +483,7 @@ mod tests {
|
||||
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
let mut merge_iter = MergeIterator::create_for_testing(
|
||||
&[
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
@@ -549,7 +555,7 @@ mod tests {
|
||||
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
let mut merge_iter = MergeIterator::create_for_testing(
|
||||
&[
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
|
||||
@@ -670,7 +676,7 @@ mod tests {
|
||||
// Test with different layer order for MergeIterator::create to ensure the order
|
||||
// is stable.
|
||||
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
let mut merge_iter = MergeIterator::create_for_testing(
|
||||
&[
|
||||
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
@@ -682,7 +688,7 @@ mod tests {
|
||||
);
|
||||
assert_merge_iter_equal(&mut merge_iter, &expect).await;
|
||||
|
||||
let mut merge_iter = MergeIterator::create(
|
||||
let mut merge_iter = MergeIterator::create_for_testing(
|
||||
&[
|
||||
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
|
||||
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
|
||||
|
||||
@@ -1994,7 +1994,13 @@ impl Timeline {
|
||||
let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
||||
deltas.push(l);
|
||||
}
|
||||
MergeIterator::create(&deltas, &[], ctx)
|
||||
MergeIterator::create_with_options(
|
||||
&deltas,
|
||||
&[],
|
||||
ctx,
|
||||
1024 * 8192, /* 8 MiB buffer per layer iterator */
|
||||
1024,
|
||||
)
|
||||
};
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
@@ -2828,7 +2834,7 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if the memory usage is within the limit.
|
||||
/// Check to bail out of gc compaction early if it would use too much memory.
|
||||
async fn check_memory_usage(
|
||||
self: &Arc<Self>,
|
||||
layer_selection: &[Layer],
|
||||
@@ -2841,7 +2847,8 @@ impl Timeline {
|
||||
let layer_desc = layer.layer_desc();
|
||||
if layer_desc.is_delta() {
|
||||
// Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB).
|
||||
// Multiply the layer size so that tests can pass.
|
||||
// Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt`,
|
||||
// use 3MB layer size and we need to account for that).
|
||||
estimated_memory_usage_mb +=
|
||||
3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64;
|
||||
num_delta_layers += 1;
|
||||
|
||||
@@ -178,7 +178,7 @@ impl Attempt {
|
||||
}
|
||||
}
|
||||
|
||||
async fn generate_tombstone_image_layer(
|
||||
pub(crate) async fn generate_tombstone_image_layer(
|
||||
detached: &Arc<Timeline>,
|
||||
ancestor: &Arc<Timeline>,
|
||||
ancestor_lsn: Lsn,
|
||||
|
||||
@@ -163,8 +163,7 @@ pub async fn doit(
|
||||
// Ensure at-least-once delivery of the upcall to storage controller
|
||||
// before we mark the task as done and never come here again.
|
||||
//
|
||||
let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)?
|
||||
.expect("storcon configured");
|
||||
let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);
|
||||
storcon_client
|
||||
.put_timeline_import_status(
|
||||
timeline.tenant_shard_id,
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
use std::fs::File;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
||||
#[cfg(target_os = "linux")]
|
||||
use std::os::unix::fs::OpenOptionsExt;
|
||||
use std::sync::LazyLock;
|
||||
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
|
||||
|
||||
@@ -99,7 +97,7 @@ impl VirtualFile {
|
||||
|
||||
pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
open_options: &OpenOptions,
|
||||
#[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let mode = get_io_mode();
|
||||
@@ -112,21 +110,16 @@ impl VirtualFile {
|
||||
#[cfg(target_os = "linux")]
|
||||
(IoMode::DirectRw, _) => true,
|
||||
};
|
||||
let open_options = open_options.clone();
|
||||
let open_options = if set_o_direct {
|
||||
if set_o_direct {
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
let mut open_options = open_options;
|
||||
open_options.custom_flags(nix::libc::O_DIRECT);
|
||||
open_options
|
||||
open_options = open_options.custom_flags(nix::libc::O_DIRECT);
|
||||
}
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
unreachable!(
|
||||
"O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined"
|
||||
);
|
||||
} else {
|
||||
open_options
|
||||
};
|
||||
}
|
||||
let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
|
||||
Ok(VirtualFile { inner, _mode: mode })
|
||||
}
|
||||
@@ -530,7 +523,7 @@ impl VirtualFileInner {
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<VirtualFileInner, std::io::Error> {
|
||||
Self::open_with_options(path.as_ref(), OpenOptions::new().read(true).clone(), ctx).await
|
||||
Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
|
||||
}
|
||||
|
||||
/// Open a file with given options.
|
||||
@@ -558,10 +551,11 @@ impl VirtualFileInner {
|
||||
// It would perhaps be nicer to check just for the read and write flags
|
||||
// explicitly, but OpenOptions doesn't contain any functions to read flags,
|
||||
// only to set them.
|
||||
let mut reopen_options = open_options.clone();
|
||||
reopen_options.create(false);
|
||||
reopen_options.create_new(false);
|
||||
reopen_options.truncate(false);
|
||||
let reopen_options = open_options
|
||||
.clone()
|
||||
.create(false)
|
||||
.create_new(false)
|
||||
.truncate(false);
|
||||
|
||||
let vfile = VirtualFileInner {
|
||||
handle: RwLock::new(handle),
|
||||
@@ -1307,7 +1301,7 @@ mod tests {
|
||||
opts: OpenOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<MaybeVirtualFile, anyhow::Error> {
|
||||
let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?;
|
||||
let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?;
|
||||
Ok(MaybeVirtualFile::VirtualFile(vf))
|
||||
}
|
||||
}
|
||||
@@ -1374,7 +1368,7 @@ mod tests {
|
||||
let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err();
|
||||
|
||||
// Close the file and re-open for reading
|
||||
let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
|
||||
let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
let _ = file_a
|
||||
@@ -1393,8 +1387,7 @@ mod tests {
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.to_owned(),
|
||||
.truncate(true),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1412,12 +1405,7 @@ mod tests {
|
||||
|
||||
let mut vfiles = Vec::new();
|
||||
for _ in 0..100 {
|
||||
let mut vfile = A::open(
|
||||
path_b.clone(),
|
||||
OpenOptions::new().read(true).to_owned(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?;
|
||||
assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?);
|
||||
vfiles.push(vfile);
|
||||
}
|
||||
@@ -1466,7 +1454,7 @@ mod tests {
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFileInner::open_with_options(
|
||||
&test_file_path,
|
||||
OpenOptions::new().read(true).clone(),
|
||||
OpenOptions::new().read(true),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
|
||||
|
||||
use std::os::fd::OwnedFd;
|
||||
use std::os::unix::fs::OpenOptionsExt;
|
||||
use std::path::Path;
|
||||
|
||||
use super::io_engine::IoEngine;
|
||||
@@ -43,7 +44,7 @@ impl OpenOptions {
|
||||
self.write
|
||||
}
|
||||
|
||||
pub fn read(&mut self, read: bool) -> &mut OpenOptions {
|
||||
pub fn read(mut self, read: bool) -> Self {
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
let _ = x.read(read);
|
||||
@@ -56,7 +57,7 @@ impl OpenOptions {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn write(&mut self, write: bool) -> &mut OpenOptions {
|
||||
pub fn write(mut self, write: bool) -> Self {
|
||||
self.write = write;
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
@@ -70,7 +71,7 @@ impl OpenOptions {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn create(&mut self, create: bool) -> &mut OpenOptions {
|
||||
pub fn create(mut self, create: bool) -> Self {
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
let _ = x.create(create);
|
||||
@@ -83,7 +84,7 @@ impl OpenOptions {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
|
||||
pub fn create_new(mut self, create_new: bool) -> Self {
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
let _ = x.create_new(create_new);
|
||||
@@ -96,7 +97,7 @@ impl OpenOptions {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
|
||||
pub fn truncate(mut self, truncate: bool) -> Self {
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
let _ = x.truncate(truncate);
|
||||
@@ -124,10 +125,8 @@ impl OpenOptions {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
|
||||
fn mode(&mut self, mode: u32) -> &mut OpenOptions {
|
||||
pub fn mode(mut self, mode: u32) -> Self {
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
let _ = x.mode(mode);
|
||||
@@ -140,7 +139,7 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
|
||||
self
|
||||
}
|
||||
|
||||
fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
|
||||
pub fn custom_flags(mut self, flags: i32) -> Self {
|
||||
match &mut self.inner {
|
||||
Inner::StdFs(x) => {
|
||||
let _ = x.custom_flags(flags);
|
||||
|
||||
@@ -36,6 +36,8 @@ DATA = \
|
||||
neon--1.2--1.3.sql \
|
||||
neon--1.3--1.4.sql \
|
||||
neon--1.4--1.5.sql \
|
||||
neon--1.5--1.6.sql \
|
||||
neon--1.6--1.5.sql \
|
||||
neon--1.5--1.4.sql \
|
||||
neon--1.4--1.3.sql \
|
||||
neon--1.3--1.2.sql \
|
||||
|
||||
@@ -687,8 +687,14 @@ prefetch_wait_for(uint64 ring_index)
|
||||
END_PREFETCH_RECEIVE_WORK();
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
return result;
|
||||
if (result)
|
||||
{
|
||||
/* Check that the slot was actually received (the server can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS) */
|
||||
PrefetchRequest *slot = GetPrfSlot(ring_index);
|
||||
return slot->status == PRFS_RECEIVED;
|
||||
}
|
||||
return false;
|
||||
;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -98,7 +98,6 @@
|
||||
#define MB ((uint64)1024*1024)
|
||||
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log))
|
||||
|
||||
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1))
|
||||
|
||||
/*
|
||||
@@ -135,6 +134,15 @@ typedef struct FileCacheEntry
|
||||
#define N_COND_VARS 64
|
||||
#define CV_WAIT_TIMEOUT 10
|
||||
|
||||
#define MAX_PREWARM_WORKERS 8
|
||||
|
||||
typedef struct PrewarmWorkerState /* progress counters of one LFC prewarm background worker, kept in FileCacheControl */
|
||||
{
|
||||
uint32 prewarmed_pages; /* pages for which communicator_prefetch_receive() reported success */
|
||||
uint32 skipped_pages; /* pages already present in LFC, or whose prefetch receive failed */
|
||||
TimestampTz completed; /* 0 while the worker runs; set by the worker on exit, or by the launcher if the worker died */
|
||||
} PrewarmWorkerState;
|
||||
|
||||
typedef struct FileCacheControl
|
||||
{
|
||||
uint64 generation; /* generation is needed to handle correct hash
|
||||
@@ -156,25 +164,43 @@ typedef struct FileCacheControl
|
||||
dlist_head holes; /* double linked list of punched holes */
|
||||
HyperLogLogState wss_estimation; /* estimation of working set size */
|
||||
ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
|
||||
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
|
||||
size_t n_prewarm_workers;
|
||||
size_t n_prewarm_entries;
|
||||
size_t total_prewarm_pages;
|
||||
size_t prewarm_batch;
|
||||
bool prewarm_active;
|
||||
bool prewarm_canceled;
|
||||
dsm_handle prewarm_lfc_state_handle;
|
||||
} FileCacheControl;
|
||||
|
||||
bool lfc_store_prefetch_result;
|
||||
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
|
||||
|
||||
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
|
||||
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
|
||||
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
|
||||
|
||||
static HTAB *lfc_hash;
|
||||
static int lfc_desc = -1;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static int lfc_prewarm_limit;
|
||||
static int lfc_prewarm_batch;
|
||||
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
|
||||
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
|
||||
static char *lfc_path;
|
||||
static uint64 lfc_generation;
|
||||
static FileCacheControl *lfc_ctl;
|
||||
static bool lfc_do_prewarm;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
|
||||
bool lfc_store_prefetch_result;
|
||||
bool lfc_prewarm_update_ws_estimation;
|
||||
|
||||
#define LFC_ENABLED() (lfc_ctl->limit != 0)
|
||||
|
||||
/*
|
||||
@@ -500,6 +526,17 @@ lfc_init(void)
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomBoolVariable("neon.prewarm_update_ws_estimation",
|
||||
"Consider prewarmed pages for working set estimation",
|
||||
NULL,
|
||||
&lfc_prewarm_update_ws_estimation,
|
||||
true,
|
||||
PGC_SUSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.max_file_cache_size",
|
||||
"Maximal size of Neon local file cache",
|
||||
NULL,
|
||||
@@ -550,6 +587,32 @@ lfc_init(void)
|
||||
lfc_change_chunk_size,
|
||||
NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
|
||||
"Maximal number of prewarmed chunks",
|
||||
NULL,
|
||||
&lfc_prewarm_limit,
|
||||
INT_MAX, /* no limit by default */
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.file_cache_prewarm_batch",
|
||||
"Number of pages retrivied by prewarm from page server",
|
||||
NULL,
|
||||
&lfc_prewarm_batch,
|
||||
64,
|
||||
1,
|
||||
INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
if (lfc_max_size == 0)
|
||||
return;
|
||||
|
||||
@@ -563,6 +626,317 @@ lfc_init(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
FileCacheState*
|
||||
lfc_get_state(size_t max_entries)
|
||||
{
|
||||
FileCacheState* fcs = NULL;
|
||||
|
||||
if (lfc_maybe_disabled() || max_entries == 0) /* fast exit if file cache is disabled */
|
||||
return NULL;
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
|
||||
if (LFC_ENABLED())
|
||||
{
|
||||
dlist_iter iter;
|
||||
size_t i = 0;
|
||||
uint8* bitmap;
|
||||
size_t n_pages = 0;
|
||||
size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned);
|
||||
size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries);
|
||||
fcs = (FileCacheState*)palloc0(state_size);
|
||||
SET_VARSIZE(fcs, state_size);
|
||||
fcs->magic = FILE_CACHE_STATE_MAGIC;
|
||||
fcs->chunk_size_log = lfc_chunk_size_log;
|
||||
fcs->n_chunks = n_entries;
|
||||
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
|
||||
|
||||
dlist_reverse_foreach(iter, &lfc_ctl->lru)
|
||||
{
|
||||
FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
|
||||
fcs->chunks[i] = entry->key;
|
||||
for (int j = 0; j < lfc_blocks_per_chunk; j++)
|
||||
{
|
||||
if (GET_STATE(entry, j) != UNAVAILABLE)
|
||||
{
|
||||
BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
|
||||
n_pages += 1;
|
||||
}
|
||||
}
|
||||
if (++i == n_entries)
|
||||
break;
|
||||
}
|
||||
Assert(i == n_entries);
|
||||
fcs->n_pages = n_pages;
|
||||
Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
|
||||
elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
|
||||
}
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
return fcs;
|
||||
}
|
||||
|
||||
/*
|
||||
* Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock
|
||||
* and avoid race conditions with other backends.
|
||||
*/
|
||||
void
|
||||
lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
|
||||
{
|
||||
size_t fcs_chunk_size_log;
|
||||
size_t n_entries;
|
||||
size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
|
||||
size_t fcs_size;
|
||||
dsm_segment *seg;
|
||||
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
|
||||
|
||||
|
||||
if (!lfc_ensure_opened())
|
||||
return;
|
||||
|
||||
if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
|
||||
{
|
||||
elog(LOG, "LFC: prewarm is disabled");
|
||||
return;
|
||||
}
|
||||
|
||||
if (n_workers > MAX_PREWARM_WORKERS)
|
||||
{
|
||||
elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
|
||||
}
|
||||
|
||||
if (fcs == NULL || fcs->n_chunks == 0)
|
||||
{
|
||||
elog(LOG, "LFC: nothing to prewarm");
|
||||
return;
|
||||
}
|
||||
|
||||
if (fcs->magic != FILE_CACHE_STATE_MAGIC)
|
||||
{
|
||||
elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
|
||||
}
|
||||
|
||||
fcs_size = VARSIZE(fcs);
|
||||
if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
|
||||
{
|
||||
elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
|
||||
}
|
||||
|
||||
fcs_chunk_size_log = fcs->chunk_size_log;
|
||||
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
|
||||
{
|
||||
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
|
||||
}
|
||||
|
||||
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
|
||||
Assert(n_entries != 0);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
|
||||
/* Do not prewarm more entries than LFC limit */
|
||||
if (lfc_ctl->limit <= lfc_ctl->size)
|
||||
{
|
||||
elog(LOG, "LFC: skip prewarm because LFC is already filled");
|
||||
LWLockRelease(lfc_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
if (lfc_ctl->prewarm_active)
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
|
||||
}
|
||||
lfc_ctl->n_prewarm_entries = n_entries;
|
||||
lfc_ctl->n_prewarm_workers = n_workers;
|
||||
lfc_ctl->prewarm_active = true;
|
||||
lfc_ctl->prewarm_canceled = false;
|
||||
lfc_ctl->prewarm_batch = prewarm_batch;
|
||||
memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
/* Calculate total number of pages to be prewarmed */
|
||||
lfc_ctl->total_prewarm_pages = fcs->n_pages;
|
||||
|
||||
seg = dsm_create(fcs_size, 0);
|
||||
memcpy(dsm_segment_address(seg), fcs, fcs_size);
|
||||
lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
|
||||
|
||||
/* Spawn background workers */
|
||||
for (uint32 i = 0; i < n_workers; i++)
|
||||
{
|
||||
BackgroundWorker worker = {0};
|
||||
|
||||
worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||
worker.bgw_start_time = BgWorkerStart_ConsistentState;
|
||||
worker.bgw_restart_time = BGW_NEVER_RESTART;
|
||||
strcpy(worker.bgw_library_name, "neon");
|
||||
strcpy(worker.bgw_function_name, "lfc_prewarm_main");
|
||||
snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
|
||||
strcpy(worker.bgw_type, "LFC prewarm worker");
|
||||
worker.bgw_main_arg = Int32GetDatum(i);
|
||||
/* must set notify PID to wait for shutdown */
|
||||
worker.bgw_notify_pid = MyProcPid;
|
||||
|
||||
if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
|
||||
{
|
||||
ereport(LOG,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("LFC: registering dynamic bgworker prewarm failed"),
|
||||
errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
|
||||
n_workers = i;
|
||||
lfc_ctl->prewarm_canceled = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32 i = 0; i < n_workers; i++)
|
||||
{
|
||||
bool interrupted;
|
||||
do
|
||||
{
|
||||
interrupted = false;
|
||||
PG_TRY();
|
||||
{
|
||||
BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
|
||||
if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
|
||||
{
|
||||
elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
elog(LOG, "LFC: cancel prewarm");
|
||||
lfc_ctl->prewarm_canceled = true;
|
||||
interrupted = true;
|
||||
}
|
||||
PG_END_TRY();
|
||||
} while (interrupted);
|
||||
|
||||
if (!lfc_ctl->prewarm_workers[i].completed)
|
||||
{
|
||||
/* Background worker doesn't set completion time: it means that it was abnormally terminated */
|
||||
elog(LOG, "LFC: prewarm worker %d failed", i+1);
|
||||
/* Set completion time to prevent get_prewarm_info from considering this worker as active */
|
||||
lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
|
||||
}
|
||||
}
|
||||
dsm_detach(seg);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
lfc_ctl->prewarm_active = false;
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
void
|
||||
lfc_prewarm_main(Datum main_arg)
|
||||
{
|
||||
size_t snd_idx = 0, rcv_idx = 0;
|
||||
size_t n_sent = 0, n_received = 0;
|
||||
size_t fcs_chunk_size_log;
|
||||
size_t max_prefetch_pages;
|
||||
size_t prewarm_batch;
|
||||
size_t n_workers;
|
||||
dsm_segment *seg;
|
||||
FileCacheState* fcs;
|
||||
uint8* bitmap;
|
||||
BufferTag tag;
|
||||
PrewarmWorkerState* ws;
|
||||
uint32 worker_id = DatumGetInt32(main_arg);
|
||||
|
||||
pqsignal(SIGTERM, die);
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle);
|
||||
if (seg == NULL)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("could not map dynamic shared memory segment")));
|
||||
|
||||
fcs = (FileCacheState*) dsm_segment_address(seg);
|
||||
prewarm_batch = lfc_ctl->prewarm_batch;
|
||||
fcs_chunk_size_log = fcs->chunk_size_log;
|
||||
n_workers = lfc_ctl->n_prewarm_workers;
|
||||
max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log;
|
||||
ws = &lfc_ctl->prewarm_workers[worker_id];
|
||||
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
|
||||
|
||||
/* enable prefetch in LFC */
|
||||
lfc_store_prefetch_result = true;
|
||||
lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */
|
||||
|
||||
elog(LOG, "LFC: worker %d start prewarming", worker_id);
|
||||
while (!lfc_ctl->prewarm_canceled)
|
||||
{
|
||||
if (snd_idx < max_prefetch_pages)
|
||||
{
|
||||
if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
|
||||
{
|
||||
/* If there are multiple workers, split chunks between them */
|
||||
snd_idx += 1 << fcs_chunk_size_log;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (BITMAP_ISSET(bitmap, snd_idx))
|
||||
{
|
||||
tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
|
||||
tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
|
||||
if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
|
||||
{
|
||||
(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
|
||||
n_sent += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
ws->skipped_pages += 1;
|
||||
BITMAP_CLR(bitmap, snd_idx);
|
||||
}
|
||||
}
|
||||
snd_idx += 1;
|
||||
}
|
||||
}
|
||||
if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
|
||||
{
|
||||
if (n_received == n_sent && snd_idx == max_prefetch_pages)
|
||||
{
|
||||
break;
|
||||
}
|
||||
if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
|
||||
{
|
||||
/* Skip chunks processed by other workers */
|
||||
rcv_idx += 1 << fcs_chunk_size_log;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Locate next block to prefetch */
|
||||
while (!BITMAP_ISSET(bitmap, rcv_idx))
|
||||
{
|
||||
rcv_idx += 1;
|
||||
}
|
||||
tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
|
||||
tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
|
||||
if (communicator_prefetch_receive(tag))
|
||||
{
|
||||
ws->prewarmed_pages += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
ws->skipped_pages += 1;
|
||||
}
|
||||
rcv_idx += 1;
|
||||
n_received += 1;
|
||||
}
|
||||
}
|
||||
/* No need to perform prefetch cleanup here because prewarm worker will be terminated and
|
||||
* connection to PS dropped just after return from this function.
|
||||
*/
|
||||
Assert(n_sent == n_received || lfc_ctl->prewarm_canceled);
|
||||
elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
|
||||
lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Check if page is present in the cache.
|
||||
* Returns true if page is found in local cache.
|
||||
@@ -1001,8 +1375,11 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
|
||||
* If we can't (e.g. because all other slots are being accessed)
|
||||
* then we will remove this entry from the hash and continue
|
||||
* on to the next chunk, as we may not exceed the limit.
|
||||
*
|
||||
* While prewarming LFC we do not want to replace existing entries,
|
||||
* so we just stop prewarm if the LFC cache is full.
|
||||
*/
|
||||
else if (!dlist_is_empty(&lfc_ctl->lru))
|
||||
else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm)
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
|
||||
@@ -1026,6 +1403,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
|
||||
/* Can't add this chunk - we don't have the space for it */
|
||||
hash_search_with_hash_value(lfc_hash, &entry->key, hash,
|
||||
HASH_REMOVE, NULL);
|
||||
lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1112,9 +1490,11 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
|
||||
|
||||
tag.blockNum = blkno;
|
||||
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||
|
||||
if (lfc_prewarm_update_ws_estimation)
|
||||
{
|
||||
tag.blockNum = blkno;
|
||||
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||
}
|
||||
if (found)
|
||||
{
|
||||
state = GET_STATE(entry, chunk_offs);
|
||||
@@ -1748,3 +2128,82 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||
}
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(get_local_cache_state);
|
||||
|
||||
Datum
|
||||
get_local_cache_state(PG_FUNCTION_ARGS)
|
||||
{
|
||||
size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
|
||||
FileCacheState* fcs = lfc_get_state(max_entries);
|
||||
if (fcs != NULL)
|
||||
PG_RETURN_BYTEA_P((bytea*)fcs);
|
||||
else
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(prewarm_local_cache);
|
||||
|
||||
Datum
|
||||
prewarm_local_cache(PG_FUNCTION_ARGS)
|
||||
{
|
||||
bytea* state = PG_GETARG_BYTEA_PP(0);
|
||||
uint32 n_workers = PG_GETARG_INT32(1);
|
||||
FileCacheState* fcs = (FileCacheState*)state;
|
||||
|
||||
lfc_prewarm(fcs, n_workers);
|
||||
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(get_prewarm_info);
|
||||
|
||||
Datum
|
||||
get_prewarm_info(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Datum values[4];
|
||||
bool nulls[4];
|
||||
TupleDesc tupdesc;
|
||||
uint32 prewarmed_pages = 0;
|
||||
uint32 skipped_pages = 0;
|
||||
uint32 active_workers = 0;
|
||||
uint32 total_pages;
|
||||
size_t n_workers;
|
||||
|
||||
if (lfc_size_limit == 0)
|
||||
PG_RETURN_NULL();
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0)
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
n_workers = lfc_ctl->n_prewarm_workers;
|
||||
total_pages = lfc_ctl->total_prewarm_pages;
|
||||
for (size_t i = 0; i < n_workers; i++)
|
||||
{
|
||||
PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i];
|
||||
prewarmed_pages += ws->prewarmed_pages;
|
||||
skipped_pages += ws->skipped_pages;
|
||||
active_workers += ws->completed != 0;
|
||||
}
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
tupdesc = CreateTemplateTupleDesc(4);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
|
||||
tupdesc = BlessTupleDesc(tupdesc);
|
||||
|
||||
MemSet(nulls, 0, sizeof(nulls));
|
||||
|
||||
values[0] = Int32GetDatum(total_pages);
|
||||
values[1] = Int32GetDatum(prewarmed_pages);
|
||||
values[2] = Int32GetDatum(skipped_pages);
|
||||
values[3] = Int32GetDatum(active_workers);
|
||||
|
||||
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,17 @@
|
||||
|
||||
#include "neon_pgversioncompat.h"
|
||||
|
||||
typedef struct FileCacheState /* serialized LFC snapshot: built by lfc_get_state(), consumed by lfc_prewarm(); passed through SQL as bytea */
|
||||
{
|
||||
int32 vl_len_; /* varlena header (do not touch directly!) */
|
||||
uint32 magic; /* FILE_CACHE_STATE_MAGIC; checked by lfc_prewarm() before the state is used */
|
||||
uint32 n_chunks; /* number of entries in chunks[] */
|
||||
uint32 n_pages; /* number of bits set in the trailing bitmap */
|
||||
uint16 chunk_size_log; /* log2 of pages per chunk at the time the state was captured */
|
||||
BufferTag chunks[FLEXIBLE_ARRAY_MEMBER]; /* key of each captured chunk */
|
||||
/* followed by bitmap: one bit per page of every chunk, starting right after chunks[n_chunks] */
|
||||
} FileCacheState;
|
||||
|
||||
/* GUCs */
|
||||
extern bool lfc_store_prefetch_result;
|
||||
|
||||
@@ -32,7 +43,10 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
extern void lfc_init(void);
|
||||
extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
const void* buffer, XLogRecPtr lsn);
|
||||
extern FileCacheState* lfc_get_state(size_t max_entries);
|
||||
extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
|
||||
|
||||
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
|
||||
|
||||
static inline bool
|
||||
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "portability/instr_time.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/fd.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/pg_shmem.h"
|
||||
@@ -79,6 +80,7 @@ int neon_protocol_version = 3;
|
||||
static int neon_compute_mode = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
static int max_sockets;
|
||||
|
||||
static int pageserver_response_log_timeout = 10000;
|
||||
/* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */
|
||||
@@ -336,6 +338,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
|
||||
pageserver_disconnect(i);
|
||||
}
|
||||
pagestore_local_counter = end_update_counter;
|
||||
|
||||
/* Reserve file descriptors for sockets */
|
||||
while (max_sockets < num_shards)
|
||||
{
|
||||
max_sockets += 1;
|
||||
ReserveExternalFD();
|
||||
}
|
||||
}
|
||||
|
||||
if (num_shards_p)
|
||||
@@ -736,8 +745,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
default:
|
||||
neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
|
||||
}
|
||||
/* This shouldn't be hit */
|
||||
Assert(false);
|
||||
|
||||
pg_unreachable();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -877,6 +886,7 @@ retry:
|
||||
int port;
|
||||
int sndbuf;
|
||||
int recvbuf;
|
||||
uint64* max_wait;
|
||||
|
||||
get_local_port(PQsocket(pageserver_conn), &port);
|
||||
get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf);
|
||||
@@ -887,7 +897,10 @@ retry:
|
||||
shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf,
|
||||
pageserver_conn->inStart, pageserver_conn->inEnd);
|
||||
shard->receive_last_log_time = now;
|
||||
MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged;
|
||||
shard->receive_logged = true;
|
||||
max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms;
|
||||
*max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -910,6 +923,7 @@ retry:
|
||||
get_local_port(PQsocket(pageserver_conn), &port);
|
||||
neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)",
|
||||
INSTR_TIME_GET_DOUBLE(since_start), port);
|
||||
MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;
|
||||
pageserver_disconnect(shard_no);
|
||||
return -1;
|
||||
}
|
||||
@@ -933,6 +947,7 @@ retry:
|
||||
INSTR_TIME_SET_ZERO(shard->receive_start_time);
|
||||
INSTR_TIME_SET_ZERO(shard->receive_last_log_time);
|
||||
shard->receive_logged = false;
|
||||
MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ typedef struct WalProposerConn
|
||||
* walprop_async_read */
|
||||
} WalProposerConn;
|
||||
|
||||
extern WalProposerConn *libpqwp_connect_start(char *conninfo);
|
||||
extern WalProposerConn *libpqwp_connect_start(const char *conninfo);
|
||||
extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
|
||||
extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
|
||||
extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
|
||||
|
||||
22
pgxn/neon/neon--1.5--1.6.sql
Normal file
22
pgxn/neon/neon--1.5--1.6.sql
Normal file
@@ -0,0 +1,22 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. \quit

-- Progress counters of the most recent local file cache prewarm.
CREATE FUNCTION get_prewarm_info(
    OUT total_pages integer,
    OUT prewarmed_pages integer,
    OUT skipped_pages integer,
    OUT active_workers integer
)
RETURNS record
AS 'MODULE_PATHNAME', 'get_prewarm_info'
LANGUAGE C
STRICT
PARALLEL SAFE;

-- Snapshot of the local file cache contents; NULL max_chunks selects the
-- configured default limit. Not STRICT so that a NULL argument reaches the C code.
CREATE FUNCTION get_local_cache_state(max_chunks integer DEFAULT NULL)
RETURNS bytea
AS 'MODULE_PATHNAME', 'get_local_cache_state'
LANGUAGE C
PARALLEL UNSAFE;

-- Load the pages described by a snapshot into the local file cache using
-- n_workers background workers.
CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer DEFAULT 1)
RETURNS void
AS 'MODULE_PATHNAME', 'prewarm_local_cache'
LANGUAGE C
STRICT
PARALLEL UNSAFE;
|
||||
|
||||
|
||||
|
||||
7
pgxn/neon/neon--1.6--1.5.sql
Normal file
7
pgxn/neon/neon--1.6--1.5.sql
Normal file
@@ -0,0 +1,7 @@
|
||||
-- Downgrade from 1.6 to 1.5: remove the prewarm functions added by neon--1.5--1.6.sql.
DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer);

DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);

-- DROP FUNCTION identifies the function by argument types only; a DEFAULT
-- clause is not part of its argument syntax.
DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer);
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "storage/buf_internals.h"
|
||||
@@ -396,9 +397,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn,
|
||||
XLogRecPtr
|
||||
neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks)
|
||||
{
|
||||
if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0)
|
||||
if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0)
|
||||
return lsn;
|
||||
|
||||
Assert(lsn >= WalSegMinSize);
|
||||
LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
|
||||
lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks);
|
||||
LWLockRelease(LastWrittenLsnLock);
|
||||
@@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode,
|
||||
NInfoGetRelNumber(relfilenode) == InvalidOid)
|
||||
return InvalidXLogRecPtr;
|
||||
|
||||
|
||||
BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid);
|
||||
|
||||
LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
|
||||
@@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode,
|
||||
{
|
||||
XLogRecPtr lsn = lsns[i];
|
||||
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
continue;
|
||||
|
||||
Assert(lsn >= WalSegMinSize);
|
||||
key.blockNum = blockno + i;
|
||||
entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
|
||||
if (found)
|
||||
|
||||
@@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram,
|
||||
static metric_t *
|
||||
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
|
||||
{
|
||||
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10)
|
||||
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12)
|
||||
metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
|
||||
int i = 0;
|
||||
|
||||
@@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
|
||||
|
||||
APPEND_METRIC(getpage_prefetch_requests_total);
|
||||
APPEND_METRIC(getpage_sync_requests_total);
|
||||
APPEND_METRIC(compute_getpage_stuck_requests_total);
|
||||
APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms);
|
||||
APPEND_METRIC(getpage_prefetch_misses_total);
|
||||
APPEND_METRIC(getpage_prefetch_discards_total);
|
||||
APPEND_METRIC(pageserver_requests_sent_total);
|
||||
@@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
|
||||
totals.file_cache_hits_total += counters->file_cache_hits_total;
|
||||
histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
|
||||
histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
|
||||
|
||||
totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total;
|
||||
totals.compute_getpage_max_inflight_stuck_time_ms = Max(
|
||||
totals.compute_getpage_max_inflight_stuck_time_ms,
|
||||
counters->compute_getpage_max_inflight_stuck_time_ms);
|
||||
}
|
||||
|
||||
metrics = neon_perf_counters_to_metrics(&totals);
|
||||
|
||||
@@ -57,6 +57,18 @@ typedef struct
|
||||
uint64 getpage_prefetch_requests_total;
|
||||
uint64 getpage_sync_requests_total;
|
||||
|
||||
/*
|
||||
* Total number of Getpage requests left without an answer for more than
|
||||
* pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout
|
||||
*/
|
||||
uint64 compute_getpage_stuck_requests_total;
|
||||
|
||||
/*
|
||||
* Longest waiting time for active stuck requests. If a stuck request gets a
|
||||
* response or disconnects, this metric is updated
|
||||
*/
|
||||
uint64 compute_getpage_max_inflight_stuck_time_ms;
|
||||
|
||||
/*
|
||||
* Total number of readahead misses; consisting of either prefetches that
|
||||
* don't satisfy the LSN bounds, or cases where no readahead was issued
|
||||
|
||||
@@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state)
|
||||
* fetched from timeline 'tli'.
|
||||
*
|
||||
* Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
|
||||
* occurs, in which case 'err' has the desciption. Error always closes remote
|
||||
* occurs, in which case 'err' has the description. Error always closes remote
|
||||
* connection, if there was any, so socket subscription should be removed.
|
||||
*
|
||||
* NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
|
||||
|
||||
@@ -1989,8 +1989,14 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
/*
|
||||
* We have to disable this check for pg14-16 because a sorted build of a GiST index needs
|
||||
* to perform unlogged build several times
|
||||
*/
|
||||
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
|
||||
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
#endif
|
||||
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
|
||||
|
||||
@@ -35,9 +35,11 @@
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include <string.h>
|
||||
#include <sys/resource.h>
|
||||
|
||||
#include "postgres.h"
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
@@ -90,9 +92,8 @@ static void MembershipConfigurationFree(MembershipConfiguration *mconf);
|
||||
WalProposer *
|
||||
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
{
|
||||
char *host;
|
||||
char *connstring;
|
||||
char *sep;
|
||||
char *port;
|
||||
WalProposer *wp;
|
||||
|
||||
wp = palloc0(sizeof(WalProposer));
|
||||
@@ -103,71 +104,122 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
wp->mconf.members.len = 0;
|
||||
wp->mconf.new_members.len = 0;
|
||||
|
||||
wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
|
||||
wp_log(LOG, "neon.safekeeper_connstrings=%s", wp->config->safekeeper_connstrings);
|
||||
|
||||
/*
|
||||
* If safekeepers list starts with g# parse generation number followed by
|
||||
* :
|
||||
*/
|
||||
if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0)
|
||||
if (strncmp(wp->config->safekeeper_connstrings, "g#", 2) == 0)
|
||||
{
|
||||
char *endptr;
|
||||
|
||||
errno = 0;
|
||||
wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10);
|
||||
wp->safekeepers_generation = strtoul(wp->config->safekeeper_connstrings + 2, &endptr, 10);
|
||||
if (errno != 0)
|
||||
{
|
||||
wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
|
||||
wp_log(FATAL, "failed to parse neon.safekeeper_connstrings generation number: %m");
|
||||
}
|
||||
/* Skip past : to the first hostname. */
|
||||
host = endptr + 1;
|
||||
connstring = endptr + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
host = wp->config->safekeepers_list;
|
||||
wp->safekeepers_generation = INVALID_GENERATION;
|
||||
connstring = wp->config->safekeeper_connstrings;
|
||||
}
|
||||
wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation);
|
||||
|
||||
for (; host != NULL && *host != '\0'; host = sep)
|
||||
for (; connstring != NULL && *connstring != '\0'; connstring = sep)
|
||||
{
|
||||
port = strchr(host, ':');
|
||||
if (port == NULL)
|
||||
{
|
||||
wp_log(FATAL, "port is not specified");
|
||||
}
|
||||
*port++ = '\0';
|
||||
sep = strchr(port, ',');
|
||||
char *port;
|
||||
char *errmsg;
|
||||
Safekeeper *sk;
|
||||
PQconninfoOption *conninfo_options, *option;
|
||||
|
||||
sk = &wp->safekeeper[wp->n_safekeepers];
|
||||
|
||||
sep = strchr(connstring, ',');
|
||||
if (sep != NULL)
|
||||
*sep++ = '\0';
|
||||
if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
wp_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
wp->safekeeper[wp->n_safekeepers].host = host;
|
||||
wp->safekeeper[wp->n_safekeepers].port = port;
|
||||
wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
|
||||
wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND;
|
||||
wp->safekeeper[wp->n_safekeepers].wp = wp;
|
||||
|
||||
/* TODO(tristan957): Remove this compatibility code and only keep the
|
||||
* else branch.
|
||||
*
|
||||
* Check if we have a connection string formatted as host:port, and use
|
||||
* the old parsing code.
|
||||
*/
|
||||
port = strchr(connstring, ':');
|
||||
if (port)
|
||||
{
|
||||
sk->host = connstring;
|
||||
*port++ = '\0';
|
||||
sk->port = port;
|
||||
}
|
||||
else
|
||||
{
|
||||
conninfo_options = PQconninfoParse(connstring, &errmsg);
|
||||
if (!conninfo_options)
|
||||
wp_log(FATAL, "invalid safekeeper connection string: %s", errmsg);
|
||||
|
||||
// Save off the host and port for identification purposes
|
||||
option = conninfo_options;
|
||||
while (option)
|
||||
{
|
||||
if (!option->keyword)
|
||||
break;
|
||||
|
||||
if (strcmp(option->keyword, "host") == 0)
|
||||
{
|
||||
sk->host = pstrdup(option->val);
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (strcmp(option->keyword, "port") == 0)
|
||||
{
|
||||
sk->port = pstrdup(option->val);
|
||||
goto end;
|
||||
}
|
||||
|
||||
end:
|
||||
// We've saved both the host and port, so we can skip iterating
|
||||
// the rest of the list
|
||||
if (sk->host && sk->port)
|
||||
break;
|
||||
|
||||
option++;
|
||||
}
|
||||
|
||||
PQconninfoFree(conninfo_options);
|
||||
conninfo_options = option = NULL;
|
||||
}
|
||||
|
||||
sk->state = SS_OFFLINE;
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
sk->wp = wp;
|
||||
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers];
|
||||
int written = 0;
|
||||
|
||||
written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
|
||||
written = snprintf(sk->conninfo, sizeof(sk->conninfo),
|
||||
"host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
|
||||
sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
if (written > sizeof(sk->conninfo) || written < 0)
|
||||
wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
|
||||
}
|
||||
|
||||
initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
|
||||
wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
|
||||
wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
|
||||
initStringInfo(&sk->outbuf);
|
||||
sk->startStreamingAt = InvalidXLogRecPtr;
|
||||
sk->streamingAt = InvalidXLogRecPtr;
|
||||
wp->n_safekeepers += 1;
|
||||
}
|
||||
if (wp->n_safekeepers < 1)
|
||||
{
|
||||
wp_log(FATAL, "safekeepers addresses are not specified");
|
||||
wp_log(FATAL, "safekeepers connection strings are not specified");
|
||||
}
|
||||
wp->quorum = wp->n_safekeepers / 2 + 1;
|
||||
|
||||
@@ -756,7 +808,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
SafekeeperId *sk_id = &wp->mconf.members.m[i];
|
||||
|
||||
if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
|
||||
if (sk_id->node_id == sk->greetResponse.nodeId)
|
||||
{
|
||||
/*
|
||||
* If mconf or list of safekeepers to connect to changed (the
|
||||
@@ -781,7 +833,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
|
||||
|
||||
if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
|
||||
if (sk_id->node_id == sk->greetResponse.nodeId)
|
||||
{
|
||||
if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
|
||||
{
|
||||
@@ -836,7 +888,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
|
||||
{
|
||||
uint32 n_greeted = 0;
|
||||
|
||||
for (uint32 i = 0; i < wp->mconf.members.len; i++)
|
||||
for (uint32 i = 0; i < mset->len; i++)
|
||||
{
|
||||
Safekeeper *sk = msk[i];
|
||||
|
||||
@@ -1071,7 +1123,6 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
/* ready for elected message */
|
||||
sk->state = SS_WAIT_ELECTED;
|
||||
|
||||
wp->n_votes++;
|
||||
/* Are we already elected? */
|
||||
if (wp->state == WPS_CAMPAIGN)
|
||||
{
|
||||
@@ -1106,7 +1157,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
|
||||
{
|
||||
uint32 n_votes = 0;
|
||||
|
||||
for (uint32 i = 0; i < wp->mconf.members.len; i++)
|
||||
for (uint32 i = 0; i < mset->len; i++)
|
||||
{
|
||||
Safekeeper *sk = msk[i];
|
||||
|
||||
|
||||
@@ -707,12 +707,11 @@ typedef struct WalProposerConfig
|
||||
char *neon_timeline;
|
||||
|
||||
/*
|
||||
* Comma-separated list of safekeepers, in the following format:
|
||||
* host1:port1,host2:port2,host3:port3
|
||||
* Comma-separated list of safekeeper connection strings
|
||||
*
|
||||
* This cstr should be editable.
|
||||
*/
|
||||
char *safekeepers_list;
|
||||
char *safekeeper_connstrings;
|
||||
|
||||
/*
|
||||
* WalProposer reconnects to offline safekeepers once in this interval.
|
||||
@@ -788,14 +787,15 @@ typedef struct WalProposer
|
||||
/*
|
||||
* Generation of the membership conf of which safekeepers[] are presumably
|
||||
* members. To make cplane life a bit easier and have more control in
|
||||
* tests with which sks walproposer gets connected neon.safekeepers GUC
|
||||
* doesn't provide full mconf, only the list of endpoints to connect to.
|
||||
* We still would like to know generation associated with it because 1) we
|
||||
* need some handle to enforce using generations in walproposer, and
|
||||
* non-zero value of this serves the purpose; 2) currently we don't do
|
||||
* that, but in theory walproposer can update list of safekeepers to
|
||||
* connect to upon receiving mconf from safekeepers, and generation number
|
||||
* must be checked to see which list is newer.
|
||||
* tests with which sks walproposer gets connected
|
||||
* neon.safekeeper_connstrings GUC doesn't provide full mconf, only the list
|
||||
* of endpoints to connect to. We still would like to know generation
|
||||
* associated with it because 1) we need some handle to enforce using
|
||||
* generations in walproposer, and non-zero value of this serves the
|
||||
* purpose; 2) currently we don't do that, but in theory walproposer can
|
||||
* update list of safekeepers to connect to upon receiving mconf from
|
||||
* safekeepers, and generation number must be checked to see which list is
|
||||
* newer.
|
||||
*/
|
||||
Generation safekeepers_generation;
|
||||
/* Number of occupied slots in safekeepers[] */
|
||||
@@ -845,9 +845,6 @@ typedef struct WalProposer
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
|
||||
/* number of votes collected from safekeepers */
|
||||
int n_votes;
|
||||
|
||||
/* number of successful connections over the lifetime of walproposer */
|
||||
int n_connected;
|
||||
|
||||
|
||||
@@ -63,7 +63,8 @@
|
||||
char *wal_acceptors_list = "";
|
||||
int wal_acceptor_reconnect_timeout = 1000;
|
||||
int wal_acceptor_connection_timeout = 10000;
|
||||
int safekeeper_proto_version = 2;
|
||||
char *safekeeper_connstrings = "";
|
||||
int safekeeper_proto_version = 3;
|
||||
|
||||
/* Set to true in the walproposer bgw. */
|
||||
static bool am_walproposer;
|
||||
@@ -81,6 +82,7 @@ static HotStandbyFeedback agg_hs_feedback;
|
||||
static void nwp_shmem_startup_hook(void);
|
||||
static void nwp_register_gucs(void);
|
||||
static void assign_neon_safekeepers(const char *newval, void *extra);
|
||||
static void assign_neon_safekeeper_connstrings(const char *newval, void *extra);
|
||||
static void nwp_prepare_shmem(void);
|
||||
static uint64 backpressure_lag_impl(void);
|
||||
static uint64 startup_backpressure_wrap(void);
|
||||
@@ -117,8 +119,11 @@ init_walprop_config(bool syncSafekeepers)
|
||||
{
|
||||
walprop_config.neon_tenant = neon_tenant;
|
||||
walprop_config.neon_timeline = neon_timeline;
|
||||
/* TODO(tristan957): Remove this compatibility code after the control plane
|
||||
* is updated to pass neon.safekeeper_connstrings
|
||||
*/
|
||||
/* WalProposerCreate scribbles directly on it, so pstrdup */
|
||||
walprop_config.safekeepers_list = pstrdup(wal_acceptors_list);
|
||||
walprop_config.safekeeper_connstrings = pstrdup(wal_acceptors_list[0] == '\0' ? safekeeper_connstrings : wal_acceptors_list);
|
||||
walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout;
|
||||
walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout;
|
||||
walprop_config.wal_segment_size = wal_segment_size;
|
||||
@@ -203,6 +208,15 @@ nwp_register_gucs(void)
|
||||
* GUC_LIST_QUOTE */
|
||||
NULL, assign_neon_safekeepers, NULL);
|
||||
|
||||
DefineCustomStringVariable(
|
||||
"neon.safekeeper_connstrings",
|
||||
"Comma-separated list of safekeeper connection strings with an optional generation prefix of the form g#X:",
|
||||
NULL,
|
||||
&safekeeper_connstrings,
|
||||
"",
|
||||
PGC_SIGHUP,
|
||||
GUC_LIST_INPUT,
|
||||
NULL, assign_neon_safekeeper_connstrings, NULL);
|
||||
DefineCustomIntVariable(
|
||||
"neon.safekeeper_reconnect_timeout",
|
||||
"Walproposer reconnects to offline safekeepers once in this interval.",
|
||||
@@ -228,7 +242,7 @@ nwp_register_gucs(void)
|
||||
"Version of compute <-> safekeeper protocol.",
|
||||
"Used while migrating from 2 to 3.",
|
||||
&safekeeper_proto_version,
|
||||
2, 0, INT_MAX,
|
||||
3, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
@@ -236,24 +250,24 @@ nwp_register_gucs(void)
|
||||
|
||||
|
||||
static int
|
||||
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
split_safekeeper_connstrings(char *connstrings, char *safekeepers[])
|
||||
{
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
char *curr_sk = connstrings;
|
||||
|
||||
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
|
||||
for (char *comma = connstrings; comma != NULL && *comma != '\0'; curr_sk = comma)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS)
|
||||
{
|
||||
wpg_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
|
||||
coma = strchr(coma, ',');
|
||||
comma = strchr(comma, ',');
|
||||
safekeepers[n_safekeepers - 1] = curr_sk;
|
||||
|
||||
if (coma != NULL)
|
||||
if (comma != NULL)
|
||||
{
|
||||
*coma++ = '\0';
|
||||
*comma++ = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,19 +275,19 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
}
|
||||
|
||||
/*
|
||||
* Accept two coma-separated strings with list of safekeeper host:port addresses.
|
||||
* Accept two comma-separated lists of safekeeper addresses (host:port pairs or connection strings).
|
||||
* Split them into arrays and return false if two sets do not match, ignoring the order.
|
||||
*/
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
safekeeper_connstrings_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
len_old = split_safekeeper_connstrings(old, safekeepers_old);
|
||||
len_new = split_safekeeper_connstrings(new, safekeepers_new);
|
||||
|
||||
if (len_old != len_new)
|
||||
{
|
||||
@@ -294,6 +308,44 @@ safekeepers_cmp(char *old, char *new)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* GUC assign_hook for neon.safekeeper_connstrings. Restarts walproposer through
|
||||
* FATAL if the list changed.
|
||||
*/
|
||||
static void
|
||||
assign_neon_safekeeper_connstrings(const char *newval, void *extra)
|
||||
{
|
||||
char *newval_copy;
|
||||
char *oldval;
|
||||
|
||||
if (!am_walproposer)
|
||||
return;
|
||||
|
||||
if (!newval)
|
||||
{
|
||||
/* should never happen */
|
||||
wpg_log(FATAL, "neon.safekeeper_connstrings is empty");
|
||||
}
|
||||
|
||||
/* Copy values because we will modify them in split_safekeeper_connstrings() */
|
||||
newval_copy = pstrdup(newval);
|
||||
oldval = pstrdup(safekeeper_connstrings);
|
||||
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit
|
||||
* and thus remove this delay. XXX: If you change anything here, sync with
|
||||
* test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (!safekeeper_connstrings_cmp(oldval, newval_copy))
|
||||
{
|
||||
wpg_log(FATAL, "restarting walproposer to change safekeeper list from \"%s\" to \"%s\"",
|
||||
safekeeper_connstrings, newval);
|
||||
}
|
||||
pfree(newval_copy);
|
||||
pfree(oldval);
|
||||
}
|
||||
|
||||
/*
|
||||
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
|
||||
* the list changed.
|
||||
@@ -323,7 +375,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
* and thus remove this delay. XXX: If you change anything here, sync with
|
||||
* test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
if (!safekeeper_connstrings_cmp(oldval, newval_copy))
|
||||
{
|
||||
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
|
||||
wal_acceptors_list, newval);
|
||||
@@ -500,8 +552,8 @@ walprop_register_bgworker(void)
|
||||
{
|
||||
BackgroundWorker bgw;
|
||||
|
||||
/* If no wal acceptors are specified, don't start the background worker. */
|
||||
if (*wal_acceptors_list == '\0')
|
||||
/* If no safekeepers are specified, don't start the background worker. */
|
||||
if (*safekeeper_connstrings == '\0')
|
||||
return;
|
||||
|
||||
memset(&bgw, 0, sizeof(bgw));
|
||||
@@ -841,7 +893,7 @@ walprop_status(Safekeeper *sk)
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
libpqwp_connect_start(char *conninfo)
|
||||
libpqwp_connect_start(const char *conninfo)
|
||||
{
|
||||
|
||||
PGconn *pg_conn;
|
||||
|
||||
@@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation
|
||||
(see end of this page on how to generate certs with openssl):
|
||||
|
||||
```
|
||||
./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
|
||||
LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
|
||||
```
|
||||
|
||||
If both postgres and proxy are running you may send a SQL query:
|
||||
@@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key
|
||||
|
||||
Then we need to build proxy with 'testing' feature and run, e.g.:
|
||||
```sh
|
||||
RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
|
||||
RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
|
||||
```
|
||||
|
||||
Now, from the client, you can start a new session:
|
||||
|
||||
@@ -409,14 +409,22 @@ impl JwkCacheEntryLock {
|
||||
|
||||
if let Some(exp) = payload.expiration {
|
||||
if now >= exp + CLOCK_SKEW_LEEWAY {
|
||||
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired));
|
||||
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
|
||||
exp.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(nbf) = payload.not_before {
|
||||
if nbf >= now + CLOCK_SKEW_LEEWAY {
|
||||
return Err(JwtError::InvalidClaims(
|
||||
JwtClaimsError::JwtTokenNotYetReadyToUse,
|
||||
JwtClaimsError::JwtTokenNotYetReadyToUse(
|
||||
nbf.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs(),
|
||||
),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -534,10 +542,10 @@ struct JwtPayload<'a> {
|
||||
#[serde(rename = "aud", default)]
|
||||
audience: OneOrMany,
|
||||
/// Expiration - Time after which the JWT expires
|
||||
#[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
|
||||
#[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)]
|
||||
expiration: Option<SystemTime>,
|
||||
/// Not before - Time after which the JWT expires
|
||||
#[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
|
||||
/// Not before - Time before which the JWT is not valid
|
||||
#[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)]
|
||||
not_before: Option<SystemTime>,
|
||||
|
||||
// the following entries are only extracted for the sake of debug logging.
|
||||
@@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany {
|
||||
}
|
||||
|
||||
fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
|
||||
let d = <Option<u64>>::deserialize(d)?;
|
||||
Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
|
||||
<Option<u64>>::deserialize(d)?
|
||||
.map(|t| {
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_secs(t))
|
||||
.ok_or_else(|| {
|
||||
serde::de::Error::custom(format_args!("timestamp out of bounds: {t}"))
|
||||
})
|
||||
})
|
||||
.transpose()
|
||||
}
|
||||
|
||||
struct JwkRenewalPermit<'a> {
|
||||
@@ -746,11 +761,11 @@ pub enum JwtClaimsError {
|
||||
#[error("invalid JWT token audience")]
|
||||
InvalidJwtTokenAudience,
|
||||
|
||||
#[error("JWT token has expired")]
|
||||
JwtTokenHasExpired,
|
||||
#[error("JWT token has expired (exp={0})")]
|
||||
JwtTokenHasExpired(u64),
|
||||
|
||||
#[error("JWT token is not yet ready to use")]
|
||||
JwtTokenNotYetReadyToUse,
|
||||
#[error("JWT token is not yet ready to use (nbf={0})")]
|
||||
JwtTokenNotYetReadyToUse(u64),
|
||||
}
|
||||
|
||||
#[allow(dead_code, reason = "Debug use only")]
|
||||
@@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
"nbf": now + 60,
|
||||
"aud": "neon",
|
||||
}},
|
||||
error: JwtClaimsError::JwtTokenNotYetReadyToUse,
|
||||
error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60),
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"exp": now - 60,
|
||||
"aud": ["neon"],
|
||||
}},
|
||||
error: JwtClaimsError::JwtTokenHasExpired,
|
||||
error: JwtClaimsError::JwtTokenHasExpired(now - 60),
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
|
||||
@@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError {
|
||||
option: EndpointId,
|
||||
},
|
||||
|
||||
#[error(
|
||||
"Common name inferred from SNI ('{}') is not known",
|
||||
.cn,
|
||||
)]
|
||||
UnknownCommonName { cn: String },
|
||||
|
||||
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
|
||||
MalformedProjectName(EndpointId),
|
||||
}
|
||||
@@ -66,22 +60,15 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn endpoint_sni(
|
||||
sni: &str,
|
||||
common_names: &HashSet<String>,
|
||||
) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
|
||||
let Some((subdomain, common_name)) = sni.split_once('.') else {
|
||||
return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
|
||||
};
|
||||
pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet<String>) -> Option<EndpointId> {
|
||||
let (subdomain, common_name) = sni.split_once('.')?;
|
||||
if !common_names.contains(common_name) {
|
||||
return Err(ComputeUserInfoParseError::UnknownCommonName {
|
||||
cn: common_name.into(),
|
||||
});
|
||||
return None;
|
||||
}
|
||||
if subdomain == SERVERLESS_DRIVER_SNI {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
Ok(Some(EndpointId::from(subdomain)))
|
||||
Some(EndpointId::from(subdomain))
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
@@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
})
|
||||
.map(|name| name.into());
|
||||
|
||||
let endpoint_from_domain = if let Some(sni_str) = sni {
|
||||
if let Some(cn) = common_names {
|
||||
endpoint_sni(sni_str, cn)?
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let endpoint_from_domain =
|
||||
sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn)));
|
||||
|
||||
let endpoint = match (endpoint_option, endpoint_from_domain) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
@@ -424,21 +404,34 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_inconsistent_sni() {
|
||||
fn parse_unknown_sni() {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
UnknownCommonName { cn } => {
|
||||
assert_eq!(cn, "localhost");
|
||||
}
|
||||
_ => panic!("bad error: {err:?}"),
|
||||
}
|
||||
let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.unwrap();
|
||||
|
||||
assert!(info.endpoint_id.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_unknown_sni_with_options() {
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("options", "endpoint=foo-bar-baz-1234"),
|
||||
]);
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -132,11 +132,10 @@ impl Drop for LoggingGuard {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: make JSON the default
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
|
||||
enum LogFormat {
|
||||
Text,
|
||||
#[default]
|
||||
Text = 1,
|
||||
Json,
|
||||
}
|
||||
|
||||
|
||||
@@ -24,9 +24,6 @@ pub(crate) enum HandshakeError {
|
||||
#[error("protocol violation")]
|
||||
ProtocolViolation,
|
||||
|
||||
#[error("missing certificate")]
|
||||
MissingCertificate,
|
||||
|
||||
#[error("{0}")]
|
||||
StreamUpgradeError(#[from] StreamUpgradeError),
|
||||
|
||||
@@ -42,10 +39,6 @@ impl ReportableError for HandshakeError {
|
||||
match self {
|
||||
HandshakeError::EarlyData => crate::error::ErrorKind::User,
|
||||
HandshakeError::ProtocolViolation => crate::error::ErrorKind::User,
|
||||
// This error should not happen, but will if we have no default certificate and
|
||||
// the client sends no SNI extension.
|
||||
// If they provide SNI then we can be sure there is a certificate that matches.
|
||||
HandshakeError::MissingCertificate => crate::error::ErrorKind::Service,
|
||||
HandshakeError::StreamUpgradeError(upgrade) => match upgrade {
|
||||
StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service,
|
||||
StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
@@ -146,7 +139,7 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
// try parse endpoint
|
||||
let ep = conn_info
|
||||
.server_name()
|
||||
.and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten());
|
||||
.and_then(|sni| endpoint_sni(sni, &tls.common_names));
|
||||
if let Some(ep) = ep {
|
||||
ctx.set_endpoint_id(ep);
|
||||
}
|
||||
@@ -161,10 +154,8 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
}
|
||||
|
||||
let (_, tls_server_end_point) = tls
|
||||
.cert_resolver
|
||||
.resolve(conn_info.server_name())
|
||||
.ok_or(HandshakeError::MissingCertificate)?;
|
||||
let (_, tls_server_end_point) =
|
||||
tls.cert_resolver.resolve(conn_info.server_name());
|
||||
|
||||
stream = PqStream {
|
||||
framed: Framed {
|
||||
|
||||
@@ -98,8 +98,7 @@ fn generate_tls_config<'a>(
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(vec![cert.clone()], key.clone_key())?;
|
||||
|
||||
let mut cert_resolver = CertResolver::new();
|
||||
cert_resolver.add_cert(key, vec![cert], true)?;
|
||||
let cert_resolver = CertResolver::new(key, vec![cert])?;
|
||||
|
||||
let common_names = cert_resolver.get_common_names();
|
||||
|
||||
|
||||
@@ -199,8 +199,7 @@ fn get_conn_info(
|
||||
let endpoint = match connection_url.host() {
|
||||
Some(url::Host::Domain(hostname)) => {
|
||||
if let Some(tls) = tls {
|
||||
endpoint_sni(hostname, &tls.common_names)?
|
||||
.ok_or(ConnInfoError::MalformedEndpoint)?
|
||||
endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)?
|
||||
} else {
|
||||
hostname
|
||||
.split_once('.')
|
||||
|
||||
@@ -5,6 +5,7 @@ use anyhow::{Context, bail};
|
||||
use itertools::Itertools;
|
||||
use rustls::crypto::ring::{self, sign};
|
||||
use rustls::pki_types::{CertificateDer, PrivateKeyDer};
|
||||
use rustls::sign::CertifiedKey;
|
||||
use x509_cert::der::{Reader, SliceReader};
|
||||
|
||||
use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint};
|
||||
@@ -25,10 +26,8 @@ pub fn configure_tls(
|
||||
certs_dir: Option<&String>,
|
||||
allow_tls_keylogfile: bool,
|
||||
) -> anyhow::Result<TlsConfig> {
|
||||
let mut cert_resolver = CertResolver::new();
|
||||
|
||||
// add default certificate
|
||||
cert_resolver.add_cert_path(key_path, cert_path, true)?;
|
||||
let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?;
|
||||
|
||||
// add extra certificates
|
||||
if let Some(certs_dir) = certs_dir {
|
||||
@@ -40,11 +39,8 @@ pub fn configure_tls(
|
||||
let key_path = path.join("tls.key");
|
||||
let cert_path = path.join("tls.crt");
|
||||
if key_path.exists() && cert_path.exists() {
|
||||
cert_resolver.add_cert_path(
|
||||
&key_path.to_string_lossy(),
|
||||
&cert_path.to_string_lossy(),
|
||||
false,
|
||||
)?;
|
||||
cert_resolver
|
||||
.add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -83,92 +79,42 @@ pub fn configure_tls(
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
#[derive(Debug)]
|
||||
pub struct CertResolver {
|
||||
certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
|
||||
default: Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
|
||||
default: (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint),
|
||||
}
|
||||
|
||||
impl CertResolver {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result<Self> {
|
||||
let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?;
|
||||
Self::new(priv_key, cert_chain)
|
||||
}
|
||||
|
||||
fn add_cert_path(
|
||||
&mut self,
|
||||
key_path: &str,
|
||||
cert_path: &str,
|
||||
is_default: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let priv_key = {
|
||||
let key_bytes = std::fs::read(key_path)
|
||||
.with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
rustls_pemfile::private_key(&mut &key_bytes[..])
|
||||
.with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
|
||||
.with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
|
||||
};
|
||||
pub fn new(
|
||||
priv_key: PrivateKeyDer<'static>,
|
||||
cert_chain: Vec<CertificateDer<'static>>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?;
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.try_collect()
|
||||
.with_context(|| {
|
||||
format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.")
|
||||
})?
|
||||
};
|
||||
|
||||
self.add_cert(priv_key, cert_chain, is_default)
|
||||
let mut certs = HashMap::new();
|
||||
let default = (cert.clone(), tls_server_end_point);
|
||||
certs.insert(common_name, (cert, tls_server_end_point));
|
||||
Ok(Self { certs, default })
|
||||
}
|
||||
|
||||
pub fn add_cert(
|
||||
fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
|
||||
let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?;
|
||||
self.add_cert(priv_key, cert_chain)
|
||||
}
|
||||
|
||||
fn add_cert(
|
||||
&mut self,
|
||||
priv_key: PrivateKeyDer<'static>,
|
||||
cert_chain: Vec<CertificateDer<'static>>,
|
||||
is_default: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
|
||||
|
||||
let first_cert = &cert_chain[0];
|
||||
let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
|
||||
|
||||
let certificate = SliceReader::new(first_cert)
|
||||
.context("Failed to parse cerficiate")?
|
||||
.decode::<x509_cert::Certificate>()
|
||||
.context("Failed to parse cerficiate")?;
|
||||
|
||||
let common_name = certificate.tbs_certificate.subject.to_string();
|
||||
|
||||
// We need to get the canonical name for this certificate so we can match them against any domain names
|
||||
// seen within the proxy codebase.
|
||||
//
|
||||
// In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI.
|
||||
// We need to remove the wildcard prefix for the purposes of certificate selection.
|
||||
//
|
||||
// auth-broker does not use SNI and instead uses the Neon-Connection-String header.
|
||||
// Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
|
||||
//
|
||||
// Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
|
||||
// validation, so we can continue with any common-name
|
||||
let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
|
||||
s.to_string()
|
||||
} else if let Some(s) = common_name.strip_prefix("CN=apiauth.") {
|
||||
s.to_string()
|
||||
} else if let Some(s) = common_name.strip_prefix("CN=") {
|
||||
s.to_string()
|
||||
} else {
|
||||
bail!("Failed to parse common name from certificate")
|
||||
};
|
||||
|
||||
let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));
|
||||
|
||||
if is_default {
|
||||
self.default = Some((cert.clone(), tls_server_end_point));
|
||||
}
|
||||
|
||||
let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?;
|
||||
self.certs.insert(common_name, (cert, tls_server_end_point));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -177,12 +123,82 @@ impl CertResolver {
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_key_cert(
|
||||
key_path: &str,
|
||||
cert_path: &str,
|
||||
) -> anyhow::Result<(PrivateKeyDer<'static>, Vec<CertificateDer<'static>>)> {
|
||||
let priv_key = {
|
||||
let key_bytes = std::fs::read(key_path)
|
||||
.with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
rustls_pemfile::private_key(&mut &key_bytes[..])
|
||||
.with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
|
||||
.with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
|
||||
};
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.try_collect()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||
)
|
||||
})?
|
||||
};
|
||||
|
||||
Ok((priv_key, cert_chain))
|
||||
}
|
||||
|
||||
fn process_key_cert(
|
||||
priv_key: PrivateKeyDer<'static>,
|
||||
cert_chain: Vec<CertificateDer<'static>>,
|
||||
) -> anyhow::Result<(String, Arc<CertifiedKey>, TlsServerEndPoint)> {
|
||||
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
|
||||
|
||||
let first_cert = &cert_chain[0];
|
||||
let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
|
||||
|
||||
let certificate = SliceReader::new(first_cert)
|
||||
.context("Failed to parse cerficiate")?
|
||||
.decode::<x509_cert::Certificate>()
|
||||
.context("Failed to parse cerficiate")?;
|
||||
|
||||
let common_name = certificate.tbs_certificate.subject.to_string();
|
||||
|
||||
// We need to get the canonical name for this certificate so we can match them against any domain names
|
||||
// seen within the proxy codebase.
|
||||
//
|
||||
// In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI.
|
||||
// We need to remove the wildcard prefix for the purposes of certificate selection.
|
||||
//
|
||||
// auth-broker does not use SNI and instead uses the Neon-Connection-String header.
|
||||
// Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
|
||||
//
|
||||
// Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
|
||||
// validation, so we can continue with any common-name
|
||||
let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
|
||||
s.to_string()
|
||||
} else if let Some(s) = common_name.strip_prefix("CN=apiauth.") {
|
||||
s.to_string()
|
||||
} else if let Some(s) = common_name.strip_prefix("CN=") {
|
||||
s.to_string()
|
||||
} else {
|
||||
bail!("Failed to parse common name from certificate")
|
||||
};
|
||||
|
||||
let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));
|
||||
|
||||
Ok((common_name, cert, tls_server_end_point))
|
||||
}
|
||||
|
||||
impl rustls::server::ResolvesServerCert for CertResolver {
|
||||
fn resolve(
|
||||
&self,
|
||||
client_hello: rustls::server::ClientHello<'_>,
|
||||
) -> Option<Arc<rustls::sign::CertifiedKey>> {
|
||||
self.resolve(client_hello.server_name()).map(|x| x.0)
|
||||
Some(self.resolve(client_hello.server_name()).0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,7 +206,7 @@ impl CertResolver {
|
||||
pub fn resolve(
|
||||
&self,
|
||||
server_name: Option<&str>,
|
||||
) -> Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)> {
|
||||
) -> (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint) {
|
||||
// loop here and cut off more and more subdomains until we find
|
||||
// a match to get a proper wildcard support. OTOH, we now do not
|
||||
// use nested domains, so keep this simple for now.
|
||||
@@ -200,12 +216,17 @@ impl CertResolver {
|
||||
if let Some(mut sni_name) = server_name {
|
||||
loop {
|
||||
if let Some(cert) = self.certs.get(sni_name) {
|
||||
return Some(cert.clone());
|
||||
return cert.clone();
|
||||
}
|
||||
if let Some((_, rest)) = sni_name.split_once('.') {
|
||||
sni_name = rest;
|
||||
} else {
|
||||
return None;
|
||||
// The customer has some custom DNS mapping - just return
|
||||
// a default certificate.
|
||||
//
|
||||
// This will error if the customer uses anything stronger
|
||||
// than sslmode=require. That's a choice they can make.
|
||||
return self.default.clone();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
//
|
||||
// Main entry point for the safekeeper executable
|
||||
//
|
||||
use std::env::{VarError, var};
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::str::FromStr;
|
||||
@@ -354,29 +353,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
// Load JWT auth token to connect to other safekeepers for pull_timeline.
|
||||
// First check if the env var is present, then check the arg with the path.
|
||||
// We want to deprecate and remove the env var method in the future.
|
||||
let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
info!("loaded JWT token for authentication with safekeepers");
|
||||
Some(SecretString::from(v))
|
||||
}
|
||||
Err(VarError::NotPresent) => {
|
||||
if let Some(auth_token_path) = args.auth_token_path.as_ref() {
|
||||
info!(
|
||||
"loading JWT token for authentication with safekeepers from {auth_token_path}"
|
||||
);
|
||||
let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
|
||||
Some(SecretString::from(auth_token.trim().to_owned()))
|
||||
} else {
|
||||
info!("no JWT token for authentication with safekeepers detected");
|
||||
None
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("JWT token for authentication with safekeepers is not unicode");
|
||||
None
|
||||
}
|
||||
let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() {
|
||||
info!("loading JWT token for authentication with safekeepers from {auth_token_path}");
|
||||
let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
|
||||
Some(SecretString::from(auth_token.trim().to_owned()))
|
||||
} else {
|
||||
info!("no JWT token for authentication with safekeepers detected");
|
||||
None
|
||||
};
|
||||
|
||||
let ssl_ca_certs = match args.ssl_ca_file.as_ref() {
|
||||
|
||||
@@ -401,7 +401,10 @@ pub async fn handle_request(
|
||||
request.timeline_id,
|
||||
));
|
||||
if existing_tli.is_ok() {
|
||||
bail!("Timeline {} already exists", request.timeline_id);
|
||||
info!("Timeline {} already exists", request.timeline_id);
|
||||
return Ok(PullTimelineResponse {
|
||||
safekeeper_host: None,
|
||||
});
|
||||
}
|
||||
|
||||
let mut http_client = reqwest::Client::builder();
|
||||
@@ -425,8 +428,25 @@ pub async fn handle_request(
|
||||
|
||||
let mut statuses = Vec::new();
|
||||
for (i, response) in responses.into_iter().enumerate() {
|
||||
let status = response.context(format!("fetching status from {}", http_hosts[i]))?;
|
||||
statuses.push((status, i));
|
||||
match response {
|
||||
Ok(status) => {
|
||||
statuses.push((status, i));
|
||||
}
|
||||
Err(e) => {
|
||||
info!("error fetching status from {}: {e}", http_hosts[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allow missing responses from up to one safekeeper (say due to downtime)
|
||||
// e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
|
||||
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
|
||||
let min_required_successful = (http_hosts.len() - 1).max(1);
|
||||
if statuses.len() < min_required_successful {
|
||||
bail!(
|
||||
"only got {} successful status responses. required: {min_required_successful}",
|
||||
statuses.len()
|
||||
)
|
||||
}
|
||||
|
||||
// Find the most advanced safekeeper
|
||||
@@ -536,6 +556,6 @@ async fn pull_timeline(
|
||||
.await?;
|
||||
|
||||
Ok(PullTimelineResponse {
|
||||
safekeeper_host: host,
|
||||
safekeeper_host: Some(host),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ use crate::metrics::{
|
||||
WAL_RECEIVERS,
|
||||
};
|
||||
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::timeline::{TimelineError, WalResidentTimeline};
|
||||
|
||||
const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
|
||||
|
||||
@@ -357,9 +357,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
|
||||
.await
|
||||
.context("create timeline")?
|
||||
} else {
|
||||
self.global_timelines
|
||||
.get(self.ttid)
|
||||
.context("get timeline")?
|
||||
let timeline_res = self.global_timelines.get(self.ttid);
|
||||
match timeline_res {
|
||||
Ok(tl) => tl,
|
||||
Err(TimelineError::NotFound(_)) => {
|
||||
return Err(CopyStreamHandlerEnd::TimelineNoCreate);
|
||||
}
|
||||
other => other.context("get_timeline")?,
|
||||
}
|
||||
};
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ impl WalProposer {
|
||||
|
||||
let config = Config {
|
||||
ttid,
|
||||
safekeepers_list: addrs,
|
||||
safekeeper_connstrings: addrs,
|
||||
safekeeper_reconnect_timeout: 1000,
|
||||
safekeeper_connection_timeout: 5000,
|
||||
sync_safekeepers,
|
||||
|
||||
@@ -207,7 +207,7 @@ impl SimulationApi {
|
||||
// initialize connection state for each safekeeper
|
||||
let sk_conns = args
|
||||
.config
|
||||
.safekeepers_list
|
||||
.safekeeper_connstrings
|
||||
.iter()
|
||||
.map(|s| {
|
||||
SafekeeperConn::new(
|
||||
|
||||
@@ -19,7 +19,8 @@ use storage_controller::service::chaos_injector::ChaosInjector;
|
||||
use storage_controller::service::{
|
||||
Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
|
||||
MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
|
||||
PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service,
|
||||
PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
|
||||
SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service,
|
||||
};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -132,6 +133,10 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
priority_reconciler_concurrency: Option<usize>,
|
||||
|
||||
/// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper)
|
||||
#[arg(long)]
|
||||
safekeeper_reconciler_concurrency: Option<usize>,
|
||||
|
||||
/// Tenant API rate limit, as requests per second per tenant.
|
||||
#[arg(long, default_value = "10")]
|
||||
tenant_rate_limit: NonZeroU32,
|
||||
@@ -403,6 +408,9 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
priority_reconciler_concurrency: args
|
||||
.priority_reconciler_concurrency
|
||||
.unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT),
|
||||
safekeeper_reconciler_concurrency: args
|
||||
.safekeeper_reconciler_concurrency
|
||||
.unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT),
|
||||
tenant_rate_limit: args.tenant_rate_limit,
|
||||
split_threshold: args.split_threshold,
|
||||
max_split_shards: args.max_split_shards,
|
||||
|
||||
@@ -194,6 +194,7 @@ pub(crate) enum LeadershipStatus {
|
||||
|
||||
pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
|
||||
pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
|
||||
pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
|
||||
|
||||
// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
|
||||
// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
|
||||
@@ -382,6 +383,9 @@ pub struct Config {
|
||||
/// How many high-priority Reconcilers may be spawned concurrently
|
||||
pub priority_reconciler_concurrency: usize,
|
||||
|
||||
/// How many safekeeper reconciles may happen concurrently (per safekeeper)
|
||||
pub safekeeper_reconciler_concurrency: usize,
|
||||
|
||||
/// How many API requests per second to allow per tenant, across all
|
||||
/// tenant-scoped API endpoints. Further API requests queue until ready.
|
||||
pub tenant_rate_limit: NonZeroU32,
|
||||
@@ -3659,7 +3663,7 @@ impl Service {
|
||||
locations: ShardMutationLocations,
|
||||
http_client: reqwest::Client,
|
||||
jwt: Option<String>,
|
||||
create_req: TimelineCreateRequest,
|
||||
mut create_req: TimelineCreateRequest,
|
||||
) -> Result<TimelineInfo, ApiError> {
|
||||
let latest = locations.latest.node;
|
||||
|
||||
@@ -3678,6 +3682,15 @@ impl Service {
|
||||
.await
|
||||
.map_err(|e| passthrough_api_error(&latest, e))?;
|
||||
|
||||
// If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use
|
||||
// the initdb generated by the latest location, rather than generating their own. This avoids racing uploads
|
||||
// of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries.
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode {
|
||||
*existing_initdb_timeline_id = Some(create_req.new_timeline_id);
|
||||
}
|
||||
}
|
||||
|
||||
// We propagate timeline creations to all attached locations such that a compute
|
||||
// for the new timeline is able to start regardless of the current state of the
|
||||
// tenant shard reconciliation.
|
||||
@@ -3720,6 +3733,10 @@ impl Service {
|
||||
// Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
|
||||
// use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard
|
||||
// that will get the first creation request, and propagate the LSN to all the >0 shards.
|
||||
//
|
||||
// This also enables non-zero shards to use the initdb that shard 0 generated and uploaded to S3, rather than
|
||||
// independently generating their own initdb. This guarantees that shards cannot end up with different initial
|
||||
// states if e.g. they have different postgres binary versions.
|
||||
let timeline_info = create_one(
|
||||
shard_zero_tid,
|
||||
shard_zero_locations,
|
||||
@@ -3729,11 +3746,16 @@ impl Service {
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Propagate the LSN that shard zero picked, if caller didn't provide one
|
||||
// Update the create request for shards > 0
|
||||
match &mut create_req.mode {
|
||||
models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => {
|
||||
// Propagate the LSN that shard zero picked, if caller didn't provide one
|
||||
*ancestor_start_lsn = timeline_info.ancestor_lsn;
|
||||
},
|
||||
models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } => {
|
||||
// For shards > 0, do not run initdb: use the one that shard 0 uploaded to S3
|
||||
*existing_initdb_timeline_id = Some(create_req.new_timeline_id)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
@@ -5159,7 +5181,8 @@ impl Service {
|
||||
}
|
||||
|
||||
// We don't expect any new_shard_count shards to exist here, but drop them just in case
|
||||
tenants.retain(|_id, s| s.shard.count != *new_shard_count);
|
||||
tenants
|
||||
.retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count));
|
||||
|
||||
detach_locations
|
||||
};
|
||||
|
||||
@@ -3,7 +3,10 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
|
||||
use clashmap::{ClashMap, Entry};
|
||||
use safekeeper_api::models::PullTimelineRequest;
|
||||
use safekeeper_client::mgmt_api;
|
||||
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
|
||||
use tokio::sync::{
|
||||
Semaphore,
|
||||
mpsc::{self, UnboundedReceiver, UnboundedSender},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
@@ -206,18 +209,27 @@ impl ReconcilerHandle {
|
||||
}
|
||||
|
||||
pub(crate) struct SafekeeperReconciler {
|
||||
service: Arc<Service>,
|
||||
inner: SafekeeperReconcilerInner,
|
||||
concurrency_limiter: Arc<Semaphore>,
|
||||
rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// Thin wrapper over `Service` to not clutter its inherent functions
|
||||
#[derive(Clone)]
|
||||
struct SafekeeperReconcilerInner {
|
||||
service: Arc<Service>,
|
||||
}
|
||||
|
||||
impl SafekeeperReconciler {
|
||||
fn spawn(cancel: CancellationToken, service: Arc<Service>) -> ReconcilerHandle {
|
||||
// We hold the ServiceInner lock, so we don't want sending to the reconciler channel to block.
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let concurrency = service.config.safekeeper_reconciler_concurrency;
|
||||
let mut reconciler = SafekeeperReconciler {
|
||||
service,
|
||||
inner: SafekeeperReconcilerInner { service },
|
||||
rx,
|
||||
concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
|
||||
cancel: cancel.clone(),
|
||||
};
|
||||
let handle = ReconcilerHandle {
|
||||
@@ -230,31 +242,44 @@ impl SafekeeperReconciler {
|
||||
}
|
||||
async fn run(&mut self) {
|
||||
loop {
|
||||
// TODO add parallelism with semaphore here
|
||||
let req = tokio::select! {
|
||||
req = self.rx.recv() => req,
|
||||
_ = self.cancel.cancelled() => break,
|
||||
};
|
||||
let Some((req, req_cancel)) = req else { break };
|
||||
|
||||
let permit_res = tokio::select! {
|
||||
req = self.concurrency_limiter.clone().acquire_owned() => req,
|
||||
_ = self.cancel.cancelled() => break,
|
||||
};
|
||||
let Ok(_permit) = permit_res else { return };
|
||||
|
||||
let inner = self.inner.clone();
|
||||
if req_cancel.is_cancelled() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let kind = req.kind;
|
||||
let tenant_id = req.tenant_id;
|
||||
let timeline_id = req.timeline_id;
|
||||
let node_id = req.safekeeper.skp.id;
|
||||
self.reconcile_one(req, req_cancel)
|
||||
.instrument(tracing::info_span!(
|
||||
"reconcile_one",
|
||||
?kind,
|
||||
%tenant_id,
|
||||
?timeline_id,
|
||||
%node_id,
|
||||
))
|
||||
.await;
|
||||
tokio::task::spawn(async move {
|
||||
let kind = req.kind;
|
||||
let tenant_id = req.tenant_id;
|
||||
let timeline_id = req.timeline_id;
|
||||
let node_id = req.safekeeper.skp.id;
|
||||
inner
|
||||
.reconcile_one(req, req_cancel)
|
||||
.instrument(tracing::info_span!(
|
||||
"reconcile_one",
|
||||
?kind,
|
||||
%tenant_id,
|
||||
?timeline_id,
|
||||
%node_id,
|
||||
))
|
||||
.await;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SafekeeperReconcilerInner {
|
||||
async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
|
||||
let req_host = req.safekeeper.skp.host.clone();
|
||||
match req.kind {
|
||||
@@ -281,10 +306,11 @@ impl SafekeeperReconciler {
|
||||
req,
|
||||
async |client| client.pull_timeline(&pull_req).await,
|
||||
|resp| {
|
||||
tracing::info!(
|
||||
"pulled timeline from {} onto {req_host}",
|
||||
resp.safekeeper_host,
|
||||
);
|
||||
if let Some(host) = resp.safekeeper_host {
|
||||
tracing::info!("pulled timeline from {host} onto {req_host}");
|
||||
} else {
|
||||
tracing::info!("timeline already present on safekeeper on {req_host}");
|
||||
}
|
||||
},
|
||||
req_cancel,
|
||||
)
|
||||
|
||||
@@ -1194,8 +1194,7 @@ class NeonEnv:
|
||||
else:
|
||||
cfg["broker"]["listen_addr"] = self.broker.listen_addr()
|
||||
|
||||
if self.control_plane_api is not None:
|
||||
cfg["control_plane_api"] = self.control_plane_api
|
||||
cfg["control_plane_api"] = self.control_plane_api
|
||||
|
||||
if self.control_plane_hooks_api is not None:
|
||||
cfg["control_plane_hooks_api"] = self.control_plane_hooks_api
|
||||
@@ -1280,7 +1279,8 @@ class NeonEnv:
|
||||
)
|
||||
|
||||
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
||||
tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests
|
||||
# This feature is pending rollout.
|
||||
# tenant_config["rel_size_v2_enabled"] = True
|
||||
|
||||
if self.pageserver_remote_storage is not None:
|
||||
ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import math # Add this import
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
@@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence(
|
||||
- insert additional tuples that by themselves are not enough to trigger auto-vacuum but in combination with the previous tuples are
|
||||
- verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension
|
||||
"""
|
||||
project = neon_api.create_project(pg_version)
|
||||
project = neon_api.create_project(
|
||||
pg_version,
|
||||
f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}",
|
||||
)
|
||||
project_id = project["project"]["id"]
|
||||
neon_api.wait_for_operation_to_finish(project_id)
|
||||
endpoint_id = project["endpoints"][0]["id"]
|
||||
|
||||
@@ -1,156 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
import timeit
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.utils import USE_LFC
|
||||
|
||||
|
||||
@pytest.mark.remote_cluster
|
||||
@pytest.mark.timeout(100000)
|
||||
@pytest.mark.parametrize("n_readers", [1, 2, 4, 8])
|
||||
@pytest.mark.parametrize("n_writers", [0, 1, 2, 4, 8])
|
||||
@pytest.mark.parametrize("chunk_size", [1, 8, 16])
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
def test_lfc_prefetch(neon_simple_env: NeonEnv, n_readers: int, n_writers: int, chunk_size: int):
|
||||
"""
|
||||
Test prefetch under different kinds of workload
|
||||
"""
|
||||
env = neon_simple_env
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
"neon.max_file_cache_size=100MB",
|
||||
"neon.file_cache_size_limit=100MB",
|
||||
"effective_io_concurrency=100",
|
||||
"shared_buffers=128MB",
|
||||
"enable_bitmapscan=off",
|
||||
"enable_seqscan=off",
|
||||
f"neon.file_cache_chunk_size={chunk_size}",
|
||||
"neon.store_prefetch_result_in_lfc=on",
|
||||
],
|
||||
)
|
||||
n_records = 100000 # 800Mb table
|
||||
top_n = n_records // 4 # 200Mb - should be larger than LFC size
|
||||
test_time = 100.0 # seconds
|
||||
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"create table account(id integer primary key, balance integer default 0, filler text default repeat('?',1000)) with (fillfactor=10)"
|
||||
)
|
||||
cur.execute(f"insert into account values (generate_series(1,{n_records}))")
|
||||
cur.execute("vacuum account")
|
||||
|
||||
def reader():
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
i = 0
|
||||
cur.execute("set statement_timeout=0")
|
||||
while running:
|
||||
cur.execute(
|
||||
f"select sum(balance) from (select balance from account order by id limit {top_n}) s"
|
||||
)
|
||||
sum = cur.fetchall()[0][0]
|
||||
assert sum == 0 # check consistency
|
||||
i += 1
|
||||
log.info(f"Did {i} index scans")
|
||||
|
||||
def writer():
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
i = 0
|
||||
cur.execute("set statement_timeout=0")
|
||||
while running:
|
||||
r1 = random.randint(1, top_n)
|
||||
r2 = random.randint(1, top_n)
|
||||
# avoid deadlock by ordering src and dst
|
||||
src = min(r1, r2)
|
||||
dst = max(r1, r2)
|
||||
cur.execute(
|
||||
f"update account set balance=balance-1 where id={src}; update account set balance=balance+1 where id={dst}"
|
||||
)
|
||||
i += 1
|
||||
log.info(f"Did {i} updates")
|
||||
|
||||
readers = [threading.Thread(target=reader) for _ in range(n_readers)]
|
||||
writers = [threading.Thread(target=writer) for _ in range(n_writers)]
|
||||
|
||||
running = True
|
||||
for t in readers:
|
||||
t.start()
|
||||
for t in writers:
|
||||
t.start()
|
||||
|
||||
time.sleep(test_time)
|
||||
running = False
|
||||
for t in readers:
|
||||
t.join()
|
||||
for t in writers:
|
||||
t.join()
|
||||
|
||||
|
||||
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
|
||||
def test_lfc_async_prefetch_performance(neon_simple_env: NeonEnv, zenbenchmark):
|
||||
"""
|
||||
Demonstrate performance advantages of storing prefetch results in LFC
|
||||
"""
|
||||
env = neon_simple_env
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
"neon.max_file_cache_size=100MB",
|
||||
"neon.file_cache_size_limit=100MB",
|
||||
"effective_io_concurrency=100",
|
||||
"shared_buffers=1MB",
|
||||
"enable_bitmapscan=off",
|
||||
"enable_seqscan=off",
|
||||
"autovacuum=off",
|
||||
],
|
||||
)
|
||||
n_records = 100000 # 800Mb table
|
||||
n_iterations = 1000
|
||||
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"create table account(id integer primary key, balance integer default 0, filler text default repeat('?',1000)) with (fillfactor=10)"
|
||||
)
|
||||
cur.execute(f"insert into account values (generate_series(1,{n_records}))")
|
||||
cur.execute("vacuum account")
|
||||
|
||||
start = timeit.default_timer()
|
||||
with zenbenchmark.record_duration("do_not_store_prefetch_results"):
|
||||
cur.execute("set neon.store_prefetch_result_in_lfc=off")
|
||||
for _ in range(n_iterations):
|
||||
cur.execute(
|
||||
"select sum(balance) from (select balance from account where id between 1000 and 2000 limit 100) s"
|
||||
)
|
||||
cur.execute(
|
||||
"select sum(balance) from (select balance from account where id between 6000 and 7000 limit 100) s"
|
||||
)
|
||||
end = timeit.default_timer()
|
||||
do_not_store_prefetch_results_duration = end - start
|
||||
|
||||
start = timeit.default_timer()
|
||||
with zenbenchmark.record_duration("store_prefetch_results"):
|
||||
cur.execute("set neon.store_prefetch_result_in_lfc=on")
|
||||
for _ in range(n_iterations):
|
||||
cur.execute(
|
||||
"select sum(balance) from (select balance from account where id between 1000 and 2000 limit 100) s"
|
||||
)
|
||||
cur.execute(
|
||||
"select sum(balance) from (select balance from account where id between 6000 and 7000 limit 100) s"
|
||||
)
|
||||
end = timeit.default_timer()
|
||||
store_prefetch_results_duration = end - start
|
||||
|
||||
assert do_not_store_prefetch_results_duration >= store_prefetch_results_duration
|
||||
@@ -62,7 +62,9 @@ def test_ro_replica_lag(
|
||||
|
||||
pgbench_duration = f"-T{test_duration_min * 60 * 2}"
|
||||
|
||||
project = neon_api.create_project(pg_version)
|
||||
project = neon_api.create_project(
|
||||
pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
|
||||
)
|
||||
project_id = project["project"]["id"]
|
||||
log.info("Project ID: %s", project_id)
|
||||
log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"])
|
||||
@@ -195,7 +197,9 @@ def test_replication_start_stop(
|
||||
pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}"
|
||||
error_occurred = False
|
||||
|
||||
project = neon_api.create_project(pg_version)
|
||||
project = neon_api.create_project(
|
||||
pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
|
||||
)
|
||||
project_id = project["project"]["id"]
|
||||
log.info("Project ID: %s", project_id)
|
||||
log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"])
|
||||
|
||||
@@ -206,7 +206,7 @@ class NeonProject:
|
||||
self.neon_api = neon_api
|
||||
self.pg_bin = pg_bin
|
||||
proj = self.neon_api.create_project(
|
||||
pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}"
|
||||
pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
|
||||
)
|
||||
self.id: str = proj["project"]["id"]
|
||||
self.name: str = proj["project"]["name"]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user