Compare commits

..

3 Commits

Author SHA1 Message Date
Jan Christian Grünhage
f5c1f749db nomerge: simulate run-kind=storage-rc-pr 2025-02-28 11:24:45 +01:00
Alexey Masterov
04a33e8af0 Change TEST_EXTENSIONS_TAG variable 2025-02-28 11:13:15 +01:00
Alexey Masterov
71f0235600 TEst extensions upgrade should work correctly if some neon images of compute tag are not accessible 2025-02-28 11:09:06 +01:00
238 changed files with 3387 additions and 10708 deletions

View File

@@ -32,6 +32,3 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID
- SLACK_ON_CALL_DEVPROD_STREAM
- SLACK_RUST_CHANNEL_ID

View File

@@ -84,13 +84,7 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
@@ -113,13 +107,13 @@ runs:
)
if [ -z "${reset_password}" ]; then
sleep $i
sleep 1
continue
fi
password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep $i # increasing backoff
sleep 1
continue
fi

View File

@@ -44,11 +44,6 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -64,7 +59,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
@@ -117,7 +112,6 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report

View File

@@ -280,7 +280,7 @@ jobs:
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
@@ -347,7 +347,6 @@ jobs:
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
sanitizers: ${{ inputs.sanitizers }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
@@ -360,6 +359,7 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -19,7 +19,7 @@ on:
description: "Tag of the last compute release"
value: ${{ jobs.tags.outputs.compute }}
run-kind:
description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`"
description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`"
value: ${{ jobs.tags.outputs.run-kind }}
permissions: {}
@@ -46,7 +46,7 @@ jobs:
env:
RUN_KIND: >-
${{
false
'storage-rc-pr'
|| (inputs.github-event-name == 'push' && github.ref_name == 'main') && 'push-main'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release') && 'storage-release'
|| (inputs.github-event-name == 'push' && github.ref_name == 'release-compute') && 'compute-release'
@@ -55,7 +55,6 @@ jobs:
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|| (inputs.github-event-name == 'pull_request') && 'pr'
|| (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch'
|| 'unknown'
}}
run: |
@@ -86,9 +85,6 @@ jobs:
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
workflow-dispatch)
echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT
;;
*)
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
exit 1

View File

@@ -141,8 +141,6 @@ jobs:
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

View File

@@ -692,15 +692,15 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
vm-compute-node-image-arch:
vm-compute-node-image:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: [ self-hosted, large ]
strategy:
fail-fast: false
matrix:
arch: [ amd64, arm64 ]
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
debian: bullseye
- pg: v15
@@ -717,7 +717,7 @@ jobs:
- name: Downloading vm-builder
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
chmod +x vm-builder
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -738,37 +738,12 @@ jobs:
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
-target-arch=linux/${{ matrix.arch }}
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-target-arch=linux/amd64
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
- pg: v15
- pg: v16
- pg: v17
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
@@ -1061,7 +1036,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:
@@ -1175,7 +1150,7 @@ jobs:
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}} \
@@ -1183,7 +1158,7 @@ jobs:
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=false \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.meta.outputs.build-tag}}
@@ -1231,7 +1206,7 @@ jobs:
payload: |
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
text: |
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:

View File

@@ -7,7 +7,7 @@ on:
required: false
type: string
schedule:
- cron: '0 10 * * *'
- cron: '0 0 * * *'
jobs:
cargo-deny:
@@ -50,9 +50,8 @@ jobs:
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
text: |
Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help.
Pinging <!subteam^S0838JPSH32|@oncall-devprod>.
Pinging @oncall-devprod.

View File

@@ -52,9 +52,8 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
env:
NEW_COMPUTE_TAG: latest
OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh

View File

@@ -1,147 +0,0 @@
name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: true
jobs:
oltp:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Branch for large tenant
if: ${{ matrix.target == 'new_branch' }}
id: create-neon-branch-oltp-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Periodic large oltp perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 */3 * * *' # Runs every 3 hours
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
@@ -78,10 +78,8 @@ jobs:
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
fi
- name: Start Bench with run_id
@@ -91,7 +89,7 @@ jobs:
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
- name: Poll Test Status
id: poll_step

View File

@@ -1,8 +1,8 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
# DevProd & PerfCorr
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute

307
Cargo.lock generated
View File

@@ -783,28 +783,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-extra"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
dependencies = [
"axum",
"axum-core",
"bytes",
"futures-util",
"headers",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"mime",
"pin-project-lite",
"serde",
"tower 0.5.2",
"tower-layer",
"tower-service",
]
[[package]]
name = "azure_core"
version = "0.21.0"
@@ -947,9 +925,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
[[package]]
name = "base64"
version = "0.21.7"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
[[package]]
name = "base64"
@@ -1127,9 +1105,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.2.16"
version = "1.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c"
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
dependencies = [
"jobserver",
"libc",
@@ -1327,7 +1305,6 @@ dependencies = [
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"axum-extra",
"base64 0.13.1",
"bytes",
"camino",
@@ -1339,7 +1316,6 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"notify",
@@ -2321,7 +2297,7 @@ name = "framed-websockets"
version = "0.1.0"
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"bytemuck",
"bytes",
"futures-core",
@@ -2434,9 +2410,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-timer"
version = "3.0.3"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
@@ -2539,27 +2515,6 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "governor"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
dependencies = [
"cfg-if",
"dashmap 6.1.0",
"futures-sink",
"futures-timer",
"futures-util",
"no-std-compat",
"nonzero_ext",
"parking_lot 0.12.1",
"portable-atomic",
"quanta",
"rand 0.8.5",
"smallvec",
"spinning_top",
]
[[package]]
name = "group"
version = "0.12.1"
@@ -2677,7 +2632,7 @@ version = "7.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"byteorder",
"crossbeam-channel",
"flate2",
@@ -2685,30 +2640,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "headers"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
dependencies = [
"base64 0.21.7",
"bytes",
"headers-core",
"http 1.1.0",
"httpdate",
"mime",
"sha1",
]
[[package]]
name = "headers-core"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.5.0"
@@ -2846,10 +2777,12 @@ name = "http-utils"
version = "0.1.0"
dependencies = [
"anyhow",
"backtrace",
"bytes",
"fail",
"futures",
"flate2",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"metrics",
@@ -2862,7 +2795,6 @@ dependencies = [
"serde_path_to_error",
"thiserror 1.0.69",
"tokio",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-util",
"tracing",
@@ -3349,9 +3281,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jemalloc_pprof"
version = "0.7.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
dependencies = [
"anyhow",
"libc",
@@ -3435,7 +3367,7 @@ version = "9.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"js-sys",
"pem",
"ring",
@@ -3550,9 +3482,9 @@ dependencies = [
[[package]]
name = "mappings"
version = "0.7.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
dependencies = [
"anyhow",
"libc",
@@ -3793,12 +3725,6 @@ dependencies = [
"memoffset 0.9.0",
]
[[package]]
name = "no-std-compat"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
[[package]]
name = "nom"
version = "7.1.3"
@@ -3809,12 +3735,6 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nonzero_ext"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
[[package]]
name = "notify"
version = "8.0.0"
@@ -4064,7 +3984,7 @@ dependencies = [
"opentelemetry-http",
"opentelemetry-proto",
"opentelemetry_sdk",
"prost 0.13.3",
"prost",
"reqwest",
"thiserror 1.0.69",
]
@@ -4077,7 +3997,7 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
"prost 0.13.3",
"prost",
"tonic",
]
@@ -4191,7 +4111,6 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"rand 0.8.5",
"reqwest",
"serde",
"serde_json",
"tokio",
@@ -4281,9 +4200,6 @@ dependencies = [
"remote_storage",
"reqwest",
"rpds",
"rustls 0.23.18",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"scopeguard",
"send-future",
"serde",
@@ -4302,7 +4218,6 @@ dependencies = [
"tokio-epoll-uring",
"tokio-io-timeout",
"tokio-postgres",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-tar",
"tokio-util",
@@ -4310,7 +4225,6 @@ dependencies = [
"tracing",
"url",
"utils",
"uuid",
"wal_decoder",
"walkdir",
"workspace_hack",
@@ -4393,9 +4307,9 @@ dependencies = [
[[package]]
name = "papaya"
version = "0.2.0"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
dependencies = [
"equivalent",
"seize",
@@ -4523,7 +4437,7 @@ version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"serde",
]
@@ -4677,12 +4591,6 @@ dependencies = [
"never-say-never",
]
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]]
name = "postgres"
version = "0.19.7"
@@ -4837,10 +4745,8 @@ dependencies = [
"nix 0.26.4",
"once_cell",
"parking_lot 0.12.1",
"prost 0.12.6",
"prost-build 0.12.6",
"prost-derive 0.12.6",
"sha2",
"protobuf",
"protobuf-codegen-pure",
"smallvec",
"symbolic-demangle",
"tempfile",
@@ -4849,17 +4755,15 @@ dependencies = [
[[package]]
name = "pprof_util"
version = "0.7.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
dependencies = [
"anyhow",
"backtrace",
"flate2",
"inferno 0.12.0",
"num",
"paste",
"prost 0.13.3",
"prost",
]
[[package]]
@@ -4952,16 +4856,6 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "prost"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29"
dependencies = [
"bytes",
"prost-derive 0.12.6",
]
[[package]]
name = "prost"
version = "0.13.3"
@@ -4969,28 +4863,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
dependencies = [
"bytes",
"prost-derive 0.13.3",
]
[[package]]
name = "prost-build"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck",
"itertools 0.10.5",
"log",
"multimap",
"once_cell",
"petgraph",
"prettyplease",
"prost 0.12.6",
"prost-types 0.12.6",
"regex",
"syn 2.0.90",
"tempfile",
"prost-derive",
]
[[package]]
@@ -5007,26 +4880,13 @@ dependencies = [
"once_cell",
"petgraph",
"prettyplease",
"prost 0.13.3",
"prost-types 0.13.3",
"prost",
"prost-types",
"regex",
"syn 2.0.90",
"tempfile",
]
[[package]]
name = "prost-derive"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
dependencies = [
"anyhow",
"itertools 0.10.5",
"proc-macro2",
"quote",
"syn 2.0.90",
]
[[package]]
name = "prost-derive"
version = "0.13.3"
@@ -5040,22 +4900,38 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "prost-types"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0"
dependencies = [
"prost 0.12.6",
]
[[package]]
name = "prost-types"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
dependencies = [
"prost 0.13.3",
"prost",
]
[[package]]
name = "protobuf"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
[[package]]
name = "protobuf-codegen"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6"
dependencies = [
"protobuf",
]
[[package]]
name = "protobuf-codegen-pure"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865"
dependencies = [
"protobuf",
"protobuf-codegen",
]
[[package]]
@@ -5176,21 +5052,6 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "quanta"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
dependencies = [
"crossbeam-utils",
"libc",
"once_cell",
"raw-cpuid",
"wasi 0.11.0+wasi-snapshot-preview1",
"web-sys",
"winapi",
]
[[package]]
name = "quick-xml"
version = "0.26.0"
@@ -5321,15 +5182,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "raw-cpuid"
version = "11.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -5664,16 +5516,16 @@ dependencies = [
[[package]]
name = "ring"
version = "0.17.13"
version = "0.17.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866"
dependencies = [
"cc",
"cfg-if",
"getrandom 0.2.11",
"libc",
"spin",
"untrusted",
"windows-sys 0.52.0",
"windows-sys 0.48.0",
]
[[package]]
@@ -5900,7 +5752,7 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
]
[[package]]
@@ -5909,15 +5761,15 @@ version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
dependencies = [
"base64 0.21.7",
"base64 0.21.1",
"rustls-pki-types",
]
[[package]]
name = "rustls-pki-types"
version = "1.11.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c"
checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
[[package]]
name = "rustls-webpki"
@@ -6148,9 +6000,9 @@ dependencies = [
[[package]]
name = "seize"
version = "0.5.0"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
dependencies = [
"libc",
"windows-sys 0.52.0",
@@ -6543,15 +6395,6 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "spinning_top"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
dependencies = [
"lock_api",
]
[[package]]
name = "spki"
version = "0.6.0"
@@ -6603,7 +6446,7 @@ dependencies = [
"metrics",
"once_cell",
"parking_lot 0.12.1",
"prost 0.13.3",
"prost",
"rustls 0.23.18",
"tokio",
"tonic",
@@ -6621,7 +6464,6 @@ dependencies = [
"bytes",
"chrono",
"clap",
"clashmap",
"control_plane",
"cron",
"diesel",
@@ -6629,7 +6471,6 @@ dependencies = [
"diesel_migrations",
"fail",
"futures",
"governor",
"hex",
"http-utils",
"humantime",
@@ -7376,7 +7217,7 @@ dependencies = [
"hyper-util",
"percent-encoding",
"pin-project",
"prost 0.13.3",
"prost",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"tokio",
@@ -7396,8 +7237,8 @@ checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
dependencies = [
"prettyplease",
"proc-macro2",
"prost-build 0.13.3",
"prost-types 0.13.3",
"prost-build",
"prost-types",
"quote",
"syn 2.0.90",
]
@@ -7444,12 +7285,10 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
dependencies = [
"base64 0.22.1",
"bitflags 2.8.0",
"bytes",
"http 1.1.0",
"http-body 1.0.0",
"mime",
"pin-project-lite",
"tower-layer",
"tower-service",
@@ -7803,6 +7642,7 @@ dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"backtrace",
"bincode",
"byteorder",
"bytes",
@@ -7930,7 +7770,7 @@ dependencies = [
"pageserver_api",
"postgres_ffi",
"pprof",
"prost 0.13.3",
"prost",
"remote_storage",
"serde",
"serde_json",
@@ -8356,7 +8196,7 @@ dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"base64 0.21.7",
"base64 0.21.1",
"base64ct",
"bytes",
"camino",
@@ -8387,7 +8227,6 @@ dependencies = [
"hyper-util",
"indexmap 1.9.3",
"indexmap 2.0.1",
"itertools 0.10.5",
"itertools 0.12.1",
"lazy_static",
"libc",
@@ -8406,7 +8245,7 @@ dependencies = [
"parquet",
"prettyplease",
"proc-macro2",
"prost 0.13.3",
"prost",
"quote",
"rand 0.8.5",
"regex",

View File

@@ -53,6 +53,7 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
@@ -67,7 +68,6 @@ aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.8.1", features = ["ws"] }
axum-extra = { version = "0.10.0", features = ["typed-header"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.71"
@@ -95,7 +95,6 @@ futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
governor = "0.8"
hashbrown = "0.14"
hashlink = "0.9.1"
hdrhistogram = "7.5.2"
@@ -114,10 +113,11 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
jemalloc_pprof = "0.6"
jsonwebtoken = "9"
lasso = "0.7"
libc = "0.2"
@@ -139,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
procfs = "0.16"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prost = "0.13"
@@ -155,7 +155,6 @@ rpds = "0.13"
rustc-hash = "1.1.0"
rustls = { version = "0.23.16", default-features = false }
rustls-pemfile = "2"
rustls-pki-types = "1.11"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
@@ -193,7 +192,7 @@ toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
# This revision uses opentelemetry 0.27. There's no tag for it.
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }

View File

@@ -11,16 +11,15 @@ ICU_PREFIX_DIR := /usr/local/icu
#
BUILD_TYPE ?= debug
WITH_SANITIZERS ?= no
PG_CFLAGS = -fsigned-char
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS += -O2 -g3 $(CFLAGS)
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
@@ -160,8 +159,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling pg_trgm $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"

View File

@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION:?} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION:?}" != "v14" ]; then \
# zstd is available only from PG15
@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
COPY compute/patches/duckdb_v120.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
#########################################################################################
#
# Layer "pg_repack"
@@ -1758,15 +1758,15 @@ ARG TARGETARCH
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regading `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
else\
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
fi\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
@@ -1933,7 +1933,6 @@ RUN apt update && \
locales \
procps \
ca-certificates \
rsyslog \
$VERSION_INSTALLS && \
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1979,13 +1978,6 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
# Make the libraries we built available
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# rsyslog config permissions
# directory for rsyslogd pid file
RUN mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd && \
chown -R postgres:postgres /etc/rsyslog.d/
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -29,7 +29,6 @@
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
import 'sql_exporter/lfc_chunk_size.libsonnet',
import 'sql_exporter/lfc_hits.libsonnet',
import 'sql_exporter/lfc_misses.libsonnet',
import 'sql_exporter/lfc_used.libsonnet',

View File

@@ -1,5 +1 @@
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;

View File

@@ -1,10 +0,0 @@
{
metric_name: 'lfc_chunk_size',
type: 'gauge',
help: 'LFC chunk size, measured in 8KiB pages',
key_labels: null,
values: [
'lfc_chunk_size_pages',
],
query: importstr 'sql_exporter/lfc_chunk_size.sql',
}

View File

@@ -1 +0,0 @@
SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';

View File

@@ -1,20 +1,10 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
tup_updated AS updated, tup_deleted AS deleted, datname
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2
AND datname <> 'postgres'
AND NOT datistemplate
ORDER BY oid
LIMIT 10
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
);

View File

@@ -39,10 +39,6 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -58,7 +54,7 @@ files:
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -73,12 +69,6 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -142,12 +132,6 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN chmod 0666 /var/log/
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -39,10 +39,6 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -58,7 +54,7 @@ files:
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -73,12 +69,6 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -138,11 +128,6 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN chmod 0666 /var/log/
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -17,7 +17,6 @@ aws-sdk-kms.workspace = true
aws-smithy-types.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
axum-extra.workspace = true
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -26,7 +25,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true

View File

@@ -33,27 +33,39 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::collections::HashMap;
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::mpsc;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc};
use std::thread;
use std::time::Duration;
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Parser;
use compute_api::responses::ComputeCtlConfig;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
use compute_tools::compute::{
ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal,
};
use compute_tools::configurator::launch_configurator;
use compute_tools::disk_quota::set_disk_quota;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::server::Server;
use compute_tools::logger::*;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{Resource, setrlimit};
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;
use tracing::{error, info};
use tracing::{error, info, warn};
use url::Url;
use utils::failpoint_support;
@@ -152,41 +164,29 @@ fn main() -> Result<()> {
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_spec = try_spec_from_cli(&cli)?;
let cli_spec = try_spec_from_cli(&cli)?;
let compute_node = ComputeNode::new(
ComputeNodeParams {
compute_id: cli.compute_id,
connstr,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
ext_remote_storage: cli.remote_ext_config.clone(),
resize_swap_on_bind: cli.resize_swap_on_bind,
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
#[cfg(target_os = "linux")]
filecache_connstr: cli.filecache_connstr,
#[cfg(target_os = "linux")]
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
build_tag,
let compute = wait_spec(build_tag, &cli, cli_spec)?;
live_config_allowed: cli_spec.live_config_allowed,
},
cli_spec.spec,
cli_spec.compute_ctl_config,
)?;
start_postgres(&cli, compute)?
let exit_code = compute_node.run()?;
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
scenario.teardown();
deinit_and_exit(exit_code);
deinit_and_exit(wait_pg_result);
}
async fn init() -> Result<String> {
@@ -207,6 +207,56 @@ async fn init() -> Result<String> {
Ok(build_tag)
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
.extract(&startup_tracing_carrier)
.attach();
info!("startup tracing context attached");
Some(guard)
} else {
None
}
}
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, try to get cluster spec from the cli argument
if let Some(ref spec_json) = cli.spec_json {
@@ -257,7 +307,357 @@ struct CliSpecParams {
live_config_allowed: bool,
}
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
fn wait_spec(
build_tag: String,
cli: &Cli,
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
info!("new pspec.spec: {:?}", pspec.spec);
new_state.pspec = Some(pspec);
spec_set = true;
} else {
spec_set = false;
}
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let conn_conf = postgres::config::Config::from_str(connstr.as_str())
.context("cannot build postgres config from connstr")?;
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
.context("cannot build tokio postgres config from connstr")?;
let compute_node = ComputeNode {
compute_id: cli.compute_id.clone(),
connstr,
conn_conf,
tokio_conn_conf,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage: cli.remote_ext_config.clone(),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
let compute = Arc::new(compute_node);
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
// Launch the external HTTP server first, so that we can serve control plane
// requests while configuration is still in progress.
Server::External(cli.external_http_port).launch(&compute);
// The internal HTTP server could be launched later, but there isn't much
// sense in waiting.
Server::Internal(cli.internal_http_port).launch(&compute);
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::ConfigurationPending {
info!("got spec, continue configuration");
// Spec is already set by the http server handler.
break;
}
}
// Record for how long we slept waiting for the spec.
let now = Utc::now();
state.metrics.wait_for_spec_ms = now
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time, so that the total startup time that is calculated later will
// not include the time that we waited for the spec.
state.start_time = now;
}
launch_lsn_lease_bg_task_for_static(&compute);
Ok(compute)
}
fn start_postgres(
cli: &Cli,
compute: Arc<ComputeNode>,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
// Create a tracing span for the startup operation.
//
// We could otherwise just annotate the function with #[instrument], but if
// we're being configured from a /configure HTTP request, we want the
// startup to be considered part of the /configure request.
let _this_entered = {
// Temporarily enter the /configure request's span, so that the new span
// becomes its child.
let _parent_entered = state.startup_span.take().map(|p| p.entered());
tracing::info_span!("start_postgres")
}
.entered();
state.set_status(ComputeStatus::Init, &compute.state_changed);
info!(
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
// before we release the mutex, fetch some parameters for later.
let &ComputeSpec {
swap_size_bytes,
disk_quota_bytes,
#[cfg(target_os = "linux")]
disable_lfc_resizing,
..
} = &state.pspec.as_ref().unwrap().spec;
drop(state);
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let mut prestartup_failed = false;
let mut delay_exit = false;
// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Set disk quota if the compute spec says so
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
(disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
{
match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");
}
Err(err) => {
let err = err.context("failed to set disk quota");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Start Postgres
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
delay_exit = true;
None
}
};
} else {
warn!("skipping postgres startup because pre-startup step failed");
}
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
// don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
None
} else {
Some(cli.filecache_connstr.clone())
};
let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
let vm_monitor = tokio::spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: Some(cli.cgroup.clone()),
pgconnstr,
addr: cli.vm_monitor_addr.clone(),
})),
token.clone(),
));
Some(vm_monitor)
} else {
None
};
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);
// Process has exited. Wait for the log collecting task to finish.
let _ = tokio::runtime::Handle::current()
.block_on(logs_handle)
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();
}
}
}
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

View File

@@ -58,14 +58,14 @@ pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
let pgbin = &compute.params.pgbin;
let pgbin = &compute.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
// Replace the DB in the connection string and disable it to parts.
// This is the only option to handle DBs with special characters.
let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
.map_err(|_| SchemaDumpError::Unexpected)?;
let conf =
postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
let host = conf
.get_hosts()
.first()

File diff suppressed because it is too large Load Diff

View File

@@ -1,16 +1,12 @@
use anyhow::Result;
use std::fmt::Write as FmtWrite;
use std::fs::{File, OpenOptions};
use std::io;
use std::io::Write;
use std::io::prelude::*;
use std::path::Path;
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use anyhow::Result;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
};
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -59,20 +55,10 @@ pub fn write_postgres_conf(
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(
"safekeepers_connstrings is not zero, gen: {:?}",
spec.safekeepers_generation
);
// If generation is given, prepend sk list with g#number:
if let Some(generation) = spec.safekeepers_generation {
write!(neon_safekeepers_value, "g#{}:", generation)?;
}
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
writeln!(
file,
"neon.safekeepers={}",
escape_conf_value(&neon_safekeepers_value)
escape_conf_value(&spec.safekeeper_connstrings.join(","))
)?;
}
if let Some(s) = &spec.tenant_id {
@@ -99,7 +85,6 @@ pub fn write_postgres_conf(
writeln!(file, "lc_numeric='C.UTF-8'")?;
}
writeln!(file, "neon.endpoint_type={}", spec.mode.to_type_str())?;
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Static(lsn) => {
@@ -141,54 +126,6 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}
// If audit logging is enabled, configure pgaudit.
//
// Note, that this is called after the settings from spec are written.
// This way we always override the settings from the spec
// and don't allow the user or the control plane admin to change them.
if let ComputeAudit::Hipaa = spec.audit_log_level {
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
writeln!(file, "pgaudit.log='all'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;
// Set log rotation to 5 minutes
// TODO: tune this after performance testing
writeln!(file, "pgaudit.log_rotation_age=5")?;
// Add audit shared_preload_libraries, if they are not present.
//
// The caller who sets the flag is responsible for ensuring that the necessary
// shared_preload_libraries are present in the compute image,
// otherwise the compute start will fail.
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
let mut extra_shared_preload_libraries = String::new();
if !libs.contains("pgaudit") {
extra_shared_preload_libraries.push_str(",pgaudit");
}
if !libs.contains("pgauditlogtofile") {
extra_shared_preload_libraries.push_str(",pgauditlogtofile");
}
writeln!(
file,
"shared_preload_libraries='{}{}'",
libs, extra_shared_preload_libraries
)?;
} else {
// Typically, this should be unreacheable,
// because we always set at least some shared_preload_libraries in the spec
// but let's handle it explicitly anyway.
writeln!(
file,
"shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
)?;
}
writeln!(file, "# Managed by compute_ctl audit settings: end")?;
}
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
if spec.drop_subscriptions_before_start {

View File

@@ -1,10 +0,0 @@
# Load imfile module to read log files
module(load="imfile")
# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
global(workDirectory="/var/log")
# Forward logs to remote syslog server
*.* @@{remote_endpoint}

View File

@@ -202,24 +202,8 @@ pub async fn download_extension(
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
for paths in [sharedir_paths, libdir_paths] {
let (zip_dir, real_dir) = paths;
let dir = match std::fs::read_dir(&zip_dir) {
Ok(dir) => dir,
Err(e) => match e.kind() {
// In the event of a SQL-only extension, there would be nothing
// to move from the lib/ directory, so note that in the log and
// move on.
std::io::ErrorKind::NotFound => {
info!("nothing to move from {}", zip_dir);
continue;
}
_ => return Err(anyhow::anyhow!(e)),
},
};
info!("mv {zip_dir:?}/* {real_dir:?}");
for file in dir {
for file in std::fs::read_dir(zip_dir)? {
let old_file = file?.path();
let new_file =
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
@@ -269,31 +253,27 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}
// Do request to extension storage proxy, e.g.,
// Do request to extension storage proxy, i.e.
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HTTP GET and return the response body as bytes.
// using HHTP GET
// and return the response body as bytes
//
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
let filename = Path::new(ext_path)
.file_name()
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
.to_str()
.unwrap_or("unknown")
.to_string();
info!("Downloading extension file '{}' from uri {}", filename, uri);
info!("Download extension {} from uri {}", ext_path, uri);
match do_extension_server_request(&uri).await {
Ok(resp) => {
info!("Successfully downloaded remote extension data {}", ext_path);
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&StatusCode::OK.to_string(), &filename])
.with_label_values(&[&StatusCode::OK.to_string()])
.inc();
Ok(resp)
}
Err((msg, status)) => {
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&status, &filename])
.with_label_values(&[&status])
.inc();
bail!(msg);
}

View File

@@ -1,9 +1,7 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) mod request_id;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;
pub(crate) use request_id::RequestId;

View File

@@ -1,86 +0,0 @@
use std::{
fmt::Display,
ops::{Deref, DerefMut},
};
use axum::{extract::FromRequestParts, response::IntoResponse};
use http::{StatusCode, request::Parts};
use crate::http::{JsonResponse, headers::X_REQUEST_ID};
/// Extract the request ID from the `X-Request-Id` header.
#[derive(Debug, Clone, Default)]
pub(crate) struct RequestId(pub String);
#[derive(Debug)]
/// Rejection used for [`RequestId`].
///
/// Contains one variant for each way the [`RequestId`] extractor can
/// fail.
pub(crate) enum RequestIdRejection {
/// The request is missing the header.
MissingRequestId,
/// The value of the header is invalid UTF-8.
InvalidUtf8,
}
impl RequestIdRejection {
pub fn status(&self) -> StatusCode {
match self {
RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
}
}
pub fn message(&self) -> String {
match self {
RequestIdRejection::MissingRequestId => "request ID is missing",
RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
}
.to_string()
}
}
impl IntoResponse for RequestIdRejection {
fn into_response(self) -> axum::response::Response {
JsonResponse::error(self.status(), self.message())
}
}
impl<S> FromRequestParts<S> for RequestId
where
S: Send + Sync,
{
type Rejection = RequestIdRejection;
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
match parts.headers.get(X_REQUEST_ID) {
Some(value) => match value.to_str() {
Ok(request_id) => Ok(Self(request_id.to_string())),
Err(_) => Err(RequestIdRejection::InvalidUtf8),
},
None => Err(RequestIdRejection::MissingRequestId),
}
}
}
impl Deref for RequestId {
type Target = String;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for RequestId {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl Display for RequestId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}

View File

@@ -1,2 +0,0 @@
/// Constant for `X-Request-Id` header.
pub const X_REQUEST_ID: &str = "x-request-id";

View File

@@ -1,145 +0,0 @@
use std::{collections::HashSet, net::SocketAddr};
use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::warn;
use crate::http::{JsonResponse, extract::RequestId};
#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
jwks: JwkSet,
validation: Validation,
}
impl Authorize {
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
let mut validation = Validation::new(Algorithm::EdDSA);
// Nothing is currently required
validation.required_spec_claims = HashSet::new();
validation.validate_exp = true;
// Unused by the control plane
validation.validate_aud = false;
// Unused by the control plane
validation.validate_nbf = false;
Self {
compute_id,
jwks,
validation,
}
}
}
impl AsyncAuthorizeRequest<Body> for Authorize {
type RequestBody = Body;
type ResponseBody = Body;
type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
let compute_id = self.compute_id.clone();
let jwks = self.jwks.clone();
let validation = self.validation.clone();
Box::pin(async move {
let request_id = request.extract_parts::<RequestId>().await.unwrap();
// TODO: Remove this check after a successful rollout
if jwks.keys.is_empty() {
warn!(%request_id, "Authorization has not been configured");
return Ok(request);
}
let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();
// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
return Ok(request);
}
let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|_| {
JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
})?;
let data = match Self::verify(&jwks, bearer.token(), &validation) {
Ok(claims) => claims,
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
};
if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid claims in authorization token",
));
}
// Make claims available to any subsequent middleware or request
// handlers
request.extensions_mut().insert(data.claims);
Ok(request)
})
}
}
impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
debug_assert!(!jwks.keys.is_empty());
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
};
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
}
}
Err(anyhow!("Failed to verify authorization token"))
}
}

View File

@@ -1 +0,0 @@
pub(in crate::http) mod authorize;

View File

@@ -7,8 +7,6 @@ use serde::Serialize;
use tracing::error;
mod extract;
mod headers;
mod middleware;
mod routes;
pub mod server;

View File

@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.params.live_config_allowed {
if !compute.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),

View File

@@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
ext_server_params: Query<ExtensionServerParams>,
params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.params.ext_remote_storage.is_none() {
if compute.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
@@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension(
remote_extensions.get_ext(
&filename,
ext_server_params.is_library,
&compute.params.build_tag,
&compute.params.pgversion,
params.is_library,
&compute.build_tag,
&compute.pgversion,
)
};

View File

@@ -10,58 +10,48 @@ use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use http::StatusCode;
use jsonwebtoken::jwk::JwkSet;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
};
use tracing::{Span, error, info};
use tower_http::request_id::PropagateRequestIdLayer;
use tower_http::trace::TraceLayer;
use tracing::{Span, debug, error, info};
use uuid::Uuid;
use super::{
headers::X_REQUEST_ID,
middleware::authorize::Authorize,
routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
},
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
};
use crate::compute::ComputeNode;
const X_REQUEST_ID: &str = "x-request-id";
/// `compute_ctl` has two servers: internal and external. The internal server
/// binds to the loopback interface and handles communication from clients on
/// the compute. The external server is what receives communication from the
/// control plane, the metrics scraper, etc. We make the distinction because
/// certain routes in `compute_ctl` only need to be exposed to local processes
/// like Postgres via the neon extension and local_proxy.
#[derive(Clone, Debug)]
#[derive(Clone, Copy, Debug)]
pub enum Server {
Internal {
port: u16,
},
External {
port: u16,
jwks: JwkSet,
compute_id: String,
},
Internal(u16),
External(u16),
}
impl Display for Server {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Server::Internal { .. } => f.write_str("internal"),
Server::External { .. } => f.write_str("external"),
Server::Internal(_) => f.write_str("internal"),
Server::External(_) => f.write_str("external"),
}
}
}
impl From<&Server> for Router<Arc<ComputeNode>> {
fn from(server: &Server) -> Self {
impl From<Server> for Router<Arc<ComputeNode>> {
fn from(server: Server) -> Self {
let mut router = Router::<Arc<ComputeNode>>::new();
router = match server {
Server::Internal { .. } => {
Server::Internal(_) => {
router = router
.route(
"/extension_server/{*filename}",
@@ -79,71 +69,59 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
router
}
Server::External {
jwks, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
compute_id.clone(),
jwks.clone(),
)));
router
.merge(unauthenticated_router)
.merge(authenticated_router)
}
Server::External(_) => router
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate)),
};
router
.fallback(Server::handle_404)
.method_not_allowed_fallback(Server::handle_405)
.layer(
ServiceBuilder::new()
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
ServiceBuilder::new()
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(%request_id, "{} {}", request.method(), request.uri());
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
);
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
}
}
@@ -167,15 +145,15 @@ impl Server {
match self {
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
// allow binding to localhost
Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
}
}
fn port(&self) -> u16 {
fn port(self) -> u16 {
match self {
Server::Internal { port, .. } => *port,
Server::External { port, .. } => *port,
Server::Internal(port) => port,
Server::External(port) => port,
}
}
@@ -202,9 +180,7 @@ impl Server {
);
}
let router = Router::from(&self)
.with_state(compute)
.into_make_service_with_connect_info::<SocketAddr>();
let router = Router::from(self).with_state(compute);
if let Err(e) = axum::serve(listener, router).await {
error!("compute_ctl {} HTTP server error: {}", self, e);

View File

@@ -21,7 +21,6 @@ mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod rsyslog;
pub mod spec;
mod spec_apply;
pub mod swap;

View File

@@ -1,5 +1,3 @@
use std::collections::HashMap;
use tracing::info;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
@@ -44,50 +42,3 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
pub fn inlinify(s: &str) -> String {
s.replace('\n', "\u{200B}")
}
pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
info!("got startup tracing context from env variables");
Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
} else {
None
}
}

View File

@@ -54,7 +54,9 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
register_int_counter_vec!(
"compute_ctl_remote_ext_requests_total",
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
&["http_status", "filename"]
// Do not use any labels like extension name yet.
// We can add them later if needed.
&["http_status"]
)
.expect("failed to define a metric")
});

View File

@@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.params.connstr.clone();
let connstr = compute.connstr.clone();
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
// During startup and configuration we connect to every Postgres database,

View File

@@ -186,40 +186,15 @@ impl DatabaseExt for Database {
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
fn pg_quote_dollar(&self) -> (String, String);
}
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
/// clause, use `escape_literal()` instead.
fn pg_quote(&self) -> String {
format!("\"{}\"", self.replace('"', "\"\""))
}
/// This helper is intended to be used for dollar-escaping strings for usage
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
fn pg_quote_dollar(&self) -> (String, String) {
let mut tag: String = "".to_string();
let mut outer_tag = "x".to_string();
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}
let escaped = format!("${tag}${self}${tag}$");
(escaped, outer_tag)
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
}
}
@@ -251,13 +226,10 @@ pub async fn get_existing_dbs_async(
// invalid state. See:
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
let rowstream = client
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
// because the latter automatically wraps the result in double quotes,
// if the role name contains special characters.
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
datdba::regrole::text AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
FROM

View File

@@ -1,77 +0,0 @@
use std::process::Command;
use std::{fs::OpenOptions, io::Write};
use anyhow::{Context, Result};
use tracing::info;
fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
.arg("rsyslogd")
.output()
.expect("Failed to execute pgrep");
if !output.stdout.is_empty() {
let pid = std::str::from_utf8(&output.stdout)
.expect("Invalid UTF-8 in process output")
.trim()
.to_string();
Some(pid)
} else {
None
}
}
// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
// because it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
.output()
.context("Failed to stop rsyslogd")?;
Ok(())
}
pub fn configure_audit_rsyslog(
log_directory: &str,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
remote_endpoint = remote_endpoint
);
info!("rsyslog config_content: {}", config_content);
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(rsyslog_conf_path)?;
file.write_all(config_content.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
rsyslog_conf_path
);
// start the service, using the configuration
restart_rsyslog()?;
Ok(())
}

View File

@@ -6,22 +6,21 @@ use std::sync::Arc;
use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState};
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
};
@@ -188,7 +187,7 @@ impl ComputeNode {
}
for phase in [
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -278,19 +277,6 @@ impl ComputeNode {
phases.push(FinalizeDropLogicalSubscriptions);
}
// Keep DisablePostgresDBPgAudit phase at the end,
// so that all config operations are audit logged.
match spec.audit_log_level
{
ComputeAudit::Hipaa => {
phases.push(CreatePgauditExtension);
phases.push(CreatePgauditlogtofileExtension);
phases.push(DisablePostgresDBPgAudit);
}
ComputeAudit::Log => { /* not implemented yet */ }
ComputeAudit::Disabled => {}
}
for phase in phases {
debug!("Applying phase {:?}", &phase);
apply_operations(
@@ -469,7 +455,7 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreateNeonSuperuser,
CreateSuperUser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -477,9 +463,6 @@ pub enum ApplySpecPhase {
CreateAndAlterDatabases,
CreateSchemaNeon,
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
CreatePgauditExtension,
CreatePgauditlogtofileExtension,
DisablePostgresDBPgAudit,
HandleOtherExtensions,
HandleNeonExtension,
CreateAvailabilityCheck,
@@ -596,10 +579,14 @@ async fn get_operations<'a>(
apply_spec_phase: &'a ApplySpecPhase,
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
match apply_spec_phase {
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
query: include_str!("sql/create_neon_superuser.sql").to_string(),
comment: None,
}))),
ApplySpecPhase::CreateSuperUser => {
let query = construct_superuser_query(spec);
Ok(Box::new(once(Operation {
query,
comment: None,
})))
}
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -733,15 +720,14 @@ async fn get_operations<'a>(
// We do not check whether the DB exists or not,
// Postgres will take care of it for us
"delete_db" => {
let (db_name, outer_tag) = op.name.pg_quote_dollar();
// In Postgres we can't drop a database if it is a template.
// So we need to unset the template flag first, but it could
// be a retry, so we could've already dropped the database.
// Check that database exists first to make it idempotent.
let unset_template_query: String = format!(
include_str!("sql/unset_template_for_drop_dbs.sql"),
datname = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&op.name),
datname = &op.name.pg_quote()
);
// Use FORCE to drop database even if there are active connections.
@@ -848,8 +834,6 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
query: format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
db.name.pg_quote()
@@ -909,11 +893,9 @@ async fn get_operations<'a>(
PerDatabasePhase::DropLogicalSubscriptions => {
match &db {
DB::UserDB(db) => {
let (db_name, outer_tag) = db.name.pg_quote_dollar();
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscriptions.sql"),
datname_str = db_name,
outer_tag = outer_tag,
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
@@ -952,7 +934,6 @@ async fn get_operations<'a>(
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
DB::UserDB(db) => db.owner.pg_quote(),
};
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
Some(vec![
// This will reassign all dependent objects to the db owner
@@ -967,9 +948,7 @@ async fn get_operations<'a>(
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
role_name = quoted,
),
comment: None,
},
@@ -994,14 +973,12 @@ async fn get_operations<'a>(
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
let operations = vec![
Operation {
query: format!(
include_str!("sql/set_public_schema_owner.sql"),
db_owner = db_owner,
outer_tag = outer_tag,
db_owner = db.owner.pg_quote()
),
comment: None,
},
@@ -1121,25 +1098,6 @@ async fn get_operations<'a>(
}
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
// Postgres is neon system database used by monitors
// and compute_ctl tuning functions and thus generates a lot of noise.
// We do not consider data stored in this database as sensitive.
ApplySpecPhase::DisablePostgresDBPgAudit => {
let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
Ok(Box::new(once(Operation {
query: query.to_string(),
comment: Some(query.to_string()),
})))
}
ApplySpecPhase::HandleNeonExtension => {
let operations = vec![
Operation {

View File

@@ -1,8 +0,0 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -1,4 +1,4 @@
DO ${outer_tag}$
DO $$
DECLARE
subname TEXT;
BEGIN
@@ -9,4 +9,4 @@ BEGIN
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;
$$;

View File

@@ -1,7 +1,8 @@
DO ${outer_tag}$
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
grantor TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
@@ -14,25 +15,14 @@ BEGIN
-- ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
{role_name},
grantor
);
EXECUTE revoke_query;
END LOOP;
EXECUTE revoke_query;
END LOOP;
END;
${outer_tag}$;
$$;
RESET ROLE;

View File

@@ -1,4 +1,5 @@
DO ${outer_tag}$
DO
$$
DECLARE
schema_owner TEXT;
BEGIN
@@ -15,8 +16,8 @@ DO ${outer_tag}$
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
ALTER SCHEMA public OWNER TO {db_owner};
END IF;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -1,12 +1,12 @@
DO ${outer_tag}$
DO $$
BEGIN
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {datname}
WHERE datname = {datname_str}
)
THEN
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
ALTER DATABASE {datname} is_template false;
END IF;
END
${outer_tag}$;
$$;

View File

@@ -61,23 +61,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn ident_pg_quote_dollar() {
let test_cases = vec![
("name", ("$$name$$", "x")),
("name$$", ("$x$name$$$x$", "xx")),
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
];
for (input, expected) in test_cases {
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
assert_eq!(escaped, expected.0);
assert_eq!(tag, expected.1);
}
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![

View File

@@ -40,7 +40,6 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -597,15 +596,7 @@ struct EndpointStartCmdArgs {
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
#[clap(
long,
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
)]
safekeepers_generation: Option<u32>,
#[clap(
long,
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
)]
#[clap(long)]
safekeepers: Option<String>,
#[clap(
@@ -626,9 +617,9 @@ struct EndpointStartCmdArgs {
)]
allow_multiple: bool,
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
#[arg(default_value = "90s")]
start_timeout: Duration,
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
#[derive(clap::Args)]
@@ -963,7 +954,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
@@ -978,7 +968,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_compute_hook_api: None,
generate_local_ssl_certs: false,
}
};
@@ -1361,7 +1350,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
let pageserver_id = args.endpoint_pageserver_id;
let remote_ext_config = &args.remote_ext_config;
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
@@ -1437,13 +1425,11 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint
.start(
&auth_token,
safekeepers_generation,
safekeepers,
pageservers,
remote_ext_config.as_ref(),
stripe_size.0 as usize,
args.create_test_user,
args.start_timeout,
)
.await?;
}

View File

@@ -42,19 +42,17 @@ use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result, anyhow, bail};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
};
use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use tracing::debug;
use url::Host;
@@ -578,17 +576,14 @@ impl Endpoint {
Ok(safekeeper_connstrings)
}
#[allow(clippy::too_many_arguments)]
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers_generation: Option<SafekeeperGeneration>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
start_timeout: Duration,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -660,7 +655,6 @@ impl Endpoint {
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
@@ -669,7 +663,6 @@ impl Endpoint {
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
};
// this strange code is needed to support respec() in tests
@@ -777,18 +770,17 @@ impl Endpoint {
std::fs::write(pidfile_path, pid.to_string())?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
let start_at = Instant::now();
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
loop {
attempt += 1;
match self.get_status().await {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if Instant::now().duration_since(start_at) > start_timeout {
bail!(
"compute startup timed out {:?}; still in Init state",
start_timeout
);
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
}
// keep retrying
}
@@ -815,11 +807,8 @@ impl Endpoint {
}
}
Err(e) => {
if Instant::now().duration_since(start_at) > start_timeout {
return Err(e).context(format!(
"timed out {:?} waiting to connect to compute_ctl HTTP",
start_timeout,
));
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
}
}
}

View File

@@ -81,10 +81,6 @@ pub struct LocalEnv {
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
/// Flag to generate SSL certificates for components that need it.
/// Also generates root CA certificate that is used to sign all other certificates.
pub generate_local_ssl_certs: bool,
}
/// On-disk state stored in `.neon/config`.
@@ -106,10 +102,6 @@ pub struct OnDiskConfig {
pub control_plane_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
// Note: skip serializing because in compat tests old storage controller fails
// to load new config file. May be removed after this field is in release branch.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub generate_local_ssl_certs: bool,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
@@ -137,7 +129,6 @@ pub struct NeonLocalInitConf {
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Option<Url>>,
pub generate_local_ssl_certs: bool,
}
/// Broker config for cluster internal communication.
@@ -174,11 +165,6 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub long_reconcile_threshold: Option<Duration>,
#[serde(default)]
pub use_https_pageserver_api: bool,
pub timelines_onto_safekeepers: bool,
}
impl NeonStorageControllerConf {
@@ -202,8 +188,6 @@ impl Default for NeonStorageControllerConf {
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
long_reconcile_threshold: None,
use_https_pageserver_api: false,
timelines_onto_safekeepers: false,
}
}
}
@@ -233,7 +217,6 @@ pub struct PageServerConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
@@ -245,7 +228,6 @@ impl Default for PageServerConf {
id: NodeId(0),
listen_pg_addr: String::new(),
listen_http_addr: String::new(),
listen_https_addr: None,
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
@@ -261,7 +243,6 @@ pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
@@ -276,7 +257,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -286,7 +266,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
listen_https_addr: listen_https_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
@@ -431,41 +410,6 @@ impl LocalEnv {
}
}
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.crt"))
} else {
None
}
}
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
Some(self.base_data_dir.join("rootCA.key"))
} else {
None
}
}
pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> {
let cert_path = self.ssl_ca_cert_path().unwrap();
let key_path = self.ssl_ca_key_path().unwrap();
if !fs::exists(cert_path.as_path())? {
generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?;
}
Ok(())
}
pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
self.generate_ssl_ca_cert()?;
generate_ssl_cert(
cert_path,
key_path,
self.ssl_ca_cert_path().unwrap().as_path(),
self.ssl_ca_key_path().unwrap().as_path(),
)
}
/// Inspect the base data directory and extract the instance id and instance directory path
/// for all storage controller instances
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
@@ -575,7 +519,6 @@ impl LocalEnv {
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
@@ -590,7 +533,6 @@ impl LocalEnv {
control_plane_api: control_plane_api.unwrap(),
control_plane_compute_hook_api,
branch_name_mappings,
generate_local_ssl_certs,
}
};
@@ -626,7 +568,6 @@ impl LocalEnv {
struct PageserverConfigTomlSubset {
listen_pg_addr: String,
listen_http_addr: String,
listen_https_addr: Option<String>,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
@@ -651,7 +592,6 @@ impl LocalEnv {
let PageserverConfigTomlSubset {
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -669,7 +609,6 @@ impl LocalEnv {
},
listen_pg_addr,
listen_http_addr,
listen_https_addr,
pg_auth_type,
http_auth_type,
no_sync,
@@ -697,7 +636,6 @@ impl LocalEnv {
control_plane_api: Some(self.control_plane_api.clone()),
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
},
)
}
@@ -780,7 +718,6 @@ impl LocalEnv {
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
generate_local_ssl_certs,
} = conf;
// Find postgres binaries.
@@ -829,13 +766,8 @@ impl LocalEnv {
control_plane_api: control_plane_api.unwrap(),
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
};
if generate_local_ssl_certs {
env.generate_ssl_ca_cert()?;
}
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
@@ -919,80 +851,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
}
Ok(())
}
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
// openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \
// -out rootCA.crt -keyout rootCA.key
let keygen_output = Command::new("openssl")
.args([
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
])
.args(["-subj", "/CN=Neon Local CA"])
.args(["-out", cert_path.to_str().unwrap()])
.args(["-keyout", key_path.to_str().unwrap()])
.output()
.context("failed to generate CA certificate")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
Ok(())
}
fn generate_ssl_cert(
cert_path: &Path,
key_path: &Path,
ca_cert_path: &Path,
ca_key_path: &Path,
) -> anyhow::Result<()> {
// Generate Certificate Signing Request (CSR).
let mut csr_path = cert_path.to_path_buf();
csr_path.set_extension(".csr");
// openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
let keygen_output = Command::new("openssl")
.args(["req", "-new", "-nodes"])
.args(["-newkey", "rsa:2048"])
.args(["-subj", "/CN=localhost"])
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
.args(["-keyout", key_path.to_str().unwrap()])
.args(["-out", csr_path.to_str().unwrap()])
.output()
.context("failed to generate CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Sign CSR with CA key.
//
// openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
// -out server.crt -days 36500 -copy_extensions copyall
let keygen_output = Command::new("openssl")
.args(["x509", "-req"])
.args(["-in", csr_path.to_str().unwrap()])
.args(["-CA", ca_cert_path.to_str().unwrap()])
.args(["-CAkey", ca_key_path.to_str().unwrap()])
.arg("-CAcreateserial")
.args(["-out", cert_path.to_str().unwrap()])
.args(["-days", "36500"])
.args(["-copy_extensions", "copyall"])
.output()
.context("failed to sign CSR")?;
if !keygen_output.status.success() {
bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
// Remove CSR file as it's not needed anymore.
fs::remove_file(csr_path)?;
Ok(())
}

View File

@@ -21,7 +21,6 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{PgConnectionConfig, parse_host_port};
use reqwest::Certificate;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -50,29 +49,12 @@ impl PageServerNode {
let (host, port) =
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| {
let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
Certificate::from_pem(&buf).expect("CA certificate should be valid")
});
let endpoint = if env.storage_controller.use_https_pageserver_api {
format!(
"https://{}",
conf.listen_https_addr.as_ref().expect(
"listen https address should be specified if use_https_pageserver_api is on"
)
)
} else {
format!("http://{}", conf.listen_http_addr)
};
Self {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: mgmt_api::Client::new(
endpoint,
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
@@ -83,9 +65,7 @@ impl PageServerNode {
}
}
.as_deref(),
ssl_ca_cert,
)
.expect("Client constructs with no errors"),
),
}
}
@@ -240,13 +220,6 @@ impl PageServerNode {
.context("write identity toml")?;
drop(identity_toml);
if self.env.generate_local_ssl_certs {
self.env.generate_ssl_cert(
datadir.join("server.crt").as_path(),
datadir.join("server.key").as_path(),
)?;
}
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
// Write metadata file, used by pageserver on startup to register itself with
@@ -257,15 +230,6 @@ impl PageServerNode {
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
let http_port = http_port.unwrap_or(9898);
let https_port = match self.conf.listen_https_addr.as_ref() {
Some(https_addr) => {
let (_https_host, https_port) =
parse_host_port(https_addr).expect("Unable to parse listen_https_addr");
Some(https_port.unwrap_or(9899))
}
None => None,
};
// Intentionally hand-craft JSON: this acts as an implicit format compat test
// in case the pageserver-side structure is edited, and reflects the real life
// situation: the metadata is written by some other script.
@@ -276,7 +240,6 @@ impl PageServerNode {
postgres_port: self.pg_connection_config.port(),
http_host: "localhost".to_string(),
http_port,
https_port,
other: HashMap::from([(
"availability_zone_id".to_string(),
serde_json::json!(az_id),

View File

@@ -12,10 +12,13 @@ use hyper0::Uri;
use nix::unistd::Pid;
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_api::models::{
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
@@ -534,14 +537,6 @@ impl StorageController {
args.push("--start-as-candidate".to_string());
}
if self.config.use_https_pageserver_api {
args.push("--use-https-pageserver-api".to_string());
}
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
}
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
@@ -584,10 +579,6 @@ impl StorageController {
self.env.base_data_dir.display()
));
if self.config.timelines_onto_safekeepers {
args.push("--timelines-onto-safekeepers".to_string());
}
background_process::start_process(
COMMAND,
&instance_dir,
@@ -834,6 +825,41 @@ impl StorageController {
.await
}
#[instrument(skip(self))]
pub async fn tenant_migrate(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
)
.await
}
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
pub async fn tenant_split(
&self,
tenant_id: TenantId,
new_shard_count: u8,
new_stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest {
new_shard_count,
new_stripe_size,
}),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))

View File

@@ -1,17 +1,16 @@
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use clap::{Parser, Subcommand};
use futures::StreamExt;
use pageserver_api::controller_api::{
AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest,
NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse,
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse,
AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse,
NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse,
};
use pageserver_api::models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
@@ -113,15 +112,6 @@ enum Command {
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
prewarm: bool,
#[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
override_scheduler: bool,
},
/// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate
TenantShardWatch {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
@@ -279,10 +269,6 @@ struct Cli {
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[arg(long)]
/// Trusted root CA certificate to use in https APIs.
ssl_ca_file: Option<PathBuf>,
#[command(subcommand)]
command: Command,
}
@@ -393,17 +379,9 @@ async fn main() -> anyhow::Result<()> {
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let ssl_ca_cert = match &cli.ssl_ca_file {
Some(ssl_ca_file) => {
let buf = tokio::fs::read(ssl_ca_file).await?;
Some(reqwest::Certificate::from_pem(&buf)?)
}
None => None,
};
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?;
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
@@ -641,43 +619,19 @@ async fn main() -> anyhow::Result<()> {
Command::TenantShardMigrate {
tenant_shard_id,
node,
prewarm,
override_scheduler,
} => {
let migration_config = MigrationConfig {
prewarm,
override_scheduler,
..Default::default()
};
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config,
migration_config: None,
};
match storcon_client
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await
{
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => {
anyhow::bail!(
"Migration to {node} rejected, may require `--force` ({}) ",
msg
);
}
Err(e) => return Err(e.into()),
Ok(_) => {}
}
watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?;
}
Command::TenantShardWatch { tenant_shard_id } => {
watch_tenant_shard(storcon_client, tenant_shard_id, None).await?;
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
@@ -685,8 +639,7 @@ async fn main() -> anyhow::Result<()> {
} => {
let req = TenantShardMigrateRequest {
node_id: node,
origin_node_id: None,
migration_config: MigrationConfig::default(),
migration_config: None,
};
storcon_client
@@ -1152,8 +1105,7 @@ async fn main() -> anyhow::Result<()> {
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
node_id: mv.to,
origin_node_id: Some(mv.from),
migration_config: MigrationConfig::default(),
migration_config: None,
}),
)
.await
@@ -1332,68 +1284,3 @@ async fn main() -> anyhow::Result<()> {
Ok(())
}
static WATCH_INTERVAL: Duration = Duration::from_secs(5);
async fn watch_tenant_shard(
storcon_client: Client,
tenant_shard_id: TenantShardId,
until_migrated_to: Option<NodeId>,
) -> anyhow::Result<()> {
if let Some(until_migrated_to) = until_migrated_to {
println!(
"Waiting for tenant shard {} to be migrated to node {}",
tenant_shard_id, until_migrated_to
);
}
loop {
let desc = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{}", tenant_shard_id.tenant_id),
None,
)
.await?;
// Output the current state of the tenant shard
let shard = desc
.shards
.iter()
.find(|s| s.tenant_shard_id == tenant_shard_id)
.ok_or(anyhow::anyhow!("Tenant shard not found"))?;
let summary = format!(
"attached: {} secondary: {} {}",
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or("none".to_string()),
shard
.node_secondary
.iter()
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join(","),
if shard.is_reconciling {
"(reconciler active)"
} else {
"(reconciler idle)"
}
);
println!("{}", summary);
// Maybe drop out if we finished migration
if let Some(until_migrated_to) = until_migrated_to {
if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
println!(
"Tenant shard {} is now on node {}",
tenant_shard_id, until_migrated_to
);
break;
}
}
tokio::time::sleep(WATCH_INTERVAL).await;
}
Ok(())
}

View File

@@ -27,14 +27,6 @@ yanked = "warn"
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
[[advisories.ignore]]
id = "RUSTSEC-2024-0436"
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."
[[advisories.ignore]]
id = "RUSTSEC-2025-0014"
reason = "The humantime is widely used and is not easy to replace right now. It is unmaintained, but it has no known vulnerabilities to care about. #11179"
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html

View File

@@ -6,11 +6,8 @@ generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}
echo "${OLD_COMPUTE_TAG}"
echo "${NEW_COMPUTE_TAG}"
echo "${TEST_EXTENSIONS_TAG}"
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then
echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined
exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
@@ -85,7 +82,7 @@ EXTENSIONS='[
{"extname": "pg_repack", "extdir": "pg_repack-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
@@ -93,7 +90,7 @@ create_extensions "${EXTNAMES}"
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
docker compose --profile test-extensions down
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"

View File

@@ -1,201 +0,0 @@
# Sparse Keyspace for Relation Directories
## Summary
This is an RFC describing a new storage strategy for storing relation directories.
## Motivation
Postgres maintains a directory structure for databases and relations. In Neon, we store these information
by serializing the directory data in a single key (see `pgdatadir_mapping.rs`).
```rust
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
```
We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.
```rust
#[derive(Debug, Serialize, Deserialize, Default)]
pub(crate) struct RelDirectory {
// Set of relations that exist. (relfilenode, forknum)
//
// TODO: Store it as a btree or radix tree or something else that spans multiple
// key-value pairs, if you have a lot of relations
pub(crate) rels: HashSet<(Oid, u8)>,
}
```
The current codebase has the following three access patterns for the relation directory.
1. Check if a relation exists.
2. List all relations.
3. Create/drop a relation.
For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get
and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.
If we have 100k relations in a database, we would have a 100k-large hash set. Then, every
relation created and dropped would have deserialized and serialized this 100k-large hash set. This makes the
relation create/drop process to be quadratic. When we check if a relation exists in the ingestion path,
we would have to deserialize this super big 100k-large key before checking if a single relation exists.
In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how
to seamlessly migrate users to use the new keyspace.
The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).
## Key Mapping
We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in
[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`),
into the key.
```plain
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
```
Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be
implemented as follows.
1. Check if a relation exists: check if the key maps to "exists".
2. List all relations: scan the sprase keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
be removed during image layer generation upon compaction.
Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum.
The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
## Changes to Sparse Keyspace
Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
to be updated accordingly to accommodate such "inherited sparse keys". This is done in
[PR#10313](https://github.com/neondatabase/neon/pull/10313).
## Coexistence of the Old and New Keyspaces
Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read
path needs to combine the data from both keyspaces.
Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the
new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration
process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the
migration can happen seamlessly and imposes no potential downtime for the user.
With the coexistence assumption, the 3 reldir operations will be implemented as follows:
1. Check if a relation exists
- Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly
return it to the user.
- Otherwise, deserialize the old reldir key and get the result.
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
Combine them to obtain the final result.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace.
- We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
- For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
- For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace.
- The delete tombstone will be removed during image layer generation upon compaction.
This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total
amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction.
There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal
with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives
us `O(1)` complexity after fully opt-in the sparse keyspace.
The process also implies that a relation will only exists either in the old reldir key or in the new sparse keyspace. It is not possible
to have a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.
We will introduce a config item and an index_part record to record the current status of the migration process.
- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.
If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update
`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still
read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only:
once v2 is enabled, the user cannot go back to v1.
## Next Steps
### Full Migration
This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this
code path, we must ensure the timeline has no old reldir data.
We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while
copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
process discovers the following keys at this LSN.
```plain
db1/reldir_key -> (table 1, table 2, table 3)
...db1 rel keys
db2/reldir_key -> (table 4, table 5, table 6)
...db2 rel keys
sparse_reldir_db2_table7 -> exists
sparse_reldir_db1_table8 -> deleted
```
It will generate the following keys:
```plain
db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
...db1 rel keys
db2/reldir_key -> ()
...db2 rel keys
-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
sparse_reldir_db1_table1 -> exists
sparse_reldir_db1_table2 -> exists
sparse_reldir_db1_table3 -> exists
sparse_reldir_db2_table4 -> exists
sparse_reldir_db2_table5 -> exists
sparse_reldir_db2_table6 -> exists
sparse_reldir_db2_table7 -> exists
-- end image layer for the sparse keyspace at sparse_reldir_prefix+1
# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there
# are no correctness issue.
```
We must verify that no pending modifications to the old reldir exists in the delta/image layers above the gc-horizon before
we start this process (We can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
above the gc-horizon). Otherwise, it will violate the property that "a relation will only exists either in the old reldir key or
in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to
`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we
don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers.
The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.
### Consolidate Relation Size Keys
We have relsize at the end of all relation nodes.
```plain
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
```
This means that computing logical size requires us to do several single-key gets across the keyspace,
potentially requiring downloading many layer files. We could consolidate them into a single
keyspace, improving logical size calculation performance.
### Migrate DBDir Keys
We assume the number of databases created by the users will be small, and therefore, the current way
of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into
the sparse keyspace to support large amount of databases.

View File

@@ -134,10 +134,8 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
pub jwks: JwkSet,
}

View File

@@ -101,17 +101,6 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
/// 2) If walproposer wants to update list of safekeepers to connect to
/// taking them from some safekeeper mconf, it should check what value
/// is newer by comparing the generation.
///
/// Note: it could be SafekeeperGeneration, but this needs linking
/// compute_ctl with postgres_ffi.
#[serde(default)]
pub safekeepers_generation: Option<u32>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
@@ -155,16 +144,6 @@ pub struct ComputeSpec {
/// over the same replication content from publisher.
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
#[serde(default)]
pub audit_log_level: ComputeAudit,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -272,29 +251,6 @@ pub enum ComputeMode {
Replica,
}
impl ComputeMode {
/// Convert the compute mode to a string that can be used to identify the type of compute,
/// which means that if it's a static compute, the LSN will not be included.
pub fn to_type_str(&self) -> &'static str {
match self {
ComputeMode::Primary => "primary",
ComputeMode::Static(_) => "static",
ComputeMode::Replica => "replica",
}
}
}
/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
Log,
Hipaa,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
pub struct Cluster {
pub cluster_id: Option<String>,

View File

@@ -6,9 +6,11 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
backtrace.workspace = true
bytes.workspace = true
inferno.workspace = true
fail.workspace = true
futures.workspace = true
flate2.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true
@@ -22,7 +24,6 @@ serde_path_to_error.workspace = true
thiserror.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
url.workspace = true
uuid.workspace = true

View File

@@ -3,6 +3,8 @@ use std::io::Write as _;
use std::str::FromStr;
use std::time::Duration;
use ::pprof::ProfilerGuardBuilder;
use ::pprof::protos::Message as _;
use anyhow::{Context, anyhow};
use bytes::{Bytes, BytesMut};
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
@@ -10,8 +12,7 @@ use hyper::http::HeaderValue;
use hyper::{Body, Method, Request, Response};
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
use once_cell::sync::Lazy;
use pprof::ProfilerGuardBuilder;
use pprof::protos::Message as _;
use regex::Regex;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{Mutex, Notify, mpsc};
@@ -21,6 +22,7 @@ use tracing::{Instrument, debug, info, info_span, warn};
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::error::{ApiError, api_error_handler, route_error_handler};
use crate::pprof;
use crate::request::{get_query_param, parse_query_param};
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -399,10 +401,12 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
// Return the report in the requested format.
match format {
Format::Pprof => {
let body = report
let mut body = Vec::new();
report
.pprof()
.map_err(|err| ApiError::InternalServerError(err.into()))?
.encode_to_vec();
.write_to_vec(&mut body)
.map_err(|err| ApiError::InternalServerError(err.into()))?;
Response::builder()
.status(200)
@@ -445,6 +449,20 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
// Functions and mappings to strip when symbolizing pprof profiles. If true,
// also remove child frames.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
// Obtain profiler handle.
let mut prof_ctl = jemalloc_pprof::PROF_CTL
.as_ref()
@@ -477,27 +495,45 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
}
Format::Pprof => {
let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let data = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
pprof::encode(&profile)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "application/octet-stream")
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
.body(Body::from(data))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();
pprof::flamegraph(profile, &mut opts)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(svg))
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}

View File

@@ -2,11 +2,11 @@ pub mod endpoint;
pub mod error;
pub mod failpoints;
pub mod json;
pub mod pprof;
pub mod request;
pub mod server;
extern crate hyper0 as hyper;
/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt};
pub use routerify::{RouterBuilder, RouterService, ext::RequestExt};

View File

@@ -0,0 +1,238 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::ffi::c_void;
use std::io::Write as _;
use anyhow::bail;
use flate2::Compression;
use flate2::write::{GzDecoder, GzEncoder};
use itertools::Itertools as _;
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use regex::Regex;
/// Decodes a gzip-compressed Protobuf-encoded pprof profile.
pub fn decode(bytes: &[u8]) -> anyhow::Result<Profile> {
let mut gz = GzDecoder::new(Vec::new());
gz.write_all(bytes)?;
Ok(Profile::parse_from_bytes(&gz.finish()?)?)
}
/// Encodes a pprof profile as gzip-compressed Protobuf.
pub fn encode(profile: &Profile) -> anyhow::Result<Vec<u8>> {
let mut gz = GzEncoder::new(Vec::new(), Compression::default());
profile.write_to_writer(&mut gz)?;
Ok(gz.finish()?)
}
/// Symbolizes a pprof profile using the current binary.
pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
if !profile.function.is_empty() {
return Ok(profile); // already symbolized
}
// Collect function names.
let mut functions: HashMap<String, Function> = HashMap::new();
let mut strings: HashMap<String, i64> = profile
.string_table
.into_iter()
.enumerate()
.map(|(i, s)| (s, i as i64))
.collect();
// Helper to look up or register a string.
let mut string_id = |s: &str| -> i64 {
// Don't use .entry() to avoid unnecessary allocations.
if let Some(id) = strings.get(s) {
return *id;
}
let id = strings.len() as i64;
strings.insert(s.to_string(), id);
id
};
for loc in &mut profile.location {
if !loc.line.is_empty() {
continue;
}
// Resolve the line and function for each location.
backtrace::resolve(loc.address as *mut c_void, |symbol| {
let Some(symbol_name) = symbol.name() else {
return;
};
let function_name = format!("{symbol_name:#}");
let functions_len = functions.len();
let function_id = functions
.entry(function_name)
.or_insert_with_key(|function_name| {
let function_id = functions_len as u64 + 1;
let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
let filename = symbol
.filename()
.map(|path| path.to_string_lossy())
.unwrap_or(Cow::Borrowed(""));
Function {
id: function_id,
name: string_id(function_name),
system_name: string_id(&system_name),
filename: string_id(&filename),
..Default::default()
}
})
.id;
loc.line.push(Line {
function_id,
line: symbol.lineno().unwrap_or(0) as i64,
..Default::default()
});
});
}
// Store the resolved functions, and mark the mapping as resolved.
profile.function = functions.into_values().sorted_by_key(|f| f.id).collect();
profile.string_table = strings
.into_iter()
.sorted_by_key(|(_, i)| *i)
.map(|(s, _)| s)
.collect();
for mapping in &mut profile.mapping {
mapping.has_functions = true;
mapping.has_filenames = true;
}
Ok(profile)
}
/// Strips locations (stack frames) matching the given mappings (substring) or function names
/// (regex). The function bool specifies whether child frames should be stripped as well.
///
/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all
/// string references.
pub fn strip_locations(
mut profile: Profile,
mappings: &[&str],
functions: &[(Regex, bool)],
) -> Profile {
// Strip mappings.
let mut strip_mappings: HashSet<u64> = HashSet::new();
profile.mapping.retain(|mapping| {
let Some(name) = profile.string_table.get(mapping.filename as usize) else {
return true;
};
if mappings.iter().any(|substr| name.contains(substr)) {
strip_mappings.insert(mapping.id);
return false;
}
true
});
// Strip functions.
let mut strip_functions: HashMap<u64, bool> = HashMap::new();
profile.function.retain(|function| {
let Some(name) = profile.string_table.get(function.name as usize) else {
return true;
};
for (regex, strip_children) in functions {
if regex.is_match(name) {
strip_functions.insert(function.id, *strip_children);
return false;
}
}
true
});
// Strip locations. The bool specifies whether child frames should be stripped too.
let mut strip_locations: HashMap<u64, bool> = HashMap::new();
profile.location.retain(|location| {
for line in &location.line {
if let Some(strip_children) = strip_functions.get(&line.function_id) {
strip_locations.insert(location.id, *strip_children);
return false;
}
}
if strip_mappings.contains(&location.mapping_id) {
strip_locations.insert(location.id, false);
return false;
}
true
});
// Strip sample locations.
for sample in &mut profile.sample {
// First, find the uppermost function with child removal and truncate the stack.
if let Some(truncate) = sample
.location_id
.iter()
.rposition(|id| strip_locations.get(id) == Some(&true))
{
sample.location_id.drain(..=truncate);
}
// Next, strip any individual frames without child removal.
sample
.location_id
.retain(|id| !strip_locations.contains_key(id));
}
profile
}
/// Generates an SVG flamegraph from a symbolized pprof profile.
pub fn flamegraph(
profile: Profile,
opts: &mut inferno::flamegraph::Options,
) -> anyhow::Result<Vec<u8>> {
if profile.mapping.iter().any(|m| !m.has_functions) {
bail!("profile not symbolized");
}
// Index locations, functions, and strings.
let locations: HashMap<u64, Location> =
profile.location.into_iter().map(|l| (l.id, l)).collect();
let functions: HashMap<u64, Function> =
profile.function.into_iter().map(|f| (f.id, f)).collect();
let strings = profile.string_table;
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
// since inferno expects it bottom-up.
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
for sample in profile.sample {
let mut stack = Vec::with_capacity(sample.location_id.len());
for location in sample.location_id.into_iter().rev() {
let Some(location) = locations.get(&location) else {
bail!("missing location {location}");
};
for line in location.line.iter().rev() {
let Some(function) = functions.get(&line.function_id) else {
bail!("missing function {}", line.function_id);
};
let Some(name) = strings.get(function.name as usize) else {
bail!("missing string {}", function.name);
};
stack.push(name.as_str());
}
}
let Some(&value) = sample.value.first() else {
bail!("missing value");
};
*stacks.entry(stack).or_default() += value;
}
// Construct stack lines for inferno.
let lines = stacks
.into_iter()
.map(|(stack, value)| (stack.into_iter().join(";"), value))
.map(|(stack, value)| format!("{stack} {value}"))
.sorted()
.collect_vec();
// Construct the flamegraph.
let mut bytes = Vec::new();
let lines = lines.iter().map(|line| line.as_str());
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
Ok(bytes)
}

View File

@@ -1,155 +0,0 @@
use std::{error::Error, sync::Arc};
use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
use crate::error::ApiError;
/// A simple HTTP server over hyper library.
/// You may want to use it instead of [`hyper0::server::Server`] because:
/// 1. hyper0's Server was removed from hyper v1.
/// It's recommended to replace hyepr0's Server with a manual loop, which is done here.
/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way
/// to support it efficiently with the Accept trait that hyper0's Server uses.
/// That's one of the reasons why it was removed from v1.
/// <https://github.com/hyperium/hyper/blob/115339d3df50f20c8717680aa35f48858e9a6205/docs/ROADMAP.md#higher-level-client-and-server-problems>
pub struct Server {
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: tokio::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
}
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
listener: std::net::TcpListener,
tls_acceptor: Option<TlsAcceptor>,
) -> anyhow::Result<Self> {
// Note: caller of from_std is responsible for setting nonblocking mode.
listener.set_nonblocking(true)?;
let listener = tokio::net::TcpListener::from_std(listener)?;
Ok(Self {
request_service,
listener,
tls_acceptor,
})
}
pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> {
fn suppress_io_error(err: &std::io::Error) -> bool {
use std::io::ErrorKind::*;
matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe)
}
fn suppress_hyper_error(err: &hyper0::Error) -> bool {
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
return true;
}
if let Some(inner) = err.source() {
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
return suppress_io_error(io);
}
}
false
}
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
stream = self.listener.accept() => {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
continue;
}
};
let service = self.request_service.build(remote_addr);
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
};
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
if !suppress_io_error(&err) {
info!("Failed to accept TLS connection: {err:#}");
}
return;
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTPS connection: {err:#}");
}
}
}
None => {
// Handle HTTP connection.
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
if !suppress_hyper_error(&err) {
info!("Failed to serve HTTP connection: {err:#}");
}
}
}
};
}));
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
_ = cancel.cancelled() => {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
error!("Connection panicked: {err:#}");
}
}
break;
}
}
}
Ok(())
}
/// Serves HTTP connection with graceful shutdown.
async fn serve_connection<I>(
io: I,
service: RequestService<Body, ApiError>,
cancel: CancellationToken,
) -> Result<(), hyper0::Error>
where
I: AsyncRead + AsyncWrite + Unpin + Send + 'static,
{
let mut conn = Http::new().serve_connection(io, service).with_upgrades();
tokio::select! {
res = &mut conn => res,
_ = cancel.cancelled() => {
Pin::new(&mut conn).graceful_shutdown();
// Note: connection should still be awaited for graceful shutdown to complete.
conn.await
}
}
}
}

View File

@@ -35,7 +35,6 @@ pub struct NodeMetadata {
pub postgres_port: u16,
pub http_host: String,
pub http_port: u16,
pub https_port: Option<u16>,
// Deployment tools may write fields to the metadata file beyond what we
// use in this type: this type intentionally only names fields that require.
@@ -58,9 +57,6 @@ pub struct ConfigToml {
// types mapped 1:1 into the runtime PageServerConfig type
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub listen_https_addr: Option<String>,
pub ssl_key_file: Utf8PathBuf,
pub ssl_cert_file: Utf8PathBuf,
pub availability_zone: Option<String>,
#[serde(with = "humantime_serde")]
pub wait_lsn_timeout: Duration,
@@ -127,10 +123,6 @@ pub struct ConfigToml {
pub enable_read_path_debugging: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub validate_wal_contiguity: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub load_previous_heatmap: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -425,9 +417,6 @@ pub mod defaults {
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Vanilla;
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
impl Default for ConfigToml {
@@ -437,9 +426,6 @@ impl Default for ConfigToml {
Self {
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
listen_https_addr: (None),
ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
availability_zone: (None),
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
.expect("cannot parse default wait lsn timeout")),
@@ -537,8 +523,6 @@ impl Default for ConfigToml {
None
},
validate_wal_contiguity: None,
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
}
}
}

View File

@@ -16,30 +16,6 @@ fn test_node_metadata_v1_backward_compatibilty() {
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: None,
other: HashMap::new(),
}
)
}
#[test]
fn test_node_metadata_v2_backward_compatibilty() {
let v2 = serde_json::to_vec(&serde_json::json!({
"host": "localhost",
"port": 23,
"http_host": "localhost",
"http_port": 42,
"https_port": 123,
}));
assert_eq!(
serde_json::from_slice::<NodeMetadata>(&v2.unwrap()).unwrap(),
NodeMetadata {
postgres_host: "localhost".to_string(),
postgres_port: 23,
http_host: "localhost".to_string(),
http_port: 42,
https_port: Some(123),
other: HashMap::new(),
}
)

View File

@@ -182,66 +182,20 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
/// Optionally, callers may specify the node they are migrating _from_, and the server will
/// reject the request if the shard is no longer attached there: this enables writing safer
/// clients that don't risk fighting with some other movement of the shard.
#[serde(default)]
pub origin_node_id: Option<NodeId>,
#[serde(default)]
pub migration_config: MigrationConfig,
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
/// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling
/// score: this is usually not what you want, and if you use this then you'll also need to set the
/// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration.
///
/// Default: false
#[serde(default)]
pub override_scheduler: bool,
/// If true, the migration will be done gracefully by creating a secondary location first and
/// waiting for it to warm up before cutting over. If false, if there is no existing secondary
/// location at the destination, the tenant will be migrated immediately. If the tenant's data
/// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go
/// ahead but run with a cold cache that can severely reduce performance until it warms up.
///
/// When doing a graceful migration, the migration API returns as soon as it is started.
///
/// Default: true
#[serde(default = "default_prewarm")]
pub prewarm: bool,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// overall for secondary warmup before cutting over
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// within each secondary download poll call to pageserver.
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
fn default_prewarm() -> bool {
true
}
impl Default for MigrationConfig {
fn default() -> Self {
Self {
override_scheduler: false,
prewarm: default_prewarm(),
secondary_warmup_timeout: None,
secondary_download_request_timeout: None,
}
}
}
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
@@ -489,7 +443,6 @@ pub struct SafekeeperDescribeResponse {
pub host: String,
pub port: i32,
pub http_port: i32,
pub https_port: Option<i32>,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
@@ -534,43 +487,4 @@ mod test {
err
);
}
/// Check that a minimal migrate request with no config results in the expected default settings
#[test]
fn test_migrate_request_decode_defaults() {
let json = r#"{
"node_id": 123
}"#;
let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap();
assert_eq!(request.node_id, NodeId(123));
assert_eq!(request.origin_node_id, None);
assert!(!request.migration_config.override_scheduler);
assert!(request.migration_config.prewarm);
assert_eq!(request.migration_config.secondary_warmup_timeout, None);
assert_eq!(
request.migration_config.secondary_download_request_timeout,
None
);
}
/// Check that a partially specified migration config results in the expected default settings
#[test]
fn test_migration_config_decode_defaults() {
// Specify just one field of the config
let json = r#"{
}"#;
let config: MigrationConfig = serde_json::from_str(json).unwrap();
// Check each field's expected default value
assert!(!config.override_scheduler);
assert!(config.prewarm);
assert_eq!(config.secondary_warmup_timeout, None);
assert_eq!(config.secondary_download_request_timeout, None);
assert_eq!(config.secondary_warmup_timeout, None);
// Consistency check that the Default impl agrees with our serde defaults
assert_eq!(MigrationConfig::default(), config);
}
}

View File

@@ -274,31 +274,6 @@ pub struct TimelineCreateRequest {
pub mode: TimelineCreateRequestMode,
}
/// Storage controller specific extensions to [`TimelineInfo`].
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateResponseStorcon {
#[serde(flatten)]
pub timeline_info: TimelineInfo,
pub safekeepers: Option<SafekeepersInfo>,
}
/// Safekeepers as returned in timeline creation request to storcon or pushed to
/// cplane in the post migration hook.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeepersInfo {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub generation: u32,
pub safekeepers: Vec<SafekeeperInfo>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperInfo {
pub id: NodeId,
pub hostname: String,
}
#[derive(Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum TimelineCreateRequestMode {
@@ -1171,15 +1146,6 @@ pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelinePatchIndexPartRequest {
pub rel_size_migration: Option<RelSizeMigration>,
pub gc_compaction_last_completed_lsn: Option<Lsn>,
pub applied_gc_cutoff_lsn: Option<Lsn>,
#[serde(default)]
pub force_index_update: bool,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelinesInfoAndOffloaded {
pub timelines: Vec<TimelineInfo>,
@@ -1199,21 +1165,6 @@ pub struct OffloadedTimelineInfo {
pub archived_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
@@ -1292,11 +1243,7 @@ pub struct TimelineInfo {
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
// read.
/// Whether the timeline is archived.
pub is_archived: Option<bool>,
/// The status of the rel_size migration.
pub rel_size_migration: Option<RelSizeMigration>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -1476,14 +1423,8 @@ pub struct TenantScanRemoteStorageResponse {
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
/// Total size of layers on local disk for all timelines in a shard.
ResidentSize,
/// The logical size of the largest timeline within a _tenant_ (not shard). Only tracked on
/// shard 0, contains the sum across all shards.
MaxLogicalSize,
/// The logical size of the largest timeline within a _tenant_ (not shard), divided by number of
/// shards. Only tracked on shard 0, and estimates the per-shard logical size.
MaxLogicalSizePerShard,
}
impl Default for TenantSorting {
@@ -1513,20 +1454,14 @@ pub struct TopTenantShardsRequest {
pub struct TopTenantShardItem {
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this shard.
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this shard.
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard). This is only
/// tracked on shard 0, and contains the sum of the logical size across all shards.
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
/// The largest logical size of a timeline within this _tenant_ (not shard) divided by number of
/// shards. This is only tracked on shard 0, and is only an estimate as we divide it evenly by
/// shard count, rounded up.
pub max_logical_size_per_shard: u64,
}
#[derive(Serialize, Deserialize, Debug, Default)]

View File

@@ -396,14 +396,6 @@ pub mod waldecoder {
self.lsn + self.inputbuf.remaining() as u64
}
/// Returns the LSN up to which the WAL decoder has processed.
///
/// If [`Self::poll_decode`] returned a record, then this will return
/// the end LSN of said record.
pub fn lsn(&self) -> Lsn {
self.lsn
}
pub fn feed_bytes(&mut self, buf: &[u8]) {
self.inputbuf.extend_from_slice(buf);
}

View File

@@ -34,13 +34,8 @@ where
.make_tls_connect(hostname)
.map_err(|e| Error::tls(e.into()))?;
let socket = connect_socket::connect_socket(
config.host_addr,
&config.host,
config.port,
config.connect_timeout,
)
.await?;
let socket =
connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
}

View File

@@ -1,6 +1,5 @@
use std::collections::HashMap;
use std::fmt;
use std::net::IpAddr;
use std::sync::Arc;
use std::task::{Context, Poll};
use std::time::Duration;
@@ -138,7 +137,6 @@ impl InnerClient {
#[derive(Clone, Serialize, Deserialize)]
pub struct SocketConfig {
pub host_addr: Option<IpAddr>,
pub host: Host,
pub port: u16,
pub connect_timeout: Option<Duration>,
@@ -284,18 +282,6 @@ impl Client {
simple_query::batch_execute(self.inner(), query).await
}
pub async fn discard_all(&self) -> Result<ReadyForQueryStatus, Error> {
// clear the prepared statements that are about to be nuked from the postgres session
{
let mut typeinfo = self.inner.cached_typeinfo.lock();
typeinfo.typeinfo = None;
typeinfo.typeinfo_composite = None;
typeinfo.typeinfo_enum = None;
}
self.batch_execute("discard all").await
}
/// Begins a new database transaction.
///
/// The transaction will roll back by default - use the `commit` method to commit it.

View File

@@ -1,6 +1,5 @@
//! Connection configuration.
use std::net::IpAddr;
use std::time::Duration;
use std::{fmt, str};
@@ -66,7 +65,6 @@ pub enum AuthKeys {
/// Connection configuration.
#[derive(Clone, PartialEq, Eq)]
pub struct Config {
pub(crate) host_addr: Option<IpAddr>,
pub(crate) host: Host,
pub(crate) port: u16,
@@ -85,7 +83,6 @@ impl Config {
/// Creates a new configuration.
pub fn new(host: String, port: u16) -> Config {
Config {
host_addr: None,
host: Host::Tcp(host),
port,
password: None,
@@ -166,15 +163,6 @@ impl Config {
self
}
pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config {
self.host_addr = Some(addr);
self
}
pub fn get_host_addr(&self) -> Option<IpAddr> {
self.host_addr
}
/// Sets the SSL configuration.
///
/// Defaults to `prefer`.

View File

@@ -1,5 +1,3 @@
use std::net::IpAddr;
use postgres_protocol2::message::backend::Message;
use tokio::net::TcpStream;
use tokio::sync::mpsc;
@@ -27,14 +25,13 @@ where
.make_tls_connect(hostname)
.map_err(|e| Error::tls(e.into()))?;
match connect_once(config.host_addr, &config.host, config.port, tls, config).await {
match connect_once(&config.host, config.port, tls, config).await {
Ok((client, connection)) => Ok((client, connection)),
Err(e) => Err(e),
}
}
async fn connect_once<T>(
host_addr: Option<IpAddr>,
host: &Host,
port: u16,
tls: T,
@@ -43,7 +40,7 @@ async fn connect_once<T>(
where
T: TlsConnect<TcpStream>,
{
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
let socket = connect_socket(host, port, config.connect_timeout).await?;
let RawConnection {
stream,
parameters,
@@ -53,7 +50,6 @@ where
} = connect_raw(socket, tls, config).await?;
let socket_config = SocketConfig {
host_addr,
host: host.clone(),
port,
connect_timeout: config.connect_timeout,

View File

@@ -1,6 +1,5 @@
use std::future::Future;
use std::io;
use std::net::{IpAddr, SocketAddr};
use std::time::Duration;
use tokio::net::{self, TcpStream};
@@ -10,20 +9,15 @@ use crate::Error;
use crate::config::Host;
pub(crate) async fn connect_socket(
host_addr: Option<IpAddr>,
host: &Host,
port: u16,
connect_timeout: Option<Duration>,
) -> Result<TcpStream, Error> {
match host {
Host::Tcp(host) => {
let addrs = match host_addr {
Some(addr) => vec![SocketAddr::new(addr, port)],
None => net::lookup_host((&**host, port))
.await
.map_err(Error::connect)?
.collect(),
};
let addrs = net::lookup_host((&**host, port))
.await
.map_err(Error::connect)?;
let mut last_err = None;

View File

@@ -131,14 +131,6 @@ impl Configuration {
}
}
pub fn new(members: MemberSet) -> Self {
Configuration {
generation: INITIAL_GENERATION,
members,
new_members: None,
}
}
/// Is `sk_id` member of the configuration?
pub fn contains(&self, sk_id: NodeId) -> bool {
self.members.contains(sk_id) || self.new_members.as_ref().is_some_and(|m| m.contains(sk_id))

View File

@@ -18,7 +18,7 @@ pub struct SafekeeperStatus {
pub id: NodeId,
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -283,7 +283,7 @@ pub struct SafekeeperUtilization {
}
/// pull_timeline request body.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct PullTimelineRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,

View File

@@ -15,6 +15,7 @@ arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
anyhow.workspace = true
backtrace.workspace = true
bincode.workspace = true
bytes.workspace = true
camino.workspace = true

View File

@@ -3,24 +3,20 @@ use std::env;
use sentry::ClientInitGuard;
pub use sentry::release_name;
use tracing::{error, info};
#[must_use]
pub fn init_sentry(
release_name: Option<Cow<'static, str>>,
extra_options: &[(&str, &str)],
) -> Option<ClientInitGuard> {
let Ok(dsn) = env::var("SENTRY_DSN") else {
info!("not initializing Sentry, no SENTRY_DSN given");
return None;
};
let dsn = env::var("SENTRY_DSN").ok()?;
let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into());
let guard = sentry::init((
dsn,
sentry::ClientOptions {
release: release_name.clone(),
environment: Some(environment.clone().into()),
release: release_name,
environment: Some(environment.into()),
..Default::default()
},
));
@@ -29,19 +25,5 @@ pub fn init_sentry(
scope.set_extra(key, value.into());
}
});
if let Some(dsn) = guard.dsn() {
info!(
"initialized Sentry for project {}, environment {}, release {} (using API {})",
dsn.project_id(),
environment,
release_name.unwrap_or(Cow::Borrowed("None")),
dsn.envelope_api_url(),
);
} else {
// This should panic during sentry::init(), but we may as well cover it.
error!("failed to initialize Sentry, invalid DSN");
}
Some(guard)
}

View File

@@ -48,9 +48,6 @@ pprof.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
rustls-pemfile.workspace = true
rustls-pki-types.workspace = true
rustls.workspace = true
scopeguard.workspace = true
send-future.workspace = true
serde.workspace = true
@@ -65,7 +62,6 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util"
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
@@ -102,7 +98,6 @@ criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
indoc.workspace = true
uuid.workspace = true
[[bench]]
name = "bench_layer_map"
@@ -120,10 +115,6 @@ harness = false
name = "upload_queue"
harness = false
[[bench]]
name = "bench_metrics"
harness = false
[[bin]]
name = "test_helper_slow_client_reads"
required-features = [ "testing" ]

View File

@@ -7,6 +7,7 @@ use std::time::Instant;
use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
use pageserver_api::key::Key;
@@ -71,6 +72,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
.collect()
}
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer,
// no matter what lsn range it covers. This is just the easiest
// thing to do. A better thing to do would be to get a real
// partitioning from some database. Even better, remove the need
// for key partitions by deciding where to create image layers
// directly based on a coverage-based difficulty map.
let mut keys: Vec<_> = layer_map
.iter_historic_layers()
.filter_map(|l| {
if l.is_incremental() {
None
} else {
let kr = l.get_key_range();
Some(kr.start.next())
}
})
.collect();
keys.sort();
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
for key in keys {
parts.push(KeySpace {
ranges: vec![current_key..key],
});
current_key = key;
}
KeyPartitioning { parts }
}
// Benchmark using metadata extracted from our performance test environment, from
// a project where we have run pgbench many timmes. The pgbench database was initialized
// between each test run.
@@ -112,6 +148,41 @@ fn bench_from_real_project(c: &mut Criterion) {
// Choose uniformly distributed queries
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Choose inputs for get_difficulty_map
let latest_lsn = layer_map
.iter_historic_layers()
.map(|l| l.get_lsn_range().end)
.max()
.unwrap();
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
// Check correctness of get_difficulty_map
// TODO put this in a dedicated test outside of this mod
{
println!("running correctness check");
let now = Instant::now();
let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
assert!(result_bruteforce.len() == partitioning.parts.len());
println!("Finished bruteforce in {:?}", now.elapsed());
let now = Instant::now();
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
assert!(result_fast.len() == partitioning.parts.len());
println!("Finished fast in {:?}", now.elapsed());
// Assert results are equal. Manually iterate for easier debugging.
let zip = std::iter::zip(
&partitioning.parts,
std::iter::zip(result_bruteforce, result_fast),
);
for (_part, (bruteforce, fast)) in zip {
assert_eq!(bruteforce, fast);
}
println!("No issues found");
}
// Define and name the benchmark function
let mut group = c.benchmark_group("real_map");
group.bench_function("uniform_queries", |b| {
@@ -121,6 +192,11 @@ fn bench_from_real_project(c: &mut Criterion) {
}
});
});
group.bench_function("get_difficulty_map", |b| {
b.iter(|| {
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
});
});
group.finish();
}

View File

@@ -1,366 +0,0 @@
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use utils::id::{TenantId, TimelineId};
//
// Demonstrates that repeat label values lookup is a multicore scalability bottleneck
// that is worth avoiding.
//
criterion_group!(
label_values,
label_values::bench_naive_usage,
label_values::bench_cache_label_values_lookup
);
mod label_values {
use super::*;
pub fn bench_naive_usage(c: &mut Criterion) {
let mut g = c.benchmark_group("label_values__naive_usage");
for ntimelines in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("ntimelines", ntimelines),
&ntimelines,
|b, ntimelines| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*ntimelines + 1);
let timelines = (0..*ntimelines)
.map(|_| {
(
TenantId::generate().to_string(),
"0000".to_string(),
TimelineId::generate().to_string(),
)
})
.collect::<Vec<_>>();
let metric_vec = metrics::UIntGaugeVec::new(
metrics::opts!("testmetric", "testhelp"),
&["tenant_id", "shard_id", "timeline_id"],
)
.unwrap();
std::thread::scope(|s| {
for (tenant_id, shard_id, timeline_id) in &timelines {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
metric_vec
.with_label_values(&[tenant_id, shard_id, timeline_id])
.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
pub fn bench_cache_label_values_lookup(c: &mut Criterion) {
let mut g = c.benchmark_group("label_values__cache_label_values_lookup");
for ntimelines in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("ntimelines", ntimelines),
&ntimelines,
|b, ntimelines| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*ntimelines + 1);
let timelines = (0..*ntimelines)
.map(|_| {
(
TenantId::generate().to_string(),
"0000".to_string(),
TimelineId::generate().to_string(),
)
})
.collect::<Vec<_>>();
let metric_vec = metrics::UIntGaugeVec::new(
metrics::opts!("testmetric", "testhelp"),
&["tenant_id", "shard_id", "timeline_id"],
)
.unwrap();
std::thread::scope(|s| {
for (tenant_id, shard_id, timeline_id) in &timelines {
s.spawn(|| {
let metric = metric_vec.with_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
barrier.wait();
for _ in 0..iters {
metric.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
}
//
// Demonstrates that even a single metric can be a scalability bottleneck
// if multiple threads in it concurrently but there's nothing we can do
// about it without changing the metrics framework to use e.g. sharded counte atomics.
//
criterion_group!(
single_metric_multicore_scalability,
single_metric_multicore_scalability::bench,
);
mod single_metric_multicore_scalability {
use super::*;
pub fn bench(c: &mut Criterion) {
let mut g = c.benchmark_group("single_metric_multicore_scalability");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
metric.inc();
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
}
//
// Demonstrates that even if we cache label value, the propagation of such a cached metric value
// by Clone'ing it is a scalability bottleneck.
// The reason is that it's an Arc internally and thus there's contention on the reference count atomics.
//
// We can avoid that by having long-lived references per thread (= indirection).
//
criterion_group!(
propagation_of_cached_label_value,
propagation_of_cached_label_value::bench_naive,
propagation_of_cached_label_value::bench_long_lived_reference_per_thread,
);
mod propagation_of_cached_label_value {
use std::sync::Arc;
use super::*;
pub fn bench_naive(c: &mut Criterion) {
let mut g = c.benchmark_group("propagation_of_cached_label_value__naive");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
barrier.wait();
for _ in 0..iters {
// propagating the metric means we'd clone it into the child RequestContext
let propagated = metric.clone();
// simulate some work
criterion::black_box(propagated);
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
g.finish();
}
pub fn bench_long_lived_reference_per_thread(c: &mut Criterion) {
let mut g =
c.benchmark_group("propagation_of_cached_label_value__long_lived_reference_per_thread");
for nthreads in [1, 4, 8] {
g.bench_with_input(
BenchmarkId::new("nthreads", nthreads),
&nthreads,
|b, nthreads| {
b.iter_custom(|iters| {
let barrier = std::sync::Barrier::new(*nthreads + 1);
let metric = metrics::UIntGauge::new("testmetric", "testhelp").unwrap();
std::thread::scope(|s| {
for _ in 0..*nthreads {
s.spawn(|| {
// This is the technique.
let this_threads_metric_reference = Arc::new(metric.clone());
barrier.wait();
for _ in 0..iters {
// propagating the metric means we'd clone it into the child RequestContext
let propagated = Arc::clone(&this_threads_metric_reference);
// simulate some work (include the pointer chase!)
criterion::black_box(&*propagated);
}
barrier.wait();
});
}
barrier.wait();
let start = std::time::Instant::now();
barrier.wait();
start.elapsed()
})
})
},
);
}
}
}
criterion_main!(
label_values,
single_metric_multicore_scalability,
propagation_of_cached_label_value
);
/*
RUST_BACKTRACE=full cargo bench --bench bench_metrics -- --discard-baseline --noplot
Results on an im4gn.2xlarge instance
label_values__naive_usage/ntimelines/1 time: [178.71 ns 178.74 ns 178.76 ns]
label_values__naive_usage/ntimelines/4 time: [532.94 ns 539.59 ns 546.31 ns]
label_values__naive_usage/ntimelines/8 time: [1.1082 µs 1.1109 µs 1.1135 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [6.4116 ns 6.4119 ns 6.4123 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [6.3482 ns 6.3819 ns 6.4079 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [6.4213 ns 6.5279 ns 6.6293 ns]
single_metric_multicore_scalability/nthreads/1 time: [6.0102 ns 6.0104 ns 6.0106 ns]
single_metric_multicore_scalability/nthreads/4 time: [38.127 ns 38.275 ns 38.416 ns]
single_metric_multicore_scalability/nthreads/8 time: [73.698 ns 74.882 ns 75.864 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [14.424 ns 14.425 ns 14.426 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [100.71 ns 102.53 ns 104.35 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns 216.87 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns]
Results on an i3en.3xlarge instance
label_values__naive_usage/ntimelines/1 time: [117.32 ns 117.53 ns 117.74 ns]
label_values__naive_usage/ntimelines/4 time: [736.58 ns 741.12 ns 745.61 ns]
label_values__naive_usage/ntimelines/8 time: [1.4513 µs 1.4596 µs 1.4665 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [8.0964 ns 8.0979 ns 8.0995 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [8.1620 ns 8.2912 ns 8.4491 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [14.148 ns 14.237 ns 14.324 ns]
single_metric_multicore_scalability/nthreads/1 time: [8.0993 ns 8.1013 ns 8.1046 ns]
single_metric_multicore_scalability/nthreads/4 time: [80.039 ns 80.672 ns 81.297 ns]
single_metric_multicore_scalability/nthreads/8 time: [153.58 ns 154.23 ns 154.90 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [13.924 ns 13.926 ns 13.928 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [143.66 ns 145.27 ns 146.59 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [296.51 ns 297.90 ns 299.30 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.013 ns 14.149 ns 14.308 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.311 ns 14.625 ns 14.984 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [25.981 ns 26.227 ns 26.476 ns]
Results on an Standard L16s v3 (16 vcpus, 128 GiB memory) Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
label_values__naive_usage/ntimelines/1 time: [101.63 ns 101.84 ns 102.06 ns]
label_values__naive_usage/ntimelines/4 time: [417.55 ns 424.73 ns 432.63 ns]
label_values__naive_usage/ntimelines/8 time: [874.91 ns 889.51 ns 904.25 ns]
label_values__cache_label_values_lookup/ntimelines/1 time: [5.7724 ns 5.7760 ns 5.7804 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [7.8878 ns 7.9401 ns 8.0034 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [7.2621 ns 7.6354 ns 8.0337 ns]
single_metric_multicore_scalability/nthreads/1 time: [5.7710 ns 5.7744 ns 5.7785 ns]
single_metric_multicore_scalability/nthreads/4 time: [66.629 ns 66.994 ns 67.336 ns]
single_metric_multicore_scalability/nthreads/8 time: [130.85 ns 131.98 ns 132.91 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [11.540 ns 11.546 ns 11.553 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [131.22 ns 131.90 ns 132.56 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [260.99 ns 262.75 ns 264.26 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [11.544 ns 11.550 ns 11.557 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [11.568 ns 11.642 ns 11.763 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [13.416 ns 14.121 ns 14.886 ns
Results on an M4 MAX MacBook Pro Total Number of Cores: 14 (10 performance and 4 efficiency)
label_values__naive_usage/ntimelines/1 time: [52.711 ns 53.026 ns 53.381 ns]
label_values__naive_usage/ntimelines/4 time: [323.99 ns 330.40 ns 337.53 ns]
label_values__naive_usage/ntimelines/8 time: [1.1615 µs 1.1998 µs 1.2399 µs]
label_values__cache_label_values_lookup/ntimelines/1 time: [1.6635 ns 1.6715 ns 1.6809 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [1.7786 ns 1.7876 ns 1.8028 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [1.8195 ns 1.8371 ns 1.8665 ns]
single_metric_multicore_scalability/nthreads/1 time: [1.7764 ns 1.7909 ns 1.8079 ns]
single_metric_multicore_scalability/nthreads/4 time: [33.875 ns 34.868 ns 35.923 ns]
single_metric_multicore_scalability/nthreads/8 time: [226.85 ns 235.30 ns 244.18 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [3.4337 ns 3.4491 ns 3.4660 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [69.486 ns 71.937 ns 74.472 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.47 ns 477.84 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns]
Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor
label_values__naive_usage/ntimelines/1 time: [64.510 ns 64.559 ns 64.610 ns]
label_values__naive_usage/ntimelines/4 time: [309.71 ns 326.09 ns 342.32 ns]
label_values__naive_usage/ntimelines/8 time: [776.92 ns 819.35 ns 856.93 ns]
label_values__cache_label_values_lookup/ntimelines/1 time: [1.2855 ns 1.2943 ns 1.3021 ns]
label_values__cache_label_values_lookup/ntimelines/4 time: [1.3865 ns 1.4139 ns 1.4441 ns]
label_values__cache_label_values_lookup/ntimelines/8 time: [1.5311 ns 1.5669 ns 1.6046 ns]
single_metric_multicore_scalability/nthreads/1 time: [1.1927 ns 1.1981 ns 1.2049 ns]
single_metric_multicore_scalability/nthreads/4 time: [24.346 ns 25.439 ns 26.634 ns]
single_metric_multicore_scalability/nthreads/8 time: [58.666 ns 60.137 ns 61.486 ns]
propagation_of_cached_label_value__naive/nthreads/1 time: [2.7067 ns 2.7238 ns 2.7402 ns]
propagation_of_cached_label_value__naive/nthreads/4 time: [62.723 ns 66.214 ns 69.787 ns]
propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.10 ns 175.68 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns]
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns]
*/

View File

@@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody;
use pageserver_api::models::*;
use pageserver_api::shard::TenantShardId;
pub use reqwest::Body as ReqwestBody;
use reqwest::{Certificate, IntoUrl, Method, StatusCode};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -38,9 +38,6 @@ pub enum Error {
#[error("Cancelled")]
Cancelled,
#[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
CreateClient(reqwest::Error),
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -72,17 +69,8 @@ pub enum ForceAwaitLogicalSize {
}
impl Client {
pub fn new(
mgmt_api_endpoint: String,
jwt: Option<&str>,
ssl_ca_cert: Option<Certificate>,
) -> Result<Self> {
let mut http_client = reqwest::Client::builder();
if let Some(ssl_ca_cert) = ssl_ca_cert {
http_client = http_client.add_root_certificate(ssl_ca_cert);
}
let http_client = http_client.build().map_err(Error::CreateClient)?;
Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt))
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
}
pub fn from_client(
@@ -113,10 +101,12 @@ impl Client {
debug_assert!(path.starts_with('/'));
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
let mut req = self.client.request(Method::GET, uri);
if let Some(value) = &self.authorization_header {
req = req.header(reqwest::header::AUTHORIZATION, value);
}
let req = self.client.request(Method::GET, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.send().await.map_err(Error::ReceiveBody)
}
@@ -490,7 +480,6 @@ impl Client {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
concurrency: Option<usize>,
recurse: bool,
) -> Result<()> {
let mut path = reqwest::Url::parse(&format!(
"{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
@@ -498,9 +487,6 @@ impl Client {
))
.expect("Cannot build URL");
path.query_pairs_mut()
.append_pair("recurse", &format!("{}", recurse));
if let Some(concurrency) = concurrency {
path.query_pairs_mut()
.append_pair("concurrency", &format!("{}", concurrency));

View File

@@ -15,7 +15,6 @@ hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
rand.workspace = true
reqwest.workspace=true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true

View File

@@ -36,8 +36,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

View File

@@ -77,8 +77,7 @@ async fn main_impl(
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

View File

@@ -125,8 +125,7 @@ async fn main_impl(
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
if let Some(engine_str) = &args.set_io_engine {
mgmt_api_client.put_io_engine(engine_str).await?;

View File

@@ -83,8 +83,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
if let Some(engine_str) = &args.set_io_engine {
mgmt_api_client.put_io_engine(engine_str).await?;

View File

@@ -40,8 +40,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
None, // TODO: support ssl_ca_file for https APIs in pagebench.
)?);
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(

View File

@@ -33,9 +33,8 @@ use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
@@ -43,26 +42,6 @@ pub enum BasebackupError {
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#} when {1}")]
Client(#[source] io::Error, &'static str),
#[error("basebackup during shutdown")]
Shutdown,
}
impl From<PageReconstructError> for BasebackupError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => BasebackupError::Shutdown,
err => BasebackupError::Server(err.into()),
}
}
}
impl From<GetVectoredError> for BasebackupError {
fn from(value: GetVectoredError) -> Self {
match value {
GetVectoredError::Cancelled => BasebackupError::Shutdown,
err => BasebackupError::Server(err.into()),
}
}
}
/// Create basebackup with non-rel data in it.
@@ -148,7 +127,7 @@ where
timeline
.gate
.enter()
.map_err(|_| BasebackupError::Shutdown)?,
.map_err(|e| BasebackupError::Server(e.into()))?,
),
};
basebackup
@@ -344,7 +323,8 @@ where
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await?
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.partition(
self.timeline.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -356,10 +336,11 @@ where
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
for (key, block) in blocks {
let block = block?;
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
slru_builder.add_block(&key, block).await?;
}
}
@@ -368,8 +349,11 @@ where
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
for ((spcnode, dbnode), has_relmap_file) in self
.timeline
.list_dbdirs(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -378,7 +362,8 @@ where
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
@@ -406,7 +391,8 @@ where
let aux_files = self
.timeline
.list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let aux_scan_time = start_time.elapsed();
let aux_estimated_size = aux_files
.values()
@@ -465,14 +451,16 @@ where
for xid in self
.timeline
.list_twophase_files(self.lsn, self.ctx)
.await?
.await
.map_err(|e| BasebackupError::Server(e.into()))?
{
self.add_twophase_file(xid).await?;
}
let repl_origins = self
.timeline
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let n_origins = repl_origins.len();
if n_origins != 0 {
//
@@ -517,7 +505,8 @@ where
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
// If the relation is empty, create an empty file
if nblocks == 0 {
@@ -543,7 +532,8 @@ where
// TODO: investigate using get_vectored for the entire startblk..endblk range.
// But this code path is not on the critical path for most basebackups (?).
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
segment_data.extend_from_slice(&img[..]);
}
@@ -577,7 +567,8 @@ where
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
if img.len()
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -631,7 +622,8 @@ where
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.is_empty()
{
return Ok(());
@@ -682,7 +674,8 @@ where
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
.await?;
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);

View File

@@ -25,12 +25,11 @@ use pageserver::task_mgr::{
};
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
page_cache, page_service, task_mgr, virtual_file,
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service,
task_mgr, virtual_file,
};
use postgres_backend::AuthType;
use remote_storage::GenericRemoteStorage;
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
@@ -344,15 +343,8 @@ fn start_pageserver(
info!("Starting pageserver http handler on {http_addr}");
let http_listener = tcp_listener::bind(http_addr)?;
let https_listener = match conf.listen_https_addr.as_ref() {
Some(https_addr) => {
info!("Starting pageserver https handler on {https_addr}");
Some(tcp_listener::bind(https_addr)?)
}
None => None,
};
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
@@ -583,8 +575,9 @@ fn start_pageserver(
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
let (http_endpoint_listener, https_endpoint_listener) = {
let http_endpoint_listener = {
let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
let cancel = CancellationToken::new();
let router_state = Arc::new(
http::routes::State::new(
@@ -599,51 +592,22 @@ fn start_pageserver(
)
.context("Failed to initialize router state")?,
);
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
.build()
.map_err(|err| anyhow!(err))?;
let service = http_utils::RouterService::new(router).unwrap();
let server = hyper0::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown({
let cancel = cancel.clone();
async move { cancel.clone().cancelled().await }
});
let service =
Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?);
let http_task = {
let server =
http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?;
let cancel = CancellationToken::new();
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"http endpoint listener",
server.serve(cancel.clone()),
));
HttpEndpointListener(CancellableTask { task, cancel })
};
let https_task = match https_listener {
Some(https_listener) => {
let certs = load_certs(&conf.ssl_cert_file)?;
let key = load_private_key(&conf.ssl_key_file)?;
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_single_cert(certs, key)?;
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
let server =
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
let cancel = CancellationToken::new();
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"https endpoint listener",
server.serve(cancel.clone()),
));
Some(HttpsEndpointListener(CancellableTask { task, cancel }))
}
None => None,
};
(http_task, https_task)
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"http endpoint listener",
server,
));
HttpEndpointListener(CancellableTask { task, cancel })
};
let consumption_metrics_tasks = {
@@ -719,7 +683,6 @@ fn start_pageserver(
shutdown_pageserver.cancel();
pageserver::shutdown_pageserver(
http_endpoint_listener,
https_endpoint_listener,
page_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
@@ -734,25 +697,6 @@ fn start_pageserver(
})
}
fn load_certs(filename: &Utf8Path) -> std::io::Result<Vec<CertificateDer<'static>>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
rustls_pemfile::certs(&mut reader).collect()
}
fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
let key = rustls_pemfile::private_key(&mut reader)?;
key.ok_or(anyhow::anyhow!(
"no private key found in {}",
filename.as_str(),
))
}
async fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {

View File

@@ -53,11 +53,6 @@ pub struct PageServerConf {
pub listen_pg_addr: String,
/// Example (default): 127.0.0.1:9898
pub listen_http_addr: String,
/// Example: 127.0.0.1:9899
pub listen_https_addr: Option<String>,
pub ssl_key_file: Utf8PathBuf,
pub ssl_cert_file: Utf8PathBuf,
/// Current availability zone. Used for traffic metrics.
pub availability_zone: Option<String>,
@@ -199,13 +194,6 @@ pub struct PageServerConf {
/// Interpreted protocol feature: if enabled, validate that the logical WAL received from
/// safekeepers does not have gaps.
pub validate_wal_contiguity: bool,
/// When set, the previously written to disk heatmap is loaded on tenant attach and used
/// to avoid clobbering the heatmap from new, cold, attached locations.
pub load_previous_heatmap: bool,
/// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
pub generate_unarchival_heatmap: bool,
}
/// Token for authentication to safekeepers
@@ -322,9 +310,6 @@ impl PageServerConf {
let pageserver_api::config::ConfigToml {
listen_pg_addr,
listen_http_addr,
listen_https_addr,
ssl_key_file,
ssl_cert_file,
availability_zone,
wait_lsn_timeout,
wal_redo_timeout,
@@ -373,8 +358,6 @@ impl PageServerConf {
get_vectored_concurrent_io,
enable_read_path_debugging,
validate_wal_contiguity,
load_previous_heatmap,
generate_unarchival_heatmap,
} = config_toml;
let mut conf = PageServerConf {
@@ -383,9 +366,6 @@ impl PageServerConf {
// ------------------------------------------------------------
listen_pg_addr,
listen_http_addr,
listen_https_addr,
ssl_key_file,
ssl_cert_file,
availability_zone,
wait_lsn_timeout,
wal_redo_timeout,
@@ -467,8 +447,6 @@ impl PageServerConf {
no_sync: no_sync.unwrap_or(false),
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
};
// ------------------------------------------------------------
@@ -502,9 +480,7 @@ impl PageServerConf {
#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
let test_id = uuid::Uuid::new_v4();
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
}
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -517,8 +493,6 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
synthetic_size_calculation_interval: Duration::from_secs(60),
background_task_maximum_delay: Duration::ZERO,
load_previous_heatmap: Some(true),
generate_unarchival_heatmap: Some(true),
..Default::default()
};
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()

View File

@@ -89,112 +89,16 @@
//! [`RequestContext`] argument. Functions in the middle of the call chain
//! only need to pass it on.
use std::sync::Arc;
use once_cell::sync::Lazy;
use tracing::warn;
use utils::{id::TimelineId, shard::TenantShardId};
use crate::{
metrics::{StorageIoSizeMetrics, TimelineMetrics},
task_mgr::TaskKind,
tenant::Timeline,
};
use crate::task_mgr::TaskKind;
// The main structure of this module, see module-level comment.
#[derive(Debug)]
pub struct RequestContext {
task_kind: TaskKind,
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
page_content_kind: PageContentKind,
read_path_debug: bool,
scope: Scope,
}
#[derive(Clone)]
pub(crate) enum Scope {
Global {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTenant {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTimeline {
io_size_metrics: crate::metrics::StorageIoSizeMetrics,
},
Timeline {
// We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
// context creation contending for the ref counters of the Arc<TimelineMetrics>,
// which are shared among all tasks that operate on the timeline, especially
// concurrent page_service connections.
#[allow(clippy::redundant_allocation)]
arc_arc: Arc<Arc<TimelineMetrics>>,
},
#[cfg(test)]
UnitTest {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
}
static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
impl Scope {
pub(crate) fn new_global() -> Self {
Scope::Global {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
/// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
/// of a compaction iteration.
pub(crate) fn new_timeline(timeline: &Timeline) -> Self {
Scope::Timeline {
arc_arc: Arc::new(Arc::clone(&timeline.metrics)),
}
}
pub(crate) fn new_page_service_pagestream(
timeline_handle: &crate::tenant::timeline::handle::Handle<
crate::page_service::TenantManagerTypes,
>,
) -> Self {
Scope::Timeline {
arc_arc: Arc::clone(&timeline_handle.metrics),
}
}
pub(crate) fn new_secondary_timeline(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Self {
// TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let timeline_id = timeline_id.to_string();
let io_size_metrics =
crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
Scope::SecondaryTimeline { io_size_metrics }
}
pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
// Before propagating metrics via RequestContext, the labels were inferred from file path.
// The only user of VirtualFile at tenant scope is the heatmap download & read.
// The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*).
// Thus, we do the same here, and extend that for anything secondary-tenant scoped.
//
// If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future,
// we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown,
// like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
// at this point, so, we were able to completely side-step tenant-scoped stuff there).
Scope::SecondaryTenant {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
#[cfg(test)]
pub(crate) fn new_unit_test() -> Self {
Scope::UnitTest {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
}
/// The kind of access to the page cache.
@@ -253,7 +157,6 @@ impl RequestContextBuilder {
access_stats_behavior: AccessStatsBehavior::Update,
page_content_kind: PageContentKind::Unknown,
read_path_debug: false,
scope: Scope::new_global(),
},
}
}
@@ -268,16 +171,10 @@ impl RequestContextBuilder {
access_stats_behavior: original.access_stats_behavior,
page_content_kind: original.page_content_kind,
read_path_debug: original.read_path_debug,
scope: original.scope.clone(),
},
}
}
pub fn task_kind(mut self, k: TaskKind) -> Self {
self.inner.task_kind = k;
self
}
/// Configure the DownloadBehavior of the context: whether to
/// download missing layers, and/or warn on the download.
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
@@ -302,11 +199,6 @@ impl RequestContextBuilder {
self
}
pub(crate) fn scope(mut self, s: Scope) -> Self {
self.inner.scope = s;
self
}
pub fn build(self) -> RequestContext {
self.inner
}
@@ -389,50 +281,7 @@ impl RequestContext {
}
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
RequestContextBuilder::extend(self)
.task_kind(task_kind)
.download_behavior(download_behavior)
.build()
}
pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_timeline(timeline))
.build()
}
pub(crate) fn with_scope_page_service_pagestream(
&self,
timeline_handle: &crate::tenant::timeline::handle::Handle<
crate::page_service::TenantManagerTypes,
>,
) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_page_service_pagestream(timeline_handle))
.build()
}
pub fn with_scope_secondary_timeline(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
.build()
}
pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_secondary_tenant(tenant_shard_id))
.build()
}
#[cfg(test)]
pub fn with_scope_unit_test(&self) -> Self {
RequestContextBuilder::new(TaskKind::UnitTest)
.scope(Scope::new_unit_test())
.build()
Self::new(task_kind, download_behavior)
}
pub fn task_kind(&self) -> TaskKind {
@@ -454,38 +303,4 @@ impl RequestContext {
pub(crate) fn read_path_debug(&self) -> bool {
self.read_path_debug
}
pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
match &self.scope {
Scope::Global { io_size_metrics } => {
let is_unit_test = cfg!(test);
let is_regress_test_build = cfg!(feature = "testing");
if is_unit_test || is_regress_test_build {
panic!("all VirtualFile instances are timeline-scoped");
} else {
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::Duration;
use utils::rate_limit::RateLimit;
static LIMIT: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
let mut guard = LIMIT.lock().unwrap();
guard.call2(|rate_limit_stats| {
warn!(
%rate_limit_stats,
backtrace=%std::backtrace::Backtrace::force_capture(),
"all VirtualFile instances are timeline-scoped",
);
});
io_size_metrics
}
}
Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
#[cfg(test)]
Scope::UnitTest { io_size_metrics } => io_size_metrics,
}
}
}

View File

@@ -181,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
listen_pg_port: m.postgres_port,
listen_http_addr: m.http_host,
listen_http_port: m.http_port,
listen_https_port: m.https_port,
listen_https_port: None, // TODO: Support https.
availability_zone_id: az_id.expect("Checked above"),
})
}

View File

@@ -842,12 +842,6 @@ paths:
required: false
schema:
type: integer
- name: recurse
description: When set, will recurse with the downloads into ancestor timelines
in: query
required: false
schema:
type: boolean
post:
description: |
Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter

View File

@@ -37,8 +37,7 @@ use pageserver_api::models::{
TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
TopTenantShardsRequest, TopTenantShardsResponse,
TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
};
use pageserver_api::shard::{ShardCount, TenantShardId};
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
@@ -55,7 +54,6 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::config::PageServerConf;
use crate::context;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::deletion_queue::DeletionQueueClient;
use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -65,7 +63,6 @@ use crate::tenant::mgr::{
GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
};
use crate::tenant::remote_timeline_client::index::GcCompactionState;
use crate::tenant::remote_timeline_client::{
download_index_part, list_remote_tenant_shards, list_remote_timelines,
};
@@ -484,7 +481,6 @@ async fn build_timeline_info_common(
state,
is_archived: Some(is_archived),
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
walreceiver_status,
};
@@ -861,75 +857,6 @@ async fn timeline_archival_config_handler(
json_response(StatusCode::OK, ())
}
/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
/// measure only.
///
/// Some examples of safe patches:
/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors.
/// - Force set the index part to use reldir v2 (migrating/migrated).
///
/// Some examples of unsafe patches:
/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
/// errors.
/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
async fn timeline_patch_index_part_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
check_permission(&request, None)?; // require global permission for this request
let state = get_state(&request);
async {
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
if let Some(rel_size_migration) = request_data.rel_size_migration {
timeline
.update_rel_size_v2_status(rel_size_migration)
.map_err(ApiError::InternalServerError)?;
}
if let Some(gc_compaction_last_completed_lsn) =
request_data.gc_compaction_last_completed_lsn
{
timeline
.update_gc_compaction_state(GcCompactionState {
last_completed_lsn: gc_compaction_last_completed_lsn,
})
.map_err(ApiError::InternalServerError)?;
}
if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
{
let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
guard.store_and_unlock(applied_gc_cutoff_lsn);
}
}
if request_data.force_index_update {
timeline
.remote_client
.force_schedule_index_upload()
.context("force schedule index upload")
.map_err(ApiError::InternalServerError)?;
}
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_patch_index_part",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
%timeline_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -954,13 +881,12 @@ async fn timeline_detail_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let ctx = &ctx.with_scope_timeline(&timeline);
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
ctx,
&ctx,
)
.await
.context("get local timeline info")
@@ -1004,8 +930,7 @@ async fn get_lsn_by_timestamp_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
@@ -1077,8 +1002,7 @@ async fn get_timestamp_of_lsn_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
match result {
@@ -1433,8 +1357,7 @@ async fn timeline_layer_scan_disposable_keys(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let guard = timeline.layers.read().await;
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
@@ -1512,7 +1435,6 @@ async fn timeline_download_heatmap_layers_handler(
let desired_concurrency =
parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
@@ -1520,8 +1442,7 @@ async fn timeline_download_heatmap_layers_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let max_concurrency = get_config(&request)
.remote_storage_config
@@ -1530,7 +1451,9 @@ async fn timeline_download_heatmap_layers_handler(
.unwrap_or(DEFAULT_MAX_CONCURRENCY);
let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;
timeline
.start_heatmap_layers_download(concurrency, &ctx)
.await?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -1569,8 +1492,7 @@ async fn layer_download_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let downloaded = timeline
.download_layer(&layer_name, &ctx)
.await
@@ -2306,8 +2228,8 @@ async fn timeline_compact_handler(
.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
if scheduled {
let tenant = state
.tenant_manager
@@ -2414,8 +2336,8 @@ async fn timeline_checkpoint_handler(
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
if wait_until_flushed {
timeline.freeze_and_flush().await
} else {
@@ -2470,8 +2392,7 @@ async fn timeline_download_remote_layers_handler_post(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
match timeline.spawn_download_all_remote_layers(body, &ctx).await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -2554,7 +2475,6 @@ async fn timeline_detach_ancestor_handler(
tracing::info!("all timeline upload queues are drained");
let timeline = tenant.get_timeline(timeline_id, true)?;
let ctx = &ctx.with_scope_timeline(&timeline);
let progress = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
@@ -2661,9 +2581,8 @@ async fn getpage_at_lsn_handler_inner(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
// Enable read path debugging
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
.scope(context::Scope::new_timeline(&timeline)).build();
// Use last_record_lsn if no lsn is provided
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
@@ -2697,8 +2616,8 @@ async fn timeline_collect_keyspace(
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let (dense_ks, sparse_ks) = timeline
.collect_keyspace(at_lsn, &ctx)
@@ -3223,7 +3142,6 @@ async fn post_top_tenants(
match order_by {
TenantSorting::ResidentSize => sizes.resident_size,
TenantSorting::MaxLogicalSize => sizes.max_logical_size,
TenantSorting::MaxLogicalSizePerShard => sizes.max_logical_size_per_shard,
}
}
@@ -3336,7 +3254,7 @@ async fn put_tenant_timeline_import_basebackup(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let (timeline, timeline_ctx) = tenant
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.map_err(ApiError::InternalServerError)
.await?;
@@ -3355,13 +3273,7 @@ async fn put_tenant_timeline_import_basebackup(
info!("importing basebackup");
timeline
.import_basebackup_from_tar(
tenant.clone(),
&mut body,
base_lsn,
broker_client,
&timeline_ctx,
)
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
@@ -3401,7 +3313,6 @@ async fn put_tenant_timeline_import_wal(
let state = get_state(&request);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
@@ -3718,10 +3629,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
|r| api_handler(r, timeline_patch_index_part_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|r| api_handler(r, lsn_lease_handler),

View File

@@ -64,7 +64,6 @@ pub struct CancellableTask {
pub cancel: CancellationToken,
}
pub struct HttpEndpointListener(pub CancellableTask);
pub struct HttpsEndpointListener(pub CancellableTask);
pub struct ConsumptionMetricsTasks(pub CancellableTask);
pub struct DiskUsageEvictionTask(pub CancellableTask);
impl CancellableTask {
@@ -78,7 +77,6 @@ impl CancellableTask {
#[allow(clippy::too_many_arguments)]
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
https_listener: Option<HttpsEndpointListener>,
page_service: page_service::Listener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
@@ -215,15 +213,6 @@ pub async fn shutdown_pageserver(
)
.await;
if let Some(https_listener) = https_listener {
timed(
https_listener.0.shutdown(),
"shutdown https",
Duration::from_secs(1),
)
.await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.

Some files were not shown because too many files have changed in this diff Show More