Compute release 2025-03-07

GitHub PR neondatabase/neon#11129
This commit is contained in:
Matthias van de Meent
2025-03-10 14:42:44 +01:00
committed by GitHub
157 changed files with 5680 additions and 2427 deletions

View File

@@ -32,3 +32,4 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID

View File

@@ -84,7 +84,13 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)
role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
@@ -107,13 +113,13 @@ runs:
)
if [ -z "${reset_password}" ]; then
sleep 1
sleep $i
continue
fi
password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep 1
sleep $i # increasing backoff
continue
fi

View File

@@ -44,6 +44,11 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -59,7 +64,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
@@ -112,6 +117,7 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report

View File

@@ -280,7 +280,7 @@ jobs:
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
@@ -347,6 +347,7 @@ jobs:
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
sanitizers: ${{ inputs.sanitizers }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
@@ -359,7 +360,6 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -141,6 +141,8 @@ jobs:
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

View File

@@ -692,15 +692,15 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
vm-compute-node-image:
vm-compute-node-image-arch:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: [ self-hosted, large ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
strategy:
fail-fast: false
matrix:
arch: [ amd64, arm64 ]
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
debian: bullseye
- pg: v15
@@ -717,7 +717,7 @@ jobs:
- name: Downloading vm-builder
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
chmod +x vm-builder
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -738,12 +738,37 @@ jobs:
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-target-arch=linux/amd64
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
-target-arch=linux/${{ matrix.arch }}
- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
- pg: v15
- pg: v16
- pg: v17
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
@@ -1036,7 +1061,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:

View File

@@ -52,8 +52,9 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
env:
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
NEW_COMPUTE_TAG: latest
OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh

View File

@@ -0,0 +1,147 @@
name: large oltp benchmark
on:
# uncomment to run on push for debugging your PR
push:
branches: [ bodobolero/synthetic_oltp_workload ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: true
jobs:
oltp:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Branch for large tenant
if: ${{ matrix.target == 'new_branch' }}
id: create-neon-branch-oltp-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Periodic large oltp perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
# ┌───────────── minute (0 - 59)
# ┌───────────── hour (0 - 23)
# │ ┌───────────── day of the month (1 - 31)
# │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 */3 * * *' # Runs every 3 hours
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
@@ -78,8 +78,10 @@ jobs:
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
fi
- name: Start Bench with run_id
@@ -89,7 +91,7 @@ jobs:
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
- name: Poll Test Status
id: poll_step

View File

@@ -1,8 +1,9 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
# DevProd
/.github/ @neondatabase/developer-productivity
# DevProd & PerfCorr
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
/test_runner/ @neondatabase/performance-correctness
# Compute
/pgxn/ @neondatabase/compute

172
Cargo.lock generated
View File

@@ -783,6 +783,28 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-extra"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
dependencies = [
"axum",
"axum-core",
"bytes",
"futures-util",
"headers",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"mime",
"pin-project-lite",
"serde",
"tower 0.5.2",
"tower-layer",
"tower-service",
]
[[package]]
name = "azure_core"
version = "0.21.0"
@@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
[[package]]
name = "base64"
version = "0.21.1"
version = "0.21.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
[[package]]
name = "base64"
@@ -1305,6 +1327,7 @@ dependencies = [
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"axum-extra",
"base64 0.13.1",
"bytes",
"camino",
@@ -1316,6 +1339,7 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"notify",
@@ -2297,7 +2321,7 @@ name = "framed-websockets"
version = "0.1.0"
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
dependencies = [
"base64 0.21.1",
"base64 0.21.7",
"bytemuck",
"bytes",
"futures-core",
@@ -2410,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-timer"
version = "3.0.2"
version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
[[package]]
name = "futures-util"
@@ -2515,6 +2539,27 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "governor"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
dependencies = [
"cfg-if",
"dashmap 6.1.0",
"futures-sink",
"futures-timer",
"futures-util",
"no-std-compat",
"nonzero_ext",
"parking_lot 0.12.1",
"portable-atomic",
"quanta",
"rand 0.8.5",
"smallvec",
"spinning_top",
]
[[package]]
name = "group"
version = "0.12.1"
@@ -2632,7 +2677,7 @@ version = "7.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
dependencies = [
"base64 0.21.1",
"base64 0.21.7",
"byteorder",
"crossbeam-channel",
"flate2",
@@ -2640,6 +2685,30 @@ dependencies = [
"num-traits",
]
[[package]]
name = "headers"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
dependencies = [
"base64 0.21.7",
"bytes",
"headers-core",
"http 1.1.0",
"httpdate",
"mime",
"sha1",
]
[[package]]
name = "headers-core"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.5.0"
@@ -2777,12 +2846,9 @@ name = "http-utils"
version = "0.1.0"
dependencies = [
"anyhow",
"backtrace",
"bytes",
"fail",
"flate2",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"metrics",
@@ -3281,9 +3347,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jemalloc_pprof"
version = "0.6.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
dependencies = [
"anyhow",
"libc",
@@ -3367,7 +3433,7 @@ version = "9.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
dependencies = [
"base64 0.21.1",
"base64 0.21.7",
"js-sys",
"pem",
"ring",
@@ -3482,9 +3548,9 @@ dependencies = [
[[package]]
name = "mappings"
version = "0.6.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
dependencies = [
"anyhow",
"libc",
@@ -3725,6 +3791,12 @@ dependencies = [
"memoffset 0.9.0",
]
[[package]]
name = "no-std-compat"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
[[package]]
name = "nom"
version = "7.1.3"
@@ -3735,6 +3807,12 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nonzero_ext"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
[[package]]
name = "notify"
version = "8.0.0"
@@ -4225,6 +4303,7 @@ dependencies = [
"tracing",
"url",
"utils",
"uuid",
"wal_decoder",
"walkdir",
"workspace_hack",
@@ -4307,9 +4386,9 @@ dependencies = [
[[package]]
name = "papaya"
version = "0.1.8"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
dependencies = [
"equivalent",
"seize",
@@ -4437,7 +4516,7 @@ version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
dependencies = [
"base64 0.21.1",
"base64 0.21.7",
"serde",
]
@@ -4591,6 +4670,12 @@ dependencies = [
"never-say-never",
]
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]]
name = "postgres"
version = "0.19.7"
@@ -4755,12 +4840,14 @@ dependencies = [
[[package]]
name = "pprof_util"
version = "0.6.0"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
dependencies = [
"anyhow",
"backtrace",
"flate2",
"inferno 0.12.0",
"num",
"paste",
"prost",
@@ -5052,6 +5139,21 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "quanta"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
dependencies = [
"crossbeam-utils",
"libc",
"once_cell",
"raw-cpuid",
"wasi 0.11.0+wasi-snapshot-preview1",
"web-sys",
"winapi",
]
[[package]]
name = "quick-xml"
version = "0.26.0"
@@ -5182,6 +5284,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "raw-cpuid"
version = "11.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -5752,7 +5863,7 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
dependencies = [
"base64 0.21.1",
"base64 0.21.7",
]
[[package]]
@@ -5761,7 +5872,7 @@ version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
dependencies = [
"base64 0.21.1",
"base64 0.21.7",
"rustls-pki-types",
]
@@ -6000,9 +6111,9 @@ dependencies = [
[[package]]
name = "seize"
version = "0.4.9"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
dependencies = [
"libc",
"windows-sys 0.52.0",
@@ -6395,6 +6506,15 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "spinning_top"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
dependencies = [
"lock_api",
]
[[package]]
name = "spki"
version = "0.6.0"
@@ -6471,6 +6591,7 @@ dependencies = [
"diesel_migrations",
"fail",
"futures",
"governor",
"hex",
"http-utils",
"humantime",
@@ -7285,10 +7406,12 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
dependencies = [
"base64 0.22.1",
"bitflags 2.8.0",
"bytes",
"http 1.1.0",
"http-body 1.0.0",
"mime",
"pin-project-lite",
"tower-layer",
"tower-service",
@@ -7642,7 +7765,6 @@ dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"backtrace",
"bincode",
"byteorder",
"bytes",
@@ -8196,7 +8318,7 @@ dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"base64 0.21.1",
"base64 0.21.7",
"base64ct",
"bytes",
"camino",

View File

@@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
@@ -68,6 +67,7 @@ aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.8.1", features = ["ws"] }
axum-extra = { version = "0.10.0", features = ["typed-header"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.71"
@@ -95,6 +95,7 @@ futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
governor = "0.8"
hashbrown = "0.14"
hashlink = "0.9.1"
hdrhistogram = "7.5.2"
@@ -113,11 +114,10 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
jemalloc_pprof = "0.6"
jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
jsonwebtoken = "9"
lasso = "0.7"
libc = "0.2"
@@ -192,7 +192,7 @@ toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
# This revision uses opentelemetry 0.27. There's no tag for it.
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }

View File

@@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu
#
BUILD_TYPE ?= debug
WITH_SANITIZERS ?= no
PG_CFLAGS = -fsigned-char
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_CFLAGS += -O2 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
@@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling pg_trgm $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"

View File

@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION:?} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION:?}" != "v14" ]; then \
# zstd is available only from PG15
@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
COPY compute/patches/duckdb_v120.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
#########################################################################################
#
# Layer "pg_repack"
@@ -1758,15 +1758,15 @@ ARG TARGETARCH
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regading `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
else\
postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
fi\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
| tar xzf - --strip-components=1 -C.\
@@ -1933,6 +1933,7 @@ RUN apt update && \
locales \
procps \
ca-certificates \
rsyslog \
$VERSION_INSTALLS && \
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1978,6 +1979,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
# Make the libraries we built available
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# rsyslog config permissions
# directory for rsyslogd pid file
RUN mkdir /var/run/rsyslogd && \
chown -R postgres:postgres /var/run/rsyslogd && \
chown -R postgres:postgres /etc/rsyslog.d/
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -29,6 +29,7 @@
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
import 'sql_exporter/lfc_chunk_size.libsonnet',
import 'sql_exporter/lfc_hits.libsonnet',
import 'sql_exporter/lfc_misses.libsonnet',
import 'sql_exporter/lfc_used.libsonnet',

View File

@@ -1 +1,5 @@
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;

View File

@@ -0,0 +1,10 @@
{
metric_name: 'lfc_chunk_size',
type: 'gauge',
help: 'LFC chunk size, measured in 8KiB pages',
key_labels: null,
values: [
'lfc_chunk_size_pages',
],
query: importstr 'sql_exporter/lfc_chunk_size.sql',
}

View File

@@ -0,0 +1 @@
SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';

View File

@@ -1,10 +1,20 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
tup_updated AS updated, tup_deleted AS deleted, datname
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2
AND datname <> 'postgres'
AND NOT datistemplate
ORDER BY oid
LIMIT 10
);

View File

@@ -39,6 +39,10 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -54,7 +58,7 @@ files:
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -69,6 +73,12 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -132,6 +142,12 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN chmod 0666 /var/log/
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -39,6 +39,10 @@ commands:
user: nobody
sysvInitAction: respawn
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
- name: rsyslogd
user: postgres
sysvInitAction: respawn
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
@@ -54,7 +58,7 @@ files:
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -69,6 +73,12 @@ files:
}
memory {}
}
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
# compute_ctl will rewrite this file with the actual configuration, if needed.
- filename: compute_rsyslog.conf
content: |
*.* /dev/null
$IncludeConfig /etc/rsyslog.d/*.conf
build: |
# Build cgroup-tools
#
@@ -128,6 +138,11 @@ merge: |
RUN set -e \
&& chmod 0644 /etc/cgconfig.conf
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN chmod 0666 /var/log/
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/

View File

@@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true
aws-smithy-types.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
axum-extra.workspace = true
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -25,6 +26,7 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true

View File

@@ -33,39 +33,27 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
use std::collections::HashMap;
use std::ffi::OsString;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc};
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Parser;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal,
};
use compute_tools::configurator::launch_configurator;
use compute_tools::disk_quota::set_disk_quota;
use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::server::Server;
use compute_tools::logger::*;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{Resource, setrlimit};
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;
use tracing::{error, info, warn};
use tracing::{error, info};
use url::Url;
use utils::failpoint_support;
@@ -164,29 +152,41 @@ fn main() -> Result<()> {
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let cli_spec = try_spec_from_cli(&cli)?;
let cli_spec = try_spec_from_cli(&cli)?;
let compute = wait_spec(build_tag, &cli, cli_spec)?;
let compute_node = ComputeNode::new(
ComputeNodeParams {
compute_id: cli.compute_id,
connstr,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
ext_remote_storage: cli.remote_ext_config.clone(),
resize_swap_on_bind: cli.resize_swap_on_bind,
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
#[cfg(target_os = "linux")]
filecache_connstr: cli.filecache_connstr,
#[cfg(target_os = "linux")]
cgroup: cli.cgroup,
#[cfg(target_os = "linux")]
vm_monitor_addr: cli.vm_monitor_addr,
build_tag,
start_postgres(&cli, compute)?
live_config_allowed: cli_spec.live_config_allowed,
},
cli_spec.spec,
cli_spec.compute_ctl_config,
)?;
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
let exit_code = compute_node.run()?;
scenario.teardown();
deinit_and_exit(wait_pg_result);
deinit_and_exit(exit_code);
}
async fn init() -> Result<String> {
@@ -207,56 +207,6 @@ async fn init() -> Result<String> {
Ok(build_tag)
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
.extract(&startup_tracing_carrier)
.attach();
info!("startup tracing context attached");
Some(guard)
} else {
None
}
}
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
// First, try to get cluster spec from the cli argument
if let Some(ref spec_json) = cli.spec_json {
@@ -307,357 +257,7 @@ struct CliSpecParams {
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
cli: &Cli,
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
info!("new pspec.spec: {:?}", pspec.spec);
new_state.pspec = Some(pspec);
spec_set = true;
} else {
spec_set = false;
}
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let conn_conf = postgres::config::Config::from_str(connstr.as_str())
.context("cannot build postgres config from connstr")?;
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
.context("cannot build tokio postgres config from connstr")?;
let compute_node = ComputeNode {
compute_id: cli.compute_id.clone(),
connstr,
conn_conf,
tokio_conn_conf,
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage: cli.remote_ext_config.clone(),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
let compute = Arc::new(compute_node);
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
// Launch the external HTTP server first, so that we can serve control plane
// requests while configuration is still in progress.
Server::External(cli.external_http_port).launch(&compute);
// The internal HTTP server could be launched later, but there isn't much
// sense in waiting.
Server::Internal(cli.internal_http_port).launch(&compute);
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::ConfigurationPending {
info!("got spec, continue configuration");
// Spec is already set by the http server handler.
break;
}
}
// Record for how long we slept waiting for the spec.
let now = Utc::now();
state.metrics.wait_for_spec_ms = now
.signed_duration_since(state.start_time)
.to_std()
.unwrap()
.as_millis() as u64;
// Reset start time, so that the total startup time that is calculated later will
// not include the time that we waited for the spec.
state.start_time = now;
}
launch_lsn_lease_bg_task_for_static(&compute);
Ok(compute)
}
fn start_postgres(
cli: &Cli,
compute: Arc<ComputeNode>,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
// Create a tracing span for the startup operation.
//
// We could otherwise just annotate the function with #[instrument], but if
// we're being configured from a /configure HTTP request, we want the
// startup to be considered part of the /configure request.
let _this_entered = {
// Temporarily enter the /configure request's span, so that the new span
// becomes its child.
let _parent_entered = state.startup_span.take().map(|p| p.entered());
tracing::info_span!("start_postgres")
}
.entered();
state.set_status(ComputeStatus::Init, &compute.state_changed);
info!(
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
// before we release the mutex, fetch some parameters for later.
let &ComputeSpec {
swap_size_bytes,
disk_quota_bytes,
#[cfg(target_os = "linux")]
disable_lfc_resizing,
..
} = &state.pspec.as_ref().unwrap().spec;
drop(state);
// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let mut prestartup_failed = false;
let mut delay_exit = false;
// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Set disk quota if the compute spec says so
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
(disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
{
match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");
}
Err(err) => {
let err = err.context("failed to set disk quota");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Start Postgres
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
delay_exit = true;
None
}
};
} else {
warn!("skipping postgres startup because pre-startup step failed");
}
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
// don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
None
} else {
Some(cli.filecache_connstr.clone())
};
let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
let vm_monitor = tokio::spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: Some(cli.cgroup.clone()),
pgconnstr,
addr: cli.vm_monitor_addr.clone(),
})),
token.clone(),
));
Some(vm_monitor)
} else {
None
};
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);
// Process has exited. Wait for the log collecting task to finish.
let _ = tokio::runtime::Handle::current()
.block_on(logs_handle)
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();
}
}
}
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

View File

@@ -58,14 +58,14 @@ pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
let pgbin = &compute.pgbin;
let pgbin = &compute.params.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
// Replace the DB in the connection string and disable it to parts.
// This is the only option to handle DBs with special characters.
let conf =
postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
.map_err(|_| SchemaDumpError::Unexpected)?;
let host = conf
.get_hosts()
.first()

File diff suppressed because it is too large Load Diff

View File

@@ -1,12 +1,16 @@
use anyhow::Result;
use std::fmt::Write as FmtWrite;
use std::fs::{File, OpenOptions};
use std::io;
use std::io::Write;
use std::io::prelude::*;
use std::path::Path;
use anyhow::Result;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -55,10 +59,20 @@ pub fn write_postgres_conf(
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(
"safekeepers_connstrings is not zero, gen: {:?}",
spec.safekeepers_generation
);
// If generation is given, prepend sk list with g#number:
if let Some(generation) = spec.safekeepers_generation {
write!(neon_safekeepers_value, "g#{}:", generation)?;
}
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
writeln!(
file,
"neon.safekeepers={}",
escape_conf_value(&spec.safekeeper_connstrings.join(","))
escape_conf_value(&neon_safekeepers_value)
)?;
}
if let Some(s) = &spec.tenant_id {
@@ -126,6 +140,54 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}
// If audit logging is enabled, configure pgaudit.
//
// Note, that this is called after the settings from spec are written.
// This way we always override the settings from the spec
// and don't allow the user or the control plane admin to change them.
if let ComputeAudit::Hipaa = spec.audit_log_level {
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
writeln!(file, "pgaudit.log='all'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;
// Set log rotation to 5 minutes
// TODO: tune this after performance testing
writeln!(file, "pgaudit.log_rotation_age=5")?;
// Add audit shared_preload_libraries, if they are not present.
//
// The caller who sets the flag is responsible for ensuring that the necessary
// shared_preload_libraries are present in the compute image,
// otherwise the compute start will fail.
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
let mut extra_shared_preload_libraries = String::new();
if !libs.contains("pgaudit") {
extra_shared_preload_libraries.push_str(",pgaudit");
}
if !libs.contains("pgauditlogtofile") {
extra_shared_preload_libraries.push_str(",pgauditlogtofile");
}
writeln!(
file,
"shared_preload_libraries='{}{}'",
libs, extra_shared_preload_libraries
)?;
} else {
// Typically, this should be unreacheable,
// because we always set at least some shared_preload_libraries in the spec
// but let's handle it explicitly anyway.
writeln!(
file,
"shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
)?;
}
writeln!(file, "# Managed by compute_ctl audit settings: end")?;
}
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
if spec.drop_subscriptions_before_start {

View File

@@ -0,0 +1,10 @@
# Load imfile module to read log files
module(load="imfile")
# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
global(workDirectory="/var/log")
# Forward logs to remote syslog server
*.* @@{remote_endpoint}

View File

@@ -253,27 +253,31 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}
// Do request to extension storage proxy, i.e.
// Do request to extension storage proxy, e.g.,
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HHTP GET
// and return the response body as bytes
//
// using HTTP GET and return the response body as bytes.
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
let filename = Path::new(ext_path)
.file_name()
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
.to_str()
.unwrap_or("unknown")
.to_string();
info!("Download extension {} from uri {}", ext_path, uri);
info!("Downloading extension file '{}' from uri {}", filename, uri);
match do_extension_server_request(&uri).await {
Ok(resp) => {
info!("Successfully downloaded remote extension data {}", ext_path);
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&StatusCode::OK.to_string()])
.with_label_values(&[&StatusCode::OK.to_string(), &filename])
.inc();
Ok(resp)
}
Err((msg, status)) => {
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&status])
.with_label_values(&[&status, &filename])
.inc();
bail!(msg);
}

View File

@@ -1,7 +1,9 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) mod request_id;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;
pub(crate) use request_id::RequestId;

View File

@@ -0,0 +1,86 @@
use std::{
fmt::Display,
ops::{Deref, DerefMut},
};
use axum::{extract::FromRequestParts, response::IntoResponse};
use http::{StatusCode, request::Parts};
use crate::http::{JsonResponse, headers::X_REQUEST_ID};
/// Extract the request ID from the `X-Request-Id` header.
#[derive(Debug, Clone, Default)]
pub(crate) struct RequestId(pub String);
#[derive(Debug)]
/// Rejection used for [`RequestId`].
///
/// Contains one variant for each way the [`RequestId`] extractor can
/// fail.
pub(crate) enum RequestIdRejection {
/// The request is missing the header.
MissingRequestId,
/// The value of the header is invalid UTF-8.
InvalidUtf8,
}
impl RequestIdRejection {
pub fn status(&self) -> StatusCode {
match self {
RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
}
}
pub fn message(&self) -> String {
match self {
RequestIdRejection::MissingRequestId => "request ID is missing",
RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
}
.to_string()
}
}
impl IntoResponse for RequestIdRejection {
fn into_response(self) -> axum::response::Response {
JsonResponse::error(self.status(), self.message())
}
}
impl<S> FromRequestParts<S> for RequestId
where
S: Send + Sync,
{
type Rejection = RequestIdRejection;
async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
match parts.headers.get(X_REQUEST_ID) {
Some(value) => match value.to_str() {
Ok(request_id) => Ok(Self(request_id.to_string())),
Err(_) => Err(RequestIdRejection::InvalidUtf8),
},
None => Err(RequestIdRejection::MissingRequestId),
}
}
}
impl Deref for RequestId {
type Target = String;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for RequestId {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl Display for RequestId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}

View File

@@ -0,0 +1,2 @@
/// Constant for `X-Request-Id` header.
pub const X_REQUEST_ID: &str = "x-request-id";

View File

@@ -0,0 +1,145 @@
use std::{collections::HashSet, net::SocketAddr};
use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::warn;
use crate::http::{JsonResponse, extract::RequestId};
#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
jwks: JwkSet,
validation: Validation,
}
impl Authorize {
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
let mut validation = Validation::new(Algorithm::EdDSA);
// Nothing is currently required
validation.required_spec_claims = HashSet::new();
validation.validate_exp = true;
// Unused by the control plane
validation.validate_aud = false;
// Unused by the control plane
validation.validate_nbf = false;
Self {
compute_id,
jwks,
validation,
}
}
}
impl AsyncAuthorizeRequest<Body> for Authorize {
type RequestBody = Body;
type ResponseBody = Body;
type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
let compute_id = self.compute_id.clone();
let jwks = self.jwks.clone();
let validation = self.validation.clone();
Box::pin(async move {
let request_id = request.extract_parts::<RequestId>().await.unwrap();
// TODO: Remove this check after a successful rollout
if jwks.keys.is_empty() {
warn!(%request_id, "Authorization has not been configured");
return Ok(request);
}
let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();
// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
return Ok(request);
}
let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|_| {
JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
})?;
let data = match Self::verify(&jwks, bearer.token(), &validation) {
Ok(claims) => claims,
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
};
if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid claims in authorization token",
));
}
// Make claims available to any subsequent middleware or request
// handlers
request.extensions_mut().insert(data.claims);
Ok(request)
})
}
}
impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
debug_assert!(!jwks.keys.is_empty());
for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
};
match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);
continue;
}
}
}
Err(anyhow!("Failed to verify authorization token"))
}
}

View File

@@ -0,0 +1 @@
pub(in crate::http) mod authorize;

View File

@@ -7,6 +7,8 @@ use serde::Serialize;
use tracing::error;
mod extract;
mod headers;
mod middleware;
mod routes;
pub mod server;

View File

@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.live_config_allowed {
if !compute.params.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),

View File

@@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
params: Query<ExtensionServerParams>,
ext_server_params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.ext_remote_storage.is_none() {
if compute.params.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
@@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension(
remote_extensions.get_ext(
&filename,
params.is_library,
&compute.build_tag,
&compute.pgversion,
ext_server_params.is_library,
&compute.params.build_tag,
&compute.params.pgversion,
)
};

View File

@@ -10,48 +10,58 @@ use axum::middleware::{self, Next};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use http::StatusCode;
use jsonwebtoken::jwk::JwkSet;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::request_id::PropagateRequestIdLayer;
use tower_http::trace::TraceLayer;
use tracing::{Span, debug, error, info};
use tower_http::{
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
};
use tracing::{Span, error, info};
use uuid::Uuid;
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
use super::{
headers::X_REQUEST_ID,
middleware::authorize::Authorize,
routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, insights, metrics, metrics_json, status, terminate,
},
};
use crate::compute::ComputeNode;
const X_REQUEST_ID: &str = "x-request-id";
/// `compute_ctl` has two servers: internal and external. The internal server
/// binds to the loopback interface and handles communication from clients on
/// the compute. The external server is what receives communication from the
/// control plane, the metrics scraper, etc. We make the distinction because
/// certain routes in `compute_ctl` only need to be exposed to local processes
/// like Postgres via the neon extension and local_proxy.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Debug)]
pub enum Server {
Internal(u16),
External(u16),
Internal {
port: u16,
},
External {
port: u16,
jwks: JwkSet,
compute_id: String,
},
}
impl Display for Server {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Server::Internal(_) => f.write_str("internal"),
Server::External(_) => f.write_str("external"),
Server::Internal { .. } => f.write_str("internal"),
Server::External { .. } => f.write_str("external"),
}
}
}
impl From<Server> for Router<Arc<ComputeNode>> {
fn from(server: Server) -> Self {
impl From<&Server> for Router<Arc<ComputeNode>> {
fn from(server: &Server) -> Self {
let mut router = Router::<Arc<ComputeNode>>::new();
router = match server {
Server::Internal(_) => {
Server::Internal { .. } => {
router = router
.route(
"/extension_server/{*filename}",
@@ -69,59 +79,71 @@ impl From<Server> for Router<Arc<ComputeNode>> {
router
}
Server::External(_) => router
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate)),
Server::External {
jwks, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
compute_id.clone(),
jwks.clone(),
)));
router
.merge(unauthenticated_router)
.merge(authenticated_router)
}
};
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
ServiceBuilder::new()
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
router
.fallback(Server::handle_404)
.method_not_allowed_fallback(Server::handle_405)
.layer(
ServiceBuilder::new()
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
info!(%request_id, "{} {}", request.method(), request.uri());
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
);
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
}
}
@@ -145,15 +167,15 @@ impl Server {
match self {
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
// allow binding to localhost
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
}
}
fn port(self) -> u16 {
fn port(&self) -> u16 {
match self {
Server::Internal(port) => port,
Server::External(port) => port,
Server::Internal { port, .. } => *port,
Server::External { port, .. } => *port,
}
}
@@ -180,7 +202,9 @@ impl Server {
);
}
let router = Router::from(self).with_state(compute);
let router = Router::from(&self)
.with_state(compute)
.into_make_service_with_connect_info::<SocketAddr>();
if let Err(e) = axum::serve(listener, router).await {
error!("compute_ctl {} HTTP server error: {}", self, e);

View File

@@ -21,6 +21,7 @@ mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod rsyslog;
pub mod spec;
mod spec_apply;
pub mod swap;

View File

@@ -1,3 +1,5 @@
use std::collections::HashMap;
use tracing::info;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
@@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
pub fn inlinify(s: &str) -> String {
s.replace('\n', "\u{200B}")
}
pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
info!("got startup tracing context from env variables");
Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
} else {
None
}
}

View File

@@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
register_int_counter_vec!(
"compute_ctl_remote_ext_requests_total",
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
// Do not use any labels like extension name yet.
// We can add them later if needed.
&["http_status"]
&["http_status", "filename"]
)
.expect("failed to define a metric")
});

View File

@@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.clone();
let connstr = compute.params.connstr.clone();
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
// During startup and configuration we connect to every Postgres database,

View File

@@ -186,15 +186,40 @@ impl DatabaseExt for Database {
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
fn pg_quote_dollar(&self) -> (String, String);
}
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
/// clause, use `escape_literal()` instead.
fn pg_quote(&self) -> String {
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
format!("\"{}\"", self.replace('"', "\"\""))
}
/// This helper is intended to be used for dollar-escaping strings for usage
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
fn pg_quote_dollar(&self) -> (String, String) {
let mut tag: String = "".to_string();
let mut outer_tag = "x".to_string();
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}
let escaped = format!("${tag}${self}${tag}$");
(escaped, outer_tag)
}
}
@@ -226,10 +251,13 @@ pub async fn get_existing_dbs_async(
// invalid state. See:
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
let rowstream = client
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
// because the latter automatically wraps the result in double quotes,
// if the role name contains special characters.
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
datdba::regrole::text AS owner,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
FROM

View File

@@ -0,0 +1,77 @@
use std::process::Command;
use std::{fs::OpenOptions, io::Write};
use anyhow::{Context, Result};
use tracing::info;
fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
.arg("rsyslogd")
.output()
.expect("Failed to execute pgrep");
if !output.stdout.is_empty() {
let pid = std::str::from_utf8(&output.stdout)
.expect("Invalid UTF-8 in process output")
.trim()
.to_string();
Some(pid)
} else {
None
}
}
// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
// because it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
.output()
.context("Failed to stop rsyslogd")?;
Ok(())
}
pub fn configure_audit_rsyslog(
log_directory: &str,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
remote_endpoint = remote_endpoint
);
info!("rsyslog config_content: {}", config_content);
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(rsyslog_conf_path)?;
file.write_all(config_content.as_bytes())?;
info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
rsyslog_conf_path
);
// start the service, using the configuration
restart_rsyslog()?;
Ok(())
}

View File

@@ -6,21 +6,22 @@ use std::sync::Arc;
use anyhow::{Context, Result};
use compute_api::responses::ComputeStatus;
use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
use futures::future::join_all;
use tokio::sync::RwLock;
use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
use crate::compute::{ComputeNode, ComputeState};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
};
@@ -187,7 +188,7 @@ impl ComputeNode {
}
for phase in [
CreateSuperUser,
CreateNeonSuperuser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -277,6 +278,19 @@ impl ComputeNode {
phases.push(FinalizeDropLogicalSubscriptions);
}
// Keep DisablePostgresDBPgAudit phase at the end,
// so that all config operations are audit logged.
match spec.audit_log_level
{
ComputeAudit::Hipaa => {
phases.push(CreatePgauditExtension);
phases.push(CreatePgauditlogtofileExtension);
phases.push(DisablePostgresDBPgAudit);
}
ComputeAudit::Log => { /* not implemented yet */ }
ComputeAudit::Disabled => {}
}
for phase in phases {
debug!("Applying phase {:?}", &phase);
apply_operations(
@@ -455,7 +469,7 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreateSuperUser,
CreateNeonSuperuser,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -463,6 +477,9 @@ pub enum ApplySpecPhase {
CreateAndAlterDatabases,
CreateSchemaNeon,
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
CreatePgauditExtension,
CreatePgauditlogtofileExtension,
DisablePostgresDBPgAudit,
HandleOtherExtensions,
HandleNeonExtension,
CreateAvailabilityCheck,
@@ -579,14 +596,10 @@ async fn get_operations<'a>(
apply_spec_phase: &'a ApplySpecPhase,
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
match apply_spec_phase {
ApplySpecPhase::CreateSuperUser => {
let query = construct_superuser_query(spec);
Ok(Box::new(once(Operation {
query,
comment: None,
})))
}
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
query: include_str!("sql/create_neon_superuser.sql").to_string(),
comment: None,
}))),
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -720,14 +733,15 @@ async fn get_operations<'a>(
// We do not check whether the DB exists or not,
// Postgres will take care of it for us
"delete_db" => {
let (db_name, outer_tag) = op.name.pg_quote_dollar();
// In Postgres we can't drop a database if it is a template.
// So we need to unset the template flag first, but it could
// be a retry, so we could've already dropped the database.
// Check that database exists first to make it idempotent.
let unset_template_query: String = format!(
include_str!("sql/unset_template_for_drop_dbs.sql"),
datname_str = escape_literal(&op.name),
datname = &op.name.pg_quote()
datname = db_name,
outer_tag = outer_tag,
);
// Use FORCE to drop database even if there are active connections.
@@ -834,6 +848,8 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
query: format!(
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
db.name.pg_quote()
@@ -893,9 +909,11 @@ async fn get_operations<'a>(
PerDatabasePhase::DropLogicalSubscriptions => {
match &db {
DB::UserDB(db) => {
let (db_name, outer_tag) = db.name.pg_quote_dollar();
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscriptions.sql"),
datname_str = escape_literal(&db.name),
datname_str = db_name,
outer_tag = outer_tag,
);
let operations = vec![Operation {
@@ -934,6 +952,7 @@ async fn get_operations<'a>(
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
DB::UserDB(db) => db.owner.pg_quote(),
};
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
Some(vec![
// This will reassign all dependent objects to the db owner
@@ -948,7 +967,9 @@ async fn get_operations<'a>(
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
role_name = quoted,
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
),
comment: None,
},
@@ -973,12 +994,14 @@ async fn get_operations<'a>(
DB::SystemDB => return Ok(Box::new(empty())),
DB::UserDB(db) => db,
};
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
let operations = vec![
Operation {
query: format!(
include_str!("sql/set_public_schema_owner.sql"),
db_owner = db.owner.pg_quote()
db_owner = db_owner,
outer_tag = outer_tag,
),
comment: None,
},
@@ -1098,6 +1121,25 @@ async fn get_operations<'a>(
}
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
// Postgres is neon system database used by monitors
// and compute_ctl tuning functions and thus generates a lot of noise.
// We do not consider data stored in this database as sensitive.
ApplySpecPhase::DisablePostgresDBPgAudit => {
let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
Ok(Box::new(once(Operation {
query: query.to_string(),
comment: Some(query.to_string()),
})))
}
ApplySpecPhase::HandleNeonExtension => {
let operations = vec![
Operation {

View File

@@ -0,0 +1,8 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -1,4 +1,4 @@
DO $$
DO ${outer_tag}$
DECLARE
subname TEXT;
BEGIN
@@ -9,4 +9,4 @@ BEGIN
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
$$;
${outer_tag}$;

View File

@@ -1,6 +1,6 @@
SET SESSION ROLE neon_superuser;
DO $$
DO ${outer_tag}$
DECLARE
schema TEXT;
revoke_query TEXT;
@@ -16,13 +16,15 @@ BEGIN
WHERE schema_name IN ('public')
LOOP
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY neon_superuser;',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
{role_name}
);
EXECUTE revoke_query;
END LOOP;
END;
$$;
${outer_tag}$;
RESET ROLE;

View File

@@ -1,5 +1,4 @@
DO
$$
DO ${outer_tag}$
DECLARE
schema_owner TEXT;
BEGIN
@@ -16,8 +15,8 @@ $$
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
ALTER SCHEMA public OWNER TO {db_owner};
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
END IF;
END IF;
END
$$;
${outer_tag}$;

View File

@@ -1,12 +1,12 @@
DO $$
DO ${outer_tag}$
BEGIN
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname = {datname_str}
WHERE datname = {datname}
)
THEN
ALTER DATABASE {datname} is_template false;
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
END IF;
END
$$;
${outer_tag}$;

View File

@@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
#[test]
fn ident_pg_quote_dollar() {
let test_cases = vec![
("name", ("$$name$$", "x")),
("name$$", ("$x$name$$$x$", "xx")),
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
];
for (input, expected) in test_cases {
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
assert_eq!(escaped, expected.0);
assert_eq!(tag, expected.1);
}
}
#[test]
fn generic_options_search() {
let generic_options: GenericOptions = Some(vec![

View File

@@ -40,6 +40,7 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -596,7 +597,15 @@ struct EndpointStartCmdArgs {
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
#[clap(long)]
#[clap(
long,
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
)]
safekeepers_generation: Option<u32>,
#[clap(
long,
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
)]
safekeepers: Option<String>,
#[clap(
@@ -617,9 +626,9 @@ struct EndpointStartCmdArgs {
)]
allow_multiple: bool,
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
#[arg(default_value = "90s")]
start_timeout: Duration,
}
#[derive(clap::Args)]
@@ -1350,6 +1359,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
let pageserver_id = args.endpoint_pageserver_id;
let remote_ext_config = &args.remote_ext_config;
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
@@ -1425,11 +1435,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint
.start(
&auth_token,
safekeepers_generation,
safekeepers,
pageservers,
remote_ext_config.as_ref(),
stripe_size.0 as usize,
args.create_test_user,
args.start_timeout,
)
.await?;
}

View File

@@ -42,17 +42,19 @@ use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use anyhow::{Context, Result, anyhow, bail};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{
Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
};
use nix::sys::signal::{Signal, kill};
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
use tracing::debug;
use url::Host;
@@ -576,14 +578,17 @@ impl Endpoint {
Ok(safekeeper_connstrings)
}
#[allow(clippy::too_many_arguments)]
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers_generation: Option<SafekeeperGeneration>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
start_timeout: Duration,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -655,6 +660,7 @@ impl Endpoint {
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
@@ -663,6 +669,7 @@ impl Endpoint {
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
audit_log_level: ComputeAudit::Disabled,
};
// this strange code is needed to support respec() in tests
@@ -770,17 +777,18 @@ impl Endpoint {
std::fs::write(pidfile_path, pid.to_string())?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
let start_at = Instant::now();
loop {
attempt += 1;
match self.get_status().await {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
if Instant::now().duration_since(start_at) > start_timeout {
bail!(
"compute startup timed out {:?}; still in Init state",
start_timeout
);
}
// keep retrying
}
@@ -807,8 +815,11 @@ impl Endpoint {
}
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
if Instant::now().duration_since(start_at) > start_timeout {
return Err(e).context(format!(
"timed out {:?} waiting to connect to compute_ctl HTTP",
start_timeout,
));
}
}
}

View File

@@ -6,8 +6,11 @@ generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
}
if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then
echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined
echo "${OLD_COMPUTE_TAG}"
echo "${NEW_COMPUTE_TAG}"
echo "${TEST_EXTENSIONS_TAG}"
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
@@ -82,7 +85,7 @@ EXTENSIONS='[
{"extname": "pg_repack", "extdir": "pg_repack-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
@@ -90,7 +93,7 @@ create_extensions "${EXTNAMES}"
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
docker compose --profile test-extensions down
COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
wait_for_ready
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"

View File

@@ -0,0 +1,201 @@
# Sparse Keyspace for Relation Directories
## Summary
This is an RFC describing a new storage strategy for storing relation directories.
## Motivation
Postgres maintains a directory structure for databases and relations. In Neon, we store these information
by serializing the directory data in a single key (see `pgdatadir_mapping.rs`).
```rust
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
```
We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.
```rust
#[derive(Debug, Serialize, Deserialize, Default)]
pub(crate) struct RelDirectory {
// Set of relations that exist. (relfilenode, forknum)
//
// TODO: Store it as a btree or radix tree or something else that spans multiple
// key-value pairs, if you have a lot of relations
pub(crate) rels: HashSet<(Oid, u8)>,
}
```
The current codebase has the following three access patterns for the relation directory.
1. Check if a relation exists.
2. List all relations.
3. Create/drop a relation.
For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get
and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.
If we have 100k relations in a database, we would have a 100k-large hash set. Then, every
relation created and dropped would have deserialized and serialized this 100k-large hash set. This makes the
relation create/drop process to be quadratic. When we check if a relation exists in the ingestion path,
we would have to deserialize this super big 100k-large key before checking if a single relation exists.
In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how
to seamlessly migrate users to use the new keyspace.
The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).
## Key Mapping
We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in
[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`),
into the key.
```plain
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
```
Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be
implemented as follows.
1. Check if a relation exists: check if the key maps to "exists".
2. List all relations: scan the sprase keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
be removed during image layer generation upon compaction.
Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum.
The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
## Changes to Sparse Keyspace
Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
to be updated accordingly to accommodate such "inherited sparse keys". This is done in
[PR#10313](https://github.com/neondatabase/neon/pull/10313).
## Coexistence of the Old and New Keyspaces
Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read
path needs to combine the data from both keyspaces.
Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the
new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration
process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the
migration can happen seamlessly and imposes no potential downtime for the user.
With the coexistence assumption, the 3 reldir operations will be implemented as follows:
1. Check if a relation exists
- Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly
return it to the user.
- Otherwise, deserialize the old reldir key and get the result.
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
Combine them to obtain the final result.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace.
- We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
- For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
- For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace.
- The delete tombstone will be removed during image layer generation upon compaction.
This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total
amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction.
There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal
with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives
us `O(1)` complexity after fully opt-in the sparse keyspace.
The process also implies that a relation will only exists either in the old reldir key or in the new sparse keyspace. It is not possible
to have a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.
We will introduce a config item and an index_part record to record the current status of the migration process.
- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.
If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update
`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still
read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only:
once v2 is enabled, the user cannot go back to v1.
## Next Steps
### Full Migration
This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this
code path, we must ensure the timeline has no old reldir data.
We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while
copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
process discovers the following keys at this LSN.
```plain
db1/reldir_key -> (table 1, table 2, table 3)
...db1 rel keys
db2/reldir_key -> (table 4, table 5, table 6)
...db2 rel keys
sparse_reldir_db2_table7 -> exists
sparse_reldir_db1_table8 -> deleted
```
It will generate the following keys:
```plain
db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
...db1 rel keys
db2/reldir_key -> ()
...db2 rel keys
-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
sparse_reldir_db1_table1 -> exists
sparse_reldir_db1_table2 -> exists
sparse_reldir_db1_table3 -> exists
sparse_reldir_db2_table4 -> exists
sparse_reldir_db2_table5 -> exists
sparse_reldir_db2_table6 -> exists
sparse_reldir_db2_table7 -> exists
-- end image layer for the sparse keyspace at sparse_reldir_prefix+1
# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there
# are no correctness issue.
```
We must verify that no pending modifications to the old reldir exists in the delta/image layers above the gc-horizon before
we start this process (We can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
above the gc-horizon). Otherwise, it will violate the property that "a relation will only exists either in the old reldir key or
in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to
`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we
don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers.
The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.
### Consolidate Relation Size Keys
We have relsize at the end of all relation nodes.
```plain
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
```
This means that computing logical size requires us to do several single-key gets across the keyspace,
potentially requiring downloading many layer files. We could consolidate them into a single
keyspace, improving logical size calculation performance.
### Migrate DBDir Keys
We assume the number of databases created by the users will be small, and therefore, the current way
of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into
the sparse keyspace to support large amount of databases.

View File

@@ -134,8 +134,10 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Debug, Deserialize, Serialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
/// Set of JSON web keys that the compute can use to authenticate
/// communication from the control plane.
pub jwks: JwkSet,
}

View File

@@ -101,6 +101,17 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
/// 2) If walproposer wants to update list of safekeepers to connect to
/// taking them from some safekeeper mconf, it should check what value
/// is newer by comparing the generation.
///
/// Note: it could be SafekeeperGeneration, but this needs linking
/// compute_ctl with postgres_ffi.
#[serde(default)]
pub safekeepers_generation: Option<u32>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
@@ -144,6 +155,16 @@ pub struct ComputeSpec {
/// over the same replication content from publisher.
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
#[serde(default)]
pub audit_log_level: ComputeAudit,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -251,6 +272,17 @@ pub enum ComputeMode {
Replica,
}
/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
Log,
Hipaa,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
pub struct Cluster {
pub cluster_id: Option<String>,

View File

@@ -6,11 +6,8 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
backtrace.workspace = true
bytes.workspace = true
inferno.workspace = true
fail.workspace = true
flate2.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true

View File

@@ -3,8 +3,6 @@ use std::io::Write as _;
use std::str::FromStr;
use std::time::Duration;
use ::pprof::ProfilerGuardBuilder;
use ::pprof::protos::Message as _;
use anyhow::{Context, anyhow};
use bytes::{Bytes, BytesMut};
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
@@ -12,7 +10,8 @@ use hyper::http::HeaderValue;
use hyper::{Body, Method, Request, Response};
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
use once_cell::sync::Lazy;
use regex::Regex;
use pprof::ProfilerGuardBuilder;
use pprof::protos::Message as _;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{Mutex, Notify, mpsc};
@@ -22,7 +21,6 @@ use tracing::{Instrument, debug, info, info_span, warn};
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::error::{ApiError, api_error_handler, route_error_handler};
use crate::pprof;
use crate::request::{get_query_param, parse_query_param};
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -449,20 +447,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
// Functions and mappings to strip when symbolizing pprof profiles. If true,
// also remove child frames.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
// Obtain profiler handle.
let mut prof_ctl = jemalloc_pprof::PROF_CTL
.as_ref()
@@ -495,45 +479,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
}
Format::Pprof => {
let data = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
pprof::encode(&profile)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "application/octet-stream")
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
.body(Body::from(data))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();
pprof::flamegraph(profile, &mut opts)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(body))
.body(Body::from(svg))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}

View File

@@ -2,7 +2,6 @@ pub mod endpoint;
pub mod error;
pub mod failpoints;
pub mod json;
pub mod pprof;
pub mod request;
extern crate hyper0 as hyper;

View File

@@ -1,238 +0,0 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::ffi::c_void;
use std::io::Write as _;
use anyhow::bail;
use flate2::Compression;
use flate2::write::{GzDecoder, GzEncoder};
use itertools::Itertools as _;
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use regex::Regex;
/// Decodes a gzip-compressed Protobuf-encoded pprof profile.
pub fn decode(bytes: &[u8]) -> anyhow::Result<Profile> {
let mut gz = GzDecoder::new(Vec::new());
gz.write_all(bytes)?;
Ok(Profile::parse_from_bytes(&gz.finish()?)?)
}
/// Encodes a pprof profile as gzip-compressed Protobuf.
pub fn encode(profile: &Profile) -> anyhow::Result<Vec<u8>> {
let mut gz = GzEncoder::new(Vec::new(), Compression::default());
profile.write_to_writer(&mut gz)?;
Ok(gz.finish()?)
}
/// Symbolizes a pprof profile using the current binary.
pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
if !profile.function.is_empty() {
return Ok(profile); // already symbolized
}
// Collect function names.
let mut functions: HashMap<String, Function> = HashMap::new();
let mut strings: HashMap<String, i64> = profile
.string_table
.into_iter()
.enumerate()
.map(|(i, s)| (s, i as i64))
.collect();
// Helper to look up or register a string.
let mut string_id = |s: &str| -> i64 {
// Don't use .entry() to avoid unnecessary allocations.
if let Some(id) = strings.get(s) {
return *id;
}
let id = strings.len() as i64;
strings.insert(s.to_string(), id);
id
};
for loc in &mut profile.location {
if !loc.line.is_empty() {
continue;
}
// Resolve the line and function for each location.
backtrace::resolve(loc.address as *mut c_void, |symbol| {
let Some(symbol_name) = symbol.name() else {
return;
};
let function_name = format!("{symbol_name:#}");
let functions_len = functions.len();
let function_id = functions
.entry(function_name)
.or_insert_with_key(|function_name| {
let function_id = functions_len as u64 + 1;
let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
let filename = symbol
.filename()
.map(|path| path.to_string_lossy())
.unwrap_or(Cow::Borrowed(""));
Function {
id: function_id,
name: string_id(function_name),
system_name: string_id(&system_name),
filename: string_id(&filename),
..Default::default()
}
})
.id;
loc.line.push(Line {
function_id,
line: symbol.lineno().unwrap_or(0) as i64,
..Default::default()
});
});
}
// Store the resolved functions, and mark the mapping as resolved.
profile.function = functions.into_values().sorted_by_key(|f| f.id).collect();
profile.string_table = strings
.into_iter()
.sorted_by_key(|(_, i)| *i)
.map(|(s, _)| s)
.collect();
for mapping in &mut profile.mapping {
mapping.has_functions = true;
mapping.has_filenames = true;
}
Ok(profile)
}
/// Strips locations (stack frames) matching the given mappings (substring) or function names
/// (regex). The function bool specifies whether child frames should be stripped as well.
///
/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all
/// string references.
pub fn strip_locations(
mut profile: Profile,
mappings: &[&str],
functions: &[(Regex, bool)],
) -> Profile {
// Strip mappings.
let mut strip_mappings: HashSet<u64> = HashSet::new();
profile.mapping.retain(|mapping| {
let Some(name) = profile.string_table.get(mapping.filename as usize) else {
return true;
};
if mappings.iter().any(|substr| name.contains(substr)) {
strip_mappings.insert(mapping.id);
return false;
}
true
});
// Strip functions.
let mut strip_functions: HashMap<u64, bool> = HashMap::new();
profile.function.retain(|function| {
let Some(name) = profile.string_table.get(function.name as usize) else {
return true;
};
for (regex, strip_children) in functions {
if regex.is_match(name) {
strip_functions.insert(function.id, *strip_children);
return false;
}
}
true
});
// Strip locations. The bool specifies whether child frames should be stripped too.
let mut strip_locations: HashMap<u64, bool> = HashMap::new();
profile.location.retain(|location| {
for line in &location.line {
if let Some(strip_children) = strip_functions.get(&line.function_id) {
strip_locations.insert(location.id, *strip_children);
return false;
}
}
if strip_mappings.contains(&location.mapping_id) {
strip_locations.insert(location.id, false);
return false;
}
true
});
// Strip sample locations.
for sample in &mut profile.sample {
// First, find the uppermost function with child removal and truncate the stack.
if let Some(truncate) = sample
.location_id
.iter()
.rposition(|id| strip_locations.get(id) == Some(&true))
{
sample.location_id.drain(..=truncate);
}
// Next, strip any individual frames without child removal.
sample
.location_id
.retain(|id| !strip_locations.contains_key(id));
}
profile
}
/// Generates an SVG flamegraph from a symbolized pprof profile.
pub fn flamegraph(
profile: Profile,
opts: &mut inferno::flamegraph::Options,
) -> anyhow::Result<Vec<u8>> {
if profile.mapping.iter().any(|m| !m.has_functions) {
bail!("profile not symbolized");
}
// Index locations, functions, and strings.
let locations: HashMap<u64, Location> =
profile.location.into_iter().map(|l| (l.id, l)).collect();
let functions: HashMap<u64, Function> =
profile.function.into_iter().map(|f| (f.id, f)).collect();
let strings = profile.string_table;
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
// since inferno expects it bottom-up.
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
for sample in profile.sample {
let mut stack = Vec::with_capacity(sample.location_id.len());
for location in sample.location_id.into_iter().rev() {
let Some(location) = locations.get(&location) else {
bail!("missing location {location}");
};
for line in location.line.iter().rev() {
let Some(function) = functions.get(&line.function_id) else {
bail!("missing function {}", line.function_id);
};
let Some(name) = strings.get(function.name as usize) else {
bail!("missing string {}", function.name);
};
stack.push(name.as_str());
}
}
let Some(&value) = sample.value.first() else {
bail!("missing value");
};
*stacks.entry(stack).or_default() += value;
}
// Construct stack lines for inferno.
let lines = stacks
.into_iter()
.map(|(stack, value)| (stack.into_iter().join(";"), value))
.map(|(stack, value)| format!("{stack} {value}"))
.sorted()
.collect_vec();
// Construct the flamegraph.
let mut bytes = Vec::new();
let lines = lines.iter().map(|line| line.as_str());
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
Ok(bytes)
}

View File

@@ -123,6 +123,10 @@ pub struct ConfigToml {
pub enable_read_path_debugging: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub validate_wal_contiguity: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub load_previous_heatmap: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -523,6 +527,8 @@ impl Default for ConfigToml {
None
},
validate_wal_contiguity: None,
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
}
}
}

View File

@@ -1165,6 +1165,21 @@ pub struct OffloadedTimelineInfo {
pub archived_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
@@ -1243,7 +1258,11 @@ pub struct TimelineInfo {
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
// read.
/// Whether the timeline is archived.
pub is_archived: Option<bool>,
/// The status of the rel_size migration.
pub rel_size_migration: Option<RelSizeMigration>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -34,8 +34,13 @@ where
.make_tls_connect(hostname)
.map_err(|e| Error::tls(e.into()))?;
let socket =
connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
let socket = connect_socket::connect_socket(
config.host_addr,
&config.host,
config.port,
config.connect_timeout,
)
.await?;
cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
}

View File

@@ -1,5 +1,6 @@
use std::collections::HashMap;
use std::fmt;
use std::net::IpAddr;
use std::sync::Arc;
use std::task::{Context, Poll};
use std::time::Duration;
@@ -137,6 +138,7 @@ impl InnerClient {
#[derive(Clone, Serialize, Deserialize)]
pub struct SocketConfig {
pub host_addr: Option<IpAddr>,
pub host: Host,
pub port: u16,
pub connect_timeout: Option<Duration>,

View File

@@ -1,5 +1,6 @@
//! Connection configuration.
use std::net::IpAddr;
use std::time::Duration;
use std::{fmt, str};
@@ -65,6 +66,7 @@ pub enum AuthKeys {
/// Connection configuration.
#[derive(Clone, PartialEq, Eq)]
pub struct Config {
pub(crate) host_addr: Option<IpAddr>,
pub(crate) host: Host,
pub(crate) port: u16,
@@ -83,6 +85,7 @@ impl Config {
/// Creates a new configuration.
pub fn new(host: String, port: u16) -> Config {
Config {
host_addr: None,
host: Host::Tcp(host),
port,
password: None,
@@ -163,6 +166,15 @@ impl Config {
self
}
pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config {
self.host_addr = Some(addr);
self
}
pub fn get_host_addr(&self) -> Option<IpAddr> {
self.host_addr
}
/// Sets the SSL configuration.
///
/// Defaults to `prefer`.

View File

@@ -1,3 +1,5 @@
use std::net::IpAddr;
use postgres_protocol2::message::backend::Message;
use tokio::net::TcpStream;
use tokio::sync::mpsc;
@@ -25,13 +27,14 @@ where
.make_tls_connect(hostname)
.map_err(|e| Error::tls(e.into()))?;
match connect_once(&config.host, config.port, tls, config).await {
match connect_once(config.host_addr, &config.host, config.port, tls, config).await {
Ok((client, connection)) => Ok((client, connection)),
Err(e) => Err(e),
}
}
async fn connect_once<T>(
host_addr: Option<IpAddr>,
host: &Host,
port: u16,
tls: T,
@@ -40,7 +43,7 @@ async fn connect_once<T>(
where
T: TlsConnect<TcpStream>,
{
let socket = connect_socket(host, port, config.connect_timeout).await?;
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
let RawConnection {
stream,
parameters,
@@ -50,6 +53,7 @@ where
} = connect_raw(socket, tls, config).await?;
let socket_config = SocketConfig {
host_addr,
host: host.clone(),
port,
connect_timeout: config.connect_timeout,

View File

@@ -1,5 +1,6 @@
use std::future::Future;
use std::io;
use std::net::{IpAddr, SocketAddr};
use std::time::Duration;
use tokio::net::{self, TcpStream};
@@ -9,15 +10,20 @@ use crate::Error;
use crate::config::Host;
pub(crate) async fn connect_socket(
host_addr: Option<IpAddr>,
host: &Host,
port: u16,
connect_timeout: Option<Duration>,
) -> Result<TcpStream, Error> {
match host {
Host::Tcp(host) => {
let addrs = net::lookup_host((&**host, port))
.await
.map_err(Error::connect)?;
let addrs = match host_addr {
Some(addr) => vec![SocketAddr::new(addr, port)],
None => net::lookup_host((&**host, port))
.await
.map_err(Error::connect)?
.collect(),
};
let mut last_err = None;

View File

@@ -15,7 +15,6 @@ arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
anyhow.workspace = true
backtrace.workspace = true
bincode.workspace = true
bytes.workspace = true
camino.workspace = true

View File

@@ -3,20 +3,24 @@ use std::env;
use sentry::ClientInitGuard;
pub use sentry::release_name;
use tracing::{error, info};
#[must_use]
pub fn init_sentry(
release_name: Option<Cow<'static, str>>,
extra_options: &[(&str, &str)],
) -> Option<ClientInitGuard> {
let dsn = env::var("SENTRY_DSN").ok()?;
let Ok(dsn) = env::var("SENTRY_DSN") else {
info!("not initializing Sentry, no SENTRY_DSN given");
return None;
};
let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into());
let guard = sentry::init((
dsn,
sentry::ClientOptions {
release: release_name,
environment: Some(environment.into()),
release: release_name.clone(),
environment: Some(environment.clone().into()),
..Default::default()
},
));
@@ -25,5 +29,19 @@ pub fn init_sentry(
scope.set_extra(key, value.into());
}
});
if let Some(dsn) = guard.dsn() {
info!(
"initialized Sentry for project {}, environment {}, release {} (using API {})",
dsn.project_id(),
environment,
release_name.unwrap_or(Cow::Borrowed("None")),
dsn.envelope_api_url(),
);
} else {
// This should panic during sentry::init(), but we may as well cover it.
error!("failed to initialize Sentry, invalid DSN");
}
Some(guard)
}

View File

@@ -98,6 +98,7 @@ criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
indoc.workspace = true
uuid.workspace = true
[[bench]]
name = "bench_layer_map"

View File

@@ -7,7 +7,6 @@ use std::time::Instant;
use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
use pageserver_api::key::Key;
@@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
.collect()
}
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer,
// no matter what lsn range it covers. This is just the easiest
// thing to do. A better thing to do would be to get a real
// partitioning from some database. Even better, remove the need
// for key partitions by deciding where to create image layers
// directly based on a coverage-based difficulty map.
let mut keys: Vec<_> = layer_map
.iter_historic_layers()
.filter_map(|l| {
if l.is_incremental() {
None
} else {
let kr = l.get_key_range();
Some(kr.start.next())
}
})
.collect();
keys.sort();
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
for key in keys {
parts.push(KeySpace {
ranges: vec![current_key..key],
});
current_key = key;
}
KeyPartitioning { parts }
}
// Benchmark using metadata extracted from our performance test environment, from
// a project where we have run pgbench many timmes. The pgbench database was initialized
// between each test run.
@@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) {
// Choose uniformly distributed queries
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Choose inputs for get_difficulty_map
let latest_lsn = layer_map
.iter_historic_layers()
.map(|l| l.get_lsn_range().end)
.max()
.unwrap();
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
// Check correctness of get_difficulty_map
// TODO put this in a dedicated test outside of this mod
{
println!("running correctness check");
let now = Instant::now();
let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
assert!(result_bruteforce.len() == partitioning.parts.len());
println!("Finished bruteforce in {:?}", now.elapsed());
let now = Instant::now();
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
assert!(result_fast.len() == partitioning.parts.len());
println!("Finished fast in {:?}", now.elapsed());
// Assert results are equal. Manually iterate for easier debugging.
let zip = std::iter::zip(
&partitioning.parts,
std::iter::zip(result_bruteforce, result_fast),
);
for (_part, (bruteforce, fast)) in zip {
assert_eq!(bruteforce, fast);
}
println!("No issues found");
}
// Define and name the benchmark function
let mut group = c.benchmark_group("real_map");
group.bench_function("uniform_queries", |b| {
@@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) {
}
});
});
group.bench_function("get_difficulty_map", |b| {
b.iter(|| {
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
});
});
group.finish();
}

View File

@@ -33,8 +33,9 @@ use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
@@ -42,6 +43,26 @@ pub enum BasebackupError {
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#} when {1}")]
Client(#[source] io::Error, &'static str),
#[error("basebackup during shutdown")]
Shutdown,
}
impl From<PageReconstructError> for BasebackupError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => BasebackupError::Shutdown,
err => BasebackupError::Server(err.into()),
}
}
}
impl From<GetVectoredError> for BasebackupError {
fn from(value: GetVectoredError) -> Self {
match value {
GetVectoredError::Cancelled => BasebackupError::Shutdown,
err => BasebackupError::Server(err.into()),
}
}
}
/// Create basebackup with non-rel data in it.
@@ -127,7 +148,7 @@ where
timeline
.gate
.enter()
.map_err(|e| BasebackupError::Server(e.into()))?,
.map_err(|_| BasebackupError::Shutdown)?,
),
};
basebackup
@@ -323,8 +344,7 @@ where
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.partition(
self.timeline.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -336,11 +356,10 @@ where
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for (key, block) in blocks {
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
let block = block?;
slru_builder.add_block(&key, block).await?;
}
}
@@ -349,11 +368,8 @@ where
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self
.timeline
.list_dbdirs(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -362,8 +378,7 @@ where
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
@@ -391,8 +406,7 @@ where
let aux_files = self
.timeline
.list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let aux_scan_time = start_time.elapsed();
let aux_estimated_size = aux_files
.values()
@@ -451,16 +465,14 @@ where
for xid in self
.timeline
.list_twophase_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
{
self.add_twophase_file(xid).await?;
}
let repl_origins = self
.timeline
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let n_origins = repl_origins.len();
if n_origins != 0 {
//
@@ -505,8 +517,7 @@ where
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
@@ -532,8 +543,7 @@ where
// TODO: investigate using get_vectored for the entire startblk..endblk range.
// But this code path is not on the critical path for most basebackups (?).
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
segment_data.extend_from_slice(&img[..]);
}
@@ -567,8 +577,7 @@ where
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
if img.len()
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -622,8 +631,7 @@ where
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.is_empty()
{
return Ok(());
@@ -674,8 +682,7 @@ where
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);

View File

@@ -194,6 +194,13 @@ pub struct PageServerConf {
/// Interpreted protocol feature: if enabled, validate that the logical WAL received from
/// safekeepers does not have gaps.
pub validate_wal_contiguity: bool,
/// When set, the previously written to disk heatmap is loaded on tenant attach and used
/// to avoid clobbering the heatmap from new, cold, attached locations.
pub load_previous_heatmap: bool,
/// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
pub generate_unarchival_heatmap: bool,
}
/// Token for authentication to safekeepers
@@ -358,6 +365,8 @@ impl PageServerConf {
get_vectored_concurrent_io,
enable_read_path_debugging,
validate_wal_contiguity,
load_previous_heatmap,
generate_unarchival_heatmap,
} = config_toml;
let mut conf = PageServerConf {
@@ -447,6 +456,8 @@ impl PageServerConf {
no_sync: no_sync.unwrap_or(false),
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
load_previous_heatmap: load_previous_heatmap.unwrap_or(false),
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false),
};
// ------------------------------------------------------------
@@ -480,7 +491,9 @@ impl PageServerConf {
#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
let test_id = uuid::Uuid::new_v4();
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
}
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -493,6 +506,8 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
synthetic_size_calculation_interval: Duration::from_secs(60),
background_task_maximum_delay: Duration::ZERO,
load_previous_heatmap: Some(true),
generate_unarchival_heatmap: Some(true),
..Default::default()
};
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()

View File

@@ -842,6 +842,12 @@ paths:
required: false
schema:
type: integer
- name: recurse
description: When set, will recurse with the downloads into ancestor timelines
in: query
required: false
schema:
type: boolean
post:
description: |
Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter

View File

@@ -481,6 +481,7 @@ async fn build_timeline_info_common(
state,
is_archived: Some(is_archived),
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
walreceiver_status,
};

View File

@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_per_read_batch_global",
"Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
vec![
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
],
)
.expect("failed to define a metric")
});
pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_per_read_amortized_global",
"Layers visited to serve a single read (read amplification). Amortized across a batch: \
all visited layers are divided by number of reads.",
vec![
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
],
)
.expect("failed to define a metric")
});
pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
// We expect this to be low because of Postgres checkpoints. Let's see if that holds.
register_histogram!(
@@ -4074,6 +4097,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
// histograms
[
&LAYERS_PER_READ_GLOBAL,
&LAYERS_PER_READ_BATCH_GLOBAL,
&LAYERS_PER_READ_AMORTIZED_GLOBAL,
&DELTAS_PER_READ_GLOBAL,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,

View File

@@ -392,10 +392,6 @@ impl TimelineHandles {
.await
.map_err(|e| match e {
timeline::handle::GetError::TenantManager(e) => e,
timeline::handle::GetError::TimelineGateClosed => {
trace!("timeline gate closed");
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
}
timeline::handle::GetError::PerTimelineStateShutDown => {
trace!("per-timeline state shut down");
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
@@ -422,24 +418,33 @@ pub(crate) struct TenantManagerTypes;
impl timeline::handle::Types for TenantManagerTypes {
type TenantManagerError = GetActiveTimelineError;
type TenantManager = TenantManagerWrapper;
type Timeline = Arc<Timeline>;
type Timeline = TenantManagerCacheItem;
}
impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
pub(crate) struct TenantManagerCacheItem {
pub(crate) timeline: Arc<Timeline>,
#[allow(dead_code)] // we store it to keep the gate open
pub(crate) gate_guard: GateGuard,
}
impl std::ops::Deref for TenantManagerCacheItem {
type Target = Arc<Timeline>;
fn deref(&self) -> &Self::Target {
&self.timeline
}
}
impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
Timeline::shard_timeline_id(self)
Timeline::shard_timeline_id(&self.timeline)
}
fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
&self.handles
&self.timeline.handles
}
fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
Timeline::get_shard_identity(self)
Timeline::get_shard_identity(&self.timeline)
}
}
@@ -448,7 +453,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
let tenant_id = self.tenant_id.get().expect("we set this in get()");
let timeout = ACTIVE_TENANT_TIMEOUT;
let wait_start = Instant::now();
@@ -491,7 +496,20 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
let timeline = tenant_shard
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
Ok(timeline)
let gate_guard = match timeline.gate.enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetActiveTimelineError::Timeline(
GetTimelineError::ShuttingDown,
));
}
};
Ok(TenantManagerCacheItem {
timeline,
gate_guard,
})
}
}
@@ -2095,6 +2113,7 @@ impl PageServerHandler {
// TODO: passthrough the error site to the final error message?
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
BasebackupError::Shutdown => QueryError::Shutdown,
}
}

View File

@@ -21,6 +21,7 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
// Otherwise, read the old reldir keyspace.
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
if self.get_rel_size_v2_enabled() {
if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
self.get_rel_size_v2_status()
{
// fetch directory listing (new)
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
forknum: *forknum,
}));
if !self.get_rel_size_v2_enabled() {
if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
return Ok(rels_v1);
}
@@ -599,28 +602,36 @@ impl Timeline {
let n_blocks = self
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
.await?;
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for blkno in 0..n_blocks {
let block = self
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
.await?;
segment.extend_from_slice(&block[..BLCKSZ as usize]);
}
Ok(segment.freeze())
}
/// Look up given SLRU page version.
pub(crate) async fn get_slru_page_at_lsn(
&self,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
assert!(self.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
self.get(key, lsn, ctx).await
let keyspace = KeySpace::single(
slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
);
let batches = keyspace.partition(
self.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
);
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for batch in batches.parts {
let blocks = self
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
.await?;
for (_key, block) in blocks {
let block = block?;
segment.extend_from_slice(&block[..BLCKSZ as usize]);
}
}
Ok(segment.freeze())
}
/// Get size of an SLRU segment
@@ -829,19 +840,41 @@ impl Timeline {
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.await?;
for blknum in (0..nblocks).rev() {
let clog_page = self
.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
let keyspace = KeySpace::single(
slru_block_to_key(SlruKind::Clog, segno, 0)
..slru_block_to_key(SlruKind::Clog, segno, nblocks),
);
let batches = keyspace.partition(
self.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
);
for batch in batches.parts.into_iter().rev() {
let blocks = self
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
.await?;
if clog_page.len() == BLCKSZ as usize + 8 {
let mut timestamp_bytes = [0u8; 8];
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
for (_key, clog_page) in blocks.into_iter().rev() {
let clog_page = clog_page?;
match f(timestamp) {
ControlFlow::Break(b) => return Ok(b),
ControlFlow::Continue(()) => (),
if clog_page.len() == BLCKSZ as usize + 8 {
let mut timestamp_bytes = [0u8; 8];
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
match f(timestamp) {
ControlFlow::Break(b) => return Ok(b),
ControlFlow::Continue(()) => (),
}
}
}
}
@@ -1720,6 +1753,35 @@ impl DatadirModification<'_> {
Ok(())
}
/// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
/// we enable it, we also need to persist it in `index_part.json`.
pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
let status = self.tline.get_rel_size_v2_status();
let config = self.tline.get_rel_size_v2_enabled();
match (config, status) {
(false, RelSizeMigration::Legacy) => {
// tenant config didn't enable it and we didn't write any reldir_v2 key yet
Ok(false)
}
(false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
// index_part already persisted that the timeline has enabled rel_size_v2
Ok(true)
}
(true, RelSizeMigration::Legacy) => {
// The first time we enable it, we need to persist it in `index_part.json`
self.tline
.update_rel_size_v2_status(RelSizeMigration::Migrating)?;
tracing::info!("enabled rel_size_v2");
Ok(true)
}
(true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
// index_part already persisted that the timeline has enabled rel_size_v2
// and we don't need to do anything
Ok(true)
}
}
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub async fn put_relmap_file(
&mut self,
@@ -1728,6 +1790,8 @@ impl DatadirModification<'_> {
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
let mut dbdir = DbDirectory::des(&buf)?;
@@ -1748,7 +1812,7 @@ impl DatadirModification<'_> {
})?;
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
if self.tline.get_rel_size_v2_enabled() {
if v2_enabled {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
}
@@ -1900,12 +1964,12 @@ impl DatadirModification<'_> {
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
let v2_enabled = self.maybe_enable_rel_size_v2()?;
if self.tline.get_rel_size_v2_enabled() {
if v2_enabled {
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
@@ -1940,6 +2004,10 @@ impl DatadirModification<'_> {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
@@ -1953,6 +2021,7 @@ impl DatadirModification<'_> {
)),
);
}
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
@@ -2031,6 +2100,7 @@ impl DatadirModification<'_> {
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
@@ -2043,7 +2113,7 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
dirty = true;
true
} else if self.tline.get_rel_size_v2_enabled() {
} else if v2_enabled {
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
// logic).
@@ -2074,7 +2144,7 @@ impl DatadirModification<'_> {
// Remove entry from relation size cache
self.tline.remove_cached_rel_size(&rel_tag);
// Delete size entry, as well as all blocks
// Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
self.delete(rel_key_range(rel_tag));
}
}

View File

@@ -31,8 +31,8 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pageserver_api::models;
pub use pageserver_api::models::TenantState;
use pageserver_api::models::{self, RelSizeMigration};
use pageserver_api::models::{
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
WalRedoManagerStatus,
@@ -1123,6 +1123,7 @@ impl Tenant {
CreateTimelineCause::Load,
idempotency.clone(),
index_part.gc_compaction.clone(),
index_part.rel_size_migration.clone(),
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -1149,7 +1150,7 @@ impl Tenant {
// a previous heatmap which contains all visible layers in the layer map.
// This previous heatmap will be used whenever a fresh heatmap is generated
// for the timeline.
if matches!(cause, LoadTimelineCause::Unoffload) {
if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
while let Some((tline, end_lsn)) = tline_ending_at {
let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
@@ -1581,6 +1582,10 @@ impl Tenant {
}
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
if !self.conf.load_previous_heatmap {
return None;
}
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
@@ -2446,6 +2451,7 @@ impl Tenant {
create_guard,
initdb_lsn,
None,
None,
)
.await
}
@@ -2501,6 +2507,7 @@ impl Tenant {
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
@@ -2522,6 +2529,11 @@ impl Tenant {
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
for in_memory in in_memory_layer_desc {
tline
.force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
@@ -2771,6 +2783,7 @@ impl Tenant {
timeline_create_guard,
initdb_lsn,
None,
None,
)
.await
}
@@ -4122,6 +4135,7 @@ impl Tenant {
cause: CreateTimelineCause,
create_idempotency: CreateTimelineIdempotency,
gc_compaction_state: Option<GcCompactionState>,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<Arc<Timeline>> {
let state = match cause {
CreateTimelineCause::Load => {
@@ -4154,6 +4168,7 @@ impl Tenant {
self.attach_wal_lag_cooldown.clone(),
create_idempotency,
gc_compaction_state,
rel_size_v2_status,
self.cancel.child_token(),
);
@@ -4856,6 +4871,7 @@ impl Tenant {
timeline_create_guard,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
Some(src_timeline.get_rel_size_v2_status()),
)
.await?;
@@ -5129,6 +5145,7 @@ impl Tenant {
timeline_create_guard,
pgdata_lsn,
None,
None,
)
.await?;
@@ -5207,13 +5224,14 @@ impl Tenant {
create_guard: TimelineCreateGuard,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<UninitializedTimeline<'a>> {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
resources
.remote_client
.init_upload_queue_for_empty_remote(new_metadata)?;
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
let timeline_struct = self
.create_timeline_struct(
@@ -5225,6 +5243,7 @@ impl Tenant {
CreateTimelineCause::Load,
create_guard.idempotency.clone(),
None,
rel_size_v2_status,
)
.context("Failed to create timeline data structure")?;
@@ -5913,6 +5932,8 @@ mod tests {
#[cfg(feature = "testing")]
use timeline::GcInfo;
#[cfg(feature = "testing")]
use timeline::InMemoryLayerTestDesc;
#[cfg(feature = "testing")]
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
use timeline::{CompactOptions, DeltaLayerTestDesc};
use utils::id::TenantId;
@@ -7925,6 +7946,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
@@ -8012,6 +8034,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
Vec::new(), // delta layers
vec![(
Lsn(0x20),
@@ -8227,6 +8250,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8307,6 +8331,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8380,6 +8405,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8512,6 +8538,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -8705,6 +8732,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x40),
delta1,
@@ -8761,6 +8789,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
Vec::new(),
image_layers,
end_lsn,
@@ -8967,6 +8996,7 @@ mod tests {
Lsn(0x08),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x08)..Lsn(0x10),
@@ -8985,7 +9015,7 @@ mod tests {
delta3,
),
], // delta layers
vec![], // image layers
vec![], // image layers
Lsn(0x50),
)
.await?
@@ -8996,6 +9026,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x48),
@@ -9546,6 +9577,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
@@ -9793,6 +9825,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
// delta1 and delta 2 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
@@ -10028,6 +10061,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![], // delta layers
vec![(Lsn(0x18), img_layer)], // image layers
Lsn(0x18),
@@ -10274,6 +10308,7 @@ mod tests {
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
delta_layer_start_lsn..delta_layer_end_lsn,
delta_layer_spec,
@@ -10305,6 +10340,158 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
let harness =
TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
let (tenant, ctx) = harness.load().await;
let will_init_keys = [2, 6];
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let mut expected_key_values = HashMap::new();
let baseline_image_layer_lsn = Lsn(0x10);
let mut baseline_img_layer = Vec::new();
for i in 0..5 {
let key = get_key(i);
let value = format!("value {i}@{baseline_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
baseline_img_layer.push((key, Bytes::from(value)));
}
let nested_image_layer_lsn = Lsn(0x50);
let mut nested_img_layer = Vec::new();
for i in 5..10 {
let key = get_key(i);
let value = format!("value {i}@{nested_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
nested_img_layer.push((key, Bytes::from(value)));
}
let frozen_layer = {
let lsn_range = Lsn(0x40)..Lsn(0x60);
let mut data = Vec::new();
for i in 0..10 {
let key = get_key(i);
let key_in_nested = nested_img_layer
.iter()
.any(|(key_with_img, _)| *key_with_img == key);
let lsn = {
if key_in_nested {
Lsn(nested_image_layer_lsn.0 + 5)
} else {
lsn_range.start
}
};
let will_init = will_init_keys.contains(&i);
if will_init {
data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
expected_key_values.insert(key, "".to_string());
} else {
let delta = format!("@{lsn}");
data.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
}
}
InMemoryLayerTestDesc {
lsn_range,
is_open: false,
data,
}
};
let (open_layer, last_record_lsn) = {
let start_lsn = Lsn(0x70);
let mut data = Vec::new();
let mut end_lsn = Lsn(0);
for i in 0..10 {
let key = get_key(i);
let lsn = Lsn(start_lsn.0 + i as u64);
let delta = format!("@{lsn}");
data.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
end_lsn = std::cmp::max(end_lsn, lsn);
}
(
InMemoryLayerTestDesc {
lsn_range: start_lsn..Lsn::MAX,
is_open: true,
data,
},
end_lsn,
)
};
assert!(
nested_image_layer_lsn > frozen_layer.lsn_range.start
&& nested_image_layer_lsn < frozen_layer.lsn_range.end
);
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![open_layer, frozen_layer], // in-memory layers
Vec::new(), // delta layers
vec![
(baseline_image_layer_lsn, baseline_img_layer),
(nested_image_layer_lsn, nested_img_layer),
], // image layers
last_record_lsn,
)
.await?;
let keyspace = KeySpace::single(get_key(0)..get_key(10));
let results = tline
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
.await
.expect("No vectored errors");
for (key, res) in results {
let value = res.expect("No key errors");
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
assert_eq!(value, Bytes::from(expected_value.clone()));
tracing::info!("key={key} value={expected_value}");
}
Ok(())
}
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
(
k1.is_delta,
@@ -10420,6 +10607,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -10804,6 +10992,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![
// delta1/2/4 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
@@ -11055,6 +11244,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![
// delta1/2/4 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),

File diff suppressed because it is too large Load Diff

View File

@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
/// The latest state
head: LayerCoverageTuple<Value>,
/// TODO: this could be an ordered vec using binary search.
/// We push into this map everytime we add a layer, so might see some benefit
/// All previous states
historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
}
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
buffer: BTreeMap<LayerKey, Option<Value>>,
/// All current layers. This is not used for search. Only to make rebuilds easier.
// TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
// [`Self::historic_coverage`] instead of doubling memory usage.
// [`Self::len`]: can require rebuild and serve from latest historic
// [`Self::iter`]: already requires rebuild => can serve from latest historic
layers: BTreeMap<LayerKey, Value>,
}

View File

@@ -194,7 +194,7 @@ pub(crate) use download::{
};
use index::GcCompactionState;
pub(crate) use index::LayerFileMetadata;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use regex::Regex;
use remote_storage::{
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {
/// Initialize the upload queue for the case where the remote storage is empty,
/// i.e., it doesn't have an `IndexPart`.
///
/// `rel_size_v2_status` needs to be carried over during branching, and that's why
/// it's passed in here.
pub fn init_upload_queue_for_empty_remote(
&self,
local_metadata: &TimelineMetadata,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<()> {
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
// certainly no point in starting more upload tasks than this.
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
.as_ref()
.map_or(0, |r| r.concurrency_limit());
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
let initialized_queue =
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
self.update_remote_physical_size_gauge(None);
info!("initialized upload queue as empty");
Ok(())
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
/// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
self: &Arc<Self>,
gc_compaction_state: GcCompactionState,
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
self: &Arc<Self>,
rel_size_v2_status: RelSizeMigration,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
// TODO: allow this operation to bypass the validation check because we might upload the index part
// with no layers but the flag updated. For now, we just modify the index part in memory and the next
// upload will include the flag.
// self.schedule_index_upload(upload_queue);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///

View File

@@ -7,6 +7,7 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::shard::ShardIndex;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
pub(crate) last_completed_lsn: Lsn,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
impl IndexPart {
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
/// used to understand later versions.

View File

@@ -869,8 +869,7 @@ impl<'a> TenantDownloader<'a> {
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
let layers_in_heatmap = heatmap_timeline
.layers
.iter()
.hot_layers()
.map(|l| (&l.name, l.metadata.generation))
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
@@ -1015,7 +1014,8 @@ impl<'a> TenantDownloader<'a> {
// Accumulate updates to the state
let mut touched = Vec::new();
for layer in timeline.layers {
let timeline_id = timeline.timeline_id;
for layer in timeline.into_hot_layers() {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!("Cancelled -- dropping out of layer loop");
return (Err(UpdateError::Cancelled), touched);
@@ -1040,7 +1040,7 @@ impl<'a> TenantDownloader<'a> {
}
match self
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
.download_layer(tenant_shard_id, &timeline_id, layer, ctx)
.await
{
Ok(Some(layer)) => touched.push(layer),
@@ -1148,7 +1148,7 @@ impl<'a> TenantDownloader<'a> {
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_id = timeline.timeline_id;
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
let (result, touched) = self
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
@@ -1316,11 +1316,11 @@ async fn init_timeline_state(
// As we iterate through layers found on disk, we will look up their metadata from this map.
// Layers not present in metadata will be discarded.
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
heatmap.hot_layers().map(|l| (&l.name, l)).collect();
let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
if let Some(last_heatmap) = last_heatmap {
last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
} else {
HashMap::new()
};

View File

@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(crate) timeline_id: TimelineId,
pub(crate) layers: Vec<HeatMapLayer>,
layers: Vec<HeatMapLayer>,
}
#[serde_as]
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {
#[serde_as(as = "TimestampSeconds<i64>")]
pub(crate) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
#[serde(default)]
pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}
impl HeatMapLayer {
@@ -62,11 +64,13 @@ impl HeatMapLayer {
name: LayerName,
metadata: LayerFileMetadata,
access_time: SystemTime,
cold: bool,
) -> Self {
Self {
name,
metadata,
access_time,
cold,
}
}
}
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
layers,
}
}
pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
self.layers.into_iter().filter(|l| !l.cold)
}
pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
self.layers.iter().filter(|l| !l.cold)
}
pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
self.layers.iter()
}
}
pub(crate) struct HeatMapStats {
@@ -92,7 +108,7 @@ impl HeatMapTenant {
layers: 0,
};
for timeline in &self.timelines {
for layer in &timeline.layers {
for layer in timeline.hot_layers() {
stats.layers += 1;
stats.bytes += layer.metadata.file_size;
}

View File

@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;
use self::inmemory_layer::InMemoryLayerFileId;
use super::PageReconstructError;
use super::layer_map::InMemoryLayerDesc;
use super::timeline::{GetVectoredError, ReadPath};
use crate::config::PageServerConf;
use crate::context::{AccessStatsBehavior, RequestContext};
@@ -721,6 +722,12 @@ struct LayerToVisitId {
lsn_floor: Lsn,
}
#[derive(Debug, PartialEq, Eq, Hash)]
pub enum ReadableLayerWeak {
PersistentLayer(Arc<PersistentLayerDesc>),
InMemoryLayer(InMemoryLayerDesc),
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
@@ -873,7 +880,7 @@ impl ReadableLayer {
}
ReadableLayer::InMemoryLayer(layer) => {
layer
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.await
}
}

View File

@@ -416,7 +416,7 @@ impl InMemoryLayer {
pub(crate) async fn get_values_reconstruct_data(
self: &Arc<InMemoryLayer>,
keyspace: KeySpace,
end_lsn: Lsn,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
@@ -433,8 +433,6 @@ impl InMemoryLayer {
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
let lsn_range = self.start_lsn..end_lsn;
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner
.index

View File

@@ -1563,10 +1563,10 @@ impl LayerInner {
self.access_stats.record_residence_event();
self.status.as_ref().unwrap().send_replace(Status::Evicted);
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
self.status.as_ref().unwrap().send_replace(Status::Evicted);
Ok(())
}

View File

@@ -49,6 +49,7 @@ async fn smoke_test() {
Lsn(0x10),
14,
&ctx,
Default::default(), // in-memory layers
Default::default(),
image_layers,
Lsn(0x100),

View File

@@ -473,21 +473,15 @@ async fn wait_for_active_tenant(
}
let mut update_rx = tenant.subscribe_for_state_updates();
loop {
tokio::select! {
_ = cancel.cancelled() => return ControlFlow::Break(()),
result = update_rx.changed() => if result.is_err() {
tokio::select! {
result = update_rx.wait_for(|s| s == &TenantState::Active) => {
if result.is_err() {
return ControlFlow::Break(());
}
}
match &*update_rx.borrow() {
TenantState::Active => {
debug!("Tenant state changed to active, continuing the task loop");
return ControlFlow::Continue(());
}
state => debug!("Not running the task loop, tenant is not active: {state:?}"),
}
debug!("Tenant state changed to active, continuing the task loop");
ControlFlow::Continue(())
},
_ = cancel.cancelled() => ControlFlow::Break(()),
}
}

View File

@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
use pageserver_api::models::{
CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
};
use pageserver_api::reltag::{BlockNumber, RelTag};
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::l0_flush::{self, L0FlushGlobalState};
use crate::metrics::{
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
};
use crate::page_service::TenantManagerTypes;
use crate::pgdatadir_mapping::{
@@ -436,6 +437,8 @@ pub struct Timeline {
/// May host a background Tokio task which downloads all the layers from the current
/// heatmap on demand.
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
}
pub(crate) enum PreviousHeatmap {
@@ -1328,10 +1331,6 @@ impl Timeline {
// (this is a requirement, not a bug). Skip updating the metric in these cases
// to avoid infinite results.
if !results.is_empty() {
// Record the total number of layers visited towards each key in the batch. While some
// layers may not intersect with a given read, and the cost of layer visits are
// amortized across the batch, each visited layer contributes directly to the observed
// latency for every read in the batch, which is what we care about.
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
static LOG_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
@@ -1346,9 +1345,23 @@ impl Timeline {
});
}
// Records the number of layers visited in a few different ways:
//
// * LAYERS_PER_READ: all layers count towards every read in the batch, because each
// layer directly affects its observed latency.
//
// * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
// layer visits and access cost.
//
// * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
// read amplification after batching.
let layers_visited = layers_visited as f64;
let avg_layers_visited = layers_visited / results.len() as f64;
LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
for _ in &results {
self.metrics.layers_per_read.observe(layers_visited as f64);
LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
self.metrics.layers_per_read.observe(layers_visited);
LAYERS_PER_READ_GLOBAL.observe(layers_visited);
LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
}
}
@@ -2368,6 +2381,9 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
/// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
/// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
/// possible that the index part persists the state while the config doesn't get persisted.
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2376,6 +2392,14 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
self.rel_size_v2_status
.load()
.as_ref()
.map(|s| s.as_ref().clone())
.unwrap_or(RelSizeMigration::Legacy)
}
fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2636,6 +2660,7 @@ impl Timeline {
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
create_idempotency: crate::tenant::CreateTimelineIdempotency,
gc_compaction_state: Option<GcCompactionState>,
rel_size_v2_status: Option<RelSizeMigration>,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2794,6 +2819,8 @@ impl Timeline {
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
heatmap_layers_downloader: Mutex::new(None),
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
};
result.repartition_threshold =
@@ -2870,6 +2897,16 @@ impl Timeline {
.schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
}
pub(crate) fn update_rel_size_v2_status(
&self,
rel_size_v2_status: RelSizeMigration,
) -> anyhow::Result<()> {
self.rel_size_v2_status
.store(Some(Arc::new(rel_size_v2_status.clone())));
self.remote_client
.schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
}
pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
self.gc_compaction_state.load_full().as_ref().clone()
}
@@ -3611,7 +3648,7 @@ impl Timeline {
let visible_non_resident = match previous_heatmap.as_deref() {
Some(PreviousHeatmap::Active {
heatmap, read_at, ..
}) => Some(heatmap.layers.iter().filter_map(|hl| {
}) => Some(heatmap.all_layers().filter_map(|hl| {
let desc: PersistentLayerDesc = hl.name.clone().into();
let layer = guard.try_get_from_key(&desc.key())?;
@@ -3627,7 +3664,7 @@ impl Timeline {
return None;
}
Some((desc, hl.metadata.clone(), hl.access_time))
Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
})),
Some(PreviousHeatmap::Obsolete) => None,
None => None,
@@ -3643,6 +3680,7 @@ impl Timeline {
layer.layer_desc().clone(),
layer.metadata(),
last_activity_ts,
false, // these layers are not cold
))
}
LayerVisibilityHint::Covered => {
@@ -3669,12 +3707,14 @@ impl Timeline {
// Sort layers in order of which to download first. For a large set of layers to download, we
// want to prioritize those layers which are most likely to still be in the resident many minutes
// or hours later:
// - Cold layers go last for convenience when a human inspects the heatmap.
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
// only exist for a few minutes before being compacted into L1s.
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
// the layer is likely to be covered by an image layer during compaction.
layers.sort_by_key(|(desc, _meta, _atime)| {
layers.sort_by_key(|(desc, _meta, _atime, cold)| {
std::cmp::Reverse((
*cold,
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
desc.lsn_range.end,
))
@@ -3682,7 +3722,9 @@ impl Timeline {
let layers = layers
.into_iter()
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
.map(|(desc, meta, atime, cold)| {
HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
})
.collect();
Some(HeatMapTimeline::new(self.timeline_id, layers))
@@ -3702,6 +3744,7 @@ impl Timeline {
name: vl.layer_desc().layer_name(),
metadata: vl.metadata(),
access_time: now,
cold: true,
};
heatmap_layers.push(hl);
}
@@ -3914,39 +3957,22 @@ impl Timeline {
let guard = timeline.layers.read().await;
let layers = guard.layer_map()?;
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
});
for range in unmapped_keyspace.ranges.iter() {
let results = layers.range_search(range.clone(), cont_lsn);
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayer::InMemoryLayer(l),
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
for range in unmapped_keyspace.ranges.iter() {
let results = layers.range_search(range.clone(), cont_lsn);
results
.found
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
}
results
.found
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
guard.upgrade(layer),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
// It's safe to drop the layer map lock after planning the next round of reads.
@@ -5555,6 +5581,14 @@ pub struct DeltaLayerTestDesc {
pub data: Vec<(Key, Lsn, Value)>,
}
#[cfg(test)]
#[derive(Clone)]
pub struct InMemoryLayerTestDesc {
pub lsn_range: Range<Lsn>,
pub data: Vec<(Key, Lsn, Value)>,
pub is_open: bool,
}
#[cfg(test)]
impl DeltaLayerTestDesc {
pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
@@ -6567,6 +6601,92 @@ impl Timeline {
Ok(())
}
/// Force create an in-memory layer and place them into the layer map.
#[cfg(test)]
pub(super) async fn force_create_in_memory_layer(
self: &Arc<Timeline>,
mut in_memory: InMemoryLayerTestDesc,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use utils::bin_ser::BeSer;
// Validate LSNs
if let Some(check_start_lsn) = check_start_lsn {
assert!(in_memory.lsn_range.start >= check_start_lsn);
}
let last_record_lsn = self.get_last_record_lsn();
let layer_end_lsn = if in_memory.is_open {
in_memory
.data
.iter()
.map(|(_key, lsn, _value)| lsn)
.max()
.cloned()
} else {
Some(in_memory.lsn_range.end)
};
if let Some(end) = layer_end_lsn {
assert!(
end <= last_record_lsn,
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
end,
last_record_lsn,
);
}
in_memory.data.iter().for_each(|(_key, lsn, _value)| {
assert!(*lsn >= in_memory.lsn_range.start);
assert!(*lsn < in_memory.lsn_range.end);
});
// Build the batch
in_memory
.data
.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
let data = in_memory
.data
.into_iter()
.map(|(key, lsn, value)| {
let value_size = value.serialized_size().unwrap() as usize;
(key.to_compact(), lsn, value_size, value)
})
.collect::<Vec<_>>();
let batch = SerializedValueBatch::from_values(data);
// Create the in-memory layer and write the batch into it
let layer = InMemoryLayer::create(
self.conf,
self.timeline_id,
self.tenant_shard_id,
in_memory.lsn_range.start,
&self.gate,
ctx,
)
.await
.unwrap();
layer.put_batch(batch, ctx).await.unwrap();
if !in_memory.is_open {
layer.freeze(in_memory.lsn_range.end).await;
}
info!("force created in-memory layer {:?}", in_memory.lsn_range);
// Link the layer to the layer map
{
let mut guard = self.layers.write().await;
let layer_map = guard.open_mut().unwrap();
layer_map.force_insert_in_memory_layer(Arc::new(layer));
}
Ok(())
}
/// Return all keys at the LSN in the image layers
#[cfg(test)]
pub(crate) async fn inspect_image_layers(
@@ -6926,6 +7046,7 @@ mod tests {
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use std::iter::Iterator;
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
@@ -6939,8 +7060,8 @@ mod tests {
use crate::tenant::{PreviousHeatmap, Timeline};
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
assert_eq!(lhs.layers.len(), rhs.layers.len());
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
for (l, r) in lhs_rhs {
assert_eq!(l.name, r.name);
assert_eq!(l.metadata, r.metadata);
@@ -6999,6 +7120,7 @@ mod tests {
Lsn(0x10),
14,
&ctx,
Vec::new(), // in-memory layers
delta_layers,
image_layers,
Lsn(0x100),
@@ -7017,10 +7139,11 @@ mod tests {
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
// L0 should come last
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());
let mut last_lsn = Lsn::MAX;
for layer in &heatmap.layers {
for layer in heatmap_layers {
// Covered layer should be omitted
assert!(layer.name != covered_delta.layer_name());
@@ -7132,6 +7255,7 @@ mod tests {
Lsn(0x10),
14,
&ctx,
Vec::new(), // in-memory layers
delta_layers,
image_layers,
Lsn(0x100),
@@ -7148,7 +7272,7 @@ mod tests {
.expect("Infallible while timeline is not shut down");
// Both layers should be in the heatmap
assert!(!heatmap.layers.is_empty());
assert!(heatmap.all_layers().count() > 0);
// Now simulate a migration.
timeline
@@ -7174,7 +7298,7 @@ mod tests {
.await
.expect("Infallible while timeline is not shut down");
assert!(post_eviction_heatmap.layers.is_empty());
assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)

View File

@@ -7,6 +7,7 @@
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::time::Instant;
use super::layer_manager::LayerManager;
use super::{
@@ -15,10 +16,11 @@ use super::{
Timeline,
};
use anyhow::{Context, anyhow, bail};
use anyhow::{Context, anyhow};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use futures::FutureExt;
use itertools::Itertools;
use once_cell::sync::Lazy;
use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
@@ -234,6 +236,12 @@ impl GcCompactionQueue {
// it enough in staging yet.
return Ok(());
}
if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
// If the gc watermark is not set, we don't need to trigger auto compaction.
// This check is the same as in `gc_compaction_split_jobs` but we don't log
// here and we can also skip the computation of the trigger condition earlier.
return Ok(());
}
let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
// Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
@@ -321,7 +329,7 @@ impl GcCompactionQueue {
l1_size, l2_size, l2_lsn, gc_cutoff
);
} else {
info!(
debug!(
"did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
l1_size, l2_size, l2_lsn, gc_cutoff
);
@@ -357,8 +365,7 @@ impl GcCompactionQueue {
GcCompactJob::from_compact_options(options.clone()),
options.sub_compaction_max_job_size_mb,
)
.await
.map_err(CompactionError::Other)?;
.await?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
self.notify_and_unblock(id);
@@ -437,6 +444,7 @@ impl GcCompactionQueue {
));
};
let has_pending_tasks;
let mut yield_for_l0 = false;
let Some((id, item)) = ({
let mut guard = self.inner.lock().unwrap();
if let Some((id, item)) = guard.queued.pop_front() {
@@ -486,13 +494,23 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
}
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
let compaction_result =
timeline.compact_with_options(cancel, options, ctx).await?;
self.notify_and_unblock(id);
if compaction_result == CompactionOutcome::YieldForL0 {
yield_for_l0 = true;
}
}
}
GcCompactionQueueItem::SubCompactionJob(options) => {
// TODO: error handling, clear the queue if any task fails?
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
if compaction_result == CompactionOutcome::YieldForL0 {
// We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
// again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
// we need to clean things up before returning from the function.
yield_for_l0 = true;
}
}
GcCompactionQueueItem::Notify(id, l2_lsn) => {
self.notify_and_unblock(id);
@@ -521,7 +539,10 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.running = None;
}
Ok(if has_pending_tasks {
Ok(if yield_for_l0 {
tracing::info!("give up gc-compaction: yield for L0 compaction");
CompactionOutcome::YieldForL0
} else if has_pending_tasks {
CompactionOutcome::Pending
} else {
CompactionOutcome::Done
@@ -719,17 +740,41 @@ struct CompactionStatisticsNumSize {
#[derive(Debug, Serialize, Default)]
pub struct CompactionStatistics {
/// Delta layer visited (maybe compressed, physical size)
delta_layer_visited: CompactionStatisticsNumSize,
/// Image layer visited (maybe compressed, physical size)
image_layer_visited: CompactionStatisticsNumSize,
/// Delta layer produced (maybe compressed, physical size)
delta_layer_produced: CompactionStatisticsNumSize,
/// Image layer produced (maybe compressed, physical size)
image_layer_produced: CompactionStatisticsNumSize,
num_delta_layer_discarded: usize,
num_image_layer_discarded: usize,
/// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
delta_layer_discarded: CompactionStatisticsNumSize,
/// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
image_layer_discarded: CompactionStatisticsNumSize,
num_unique_keys_visited: usize,
/// Delta visited (uncompressed, original size)
wal_keys_visited: CompactionStatisticsNumSize,
/// Image visited (uncompressed, original size)
image_keys_visited: CompactionStatisticsNumSize,
/// Delta produced (uncompressed, original size)
wal_produced: CompactionStatisticsNumSize,
/// Image produced (uncompressed, original size)
image_produced: CompactionStatisticsNumSize,
// Time spent in each phase
time_acquire_lock_secs: f64,
time_analyze_secs: f64,
time_download_layer_secs: f64,
time_main_loop_secs: f64,
time_final_phase_secs: f64,
time_total_secs: f64,
// Summary
/// Ratio of the key-value size before/after gc-compaction.
uncompressed_size_ratio: f64,
/// Ratio of the physical size before/after gc-compaction.
physical_size_ratio: f64,
}
impl CompactionStatistics {
@@ -779,11 +824,13 @@ impl CompactionStatistics {
self.image_produced.num += 1;
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
}
fn discard_delta_layer(&mut self) {
self.num_delta_layer_discarded += 1;
fn discard_delta_layer(&mut self, original_size: u64) {
self.delta_layer_discarded.num += 1;
self.delta_layer_discarded.size += original_size;
}
fn discard_image_layer(&mut self) {
self.num_image_layer_discarded += 1;
fn discard_image_layer(&mut self, original_size: u64) {
self.image_layer_discarded.num += 1;
self.image_layer_discarded.size += original_size;
}
fn produce_delta_layer(&mut self, size: u64) {
self.delta_layer_produced.num += 1;
@@ -793,6 +840,19 @@ impl CompactionStatistics {
self.image_layer_produced.num += 1;
self.image_layer_produced.size += size;
}
fn finalize(&mut self) {
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
self.uncompressed_size_ratio =
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
let produced_physical_size = self.image_layer_produced.size
+ self.delta_layer_produced.size
+ self.image_layer_discarded.size
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
self.physical_size_ratio =
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
}
}
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
@@ -825,9 +885,7 @@ impl Timeline {
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
self.compact_with_gc(cancel, options, ctx)
.await
.map_err(CompactionError::Other)?;
self.compact_with_gc(cancel, options, ctx).await?;
return Ok(CompactionOutcome::Done);
}
@@ -2345,12 +2403,19 @@ impl Timeline {
async fn check_compaction_space(
self: &Arc<Self>,
layer_selection: &[Layer],
) -> anyhow::Result<()> {
let available_space = self.check_available_space().await?;
) -> Result<(), CompactionError> {
let available_space = self
.check_available_space()
.await
.map_err(CompactionError::Other)?;
let mut remote_layer_size = 0;
let mut all_layer_size = 0;
for layer in layer_selection {
let needs_download = layer.needs_download().await?;
let needs_download = layer
.needs_download()
.await
.context("failed to check if layer needs download")
.map_err(CompactionError::Other)?;
if needs_download.is_some() {
remote_layer_size += layer.layer_desc().file_size;
}
@@ -2359,14 +2424,14 @@ impl Timeline {
let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
{
return Err(anyhow!(
return Err(CompactionError::Other(anyhow!(
"not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
available_space,
allocated_space,
all_layer_size,
remote_layer_size,
all_layer_size + remote_layer_size
));
)));
}
Ok(())
}
@@ -2397,7 +2462,7 @@ impl Timeline {
self: &Arc<Self>,
job: GcCompactJob,
sub_compaction_max_job_size_mb: Option<u64>,
) -> anyhow::Result<Vec<GcCompactJob>> {
) -> Result<Vec<GcCompactJob>, CompactionError> {
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
job.compact_lsn_range.end
} else {
@@ -2548,7 +2613,7 @@ impl Timeline {
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<CompactionOutcome, CompactionError> {
let sub_compaction = options.sub_compaction;
let job = GcCompactJob::from_compact_options(options.clone());
if sub_compaction {
@@ -2570,7 +2635,7 @@ impl Timeline {
if jobs_len == 0 {
info!("no jobs to run, skipping gc bottom-most compaction");
}
return Ok(());
return Ok(CompactionOutcome::Done);
}
self.compact_with_gc_inner(cancel, job, ctx).await
}
@@ -2580,19 +2645,24 @@ impl Timeline {
cancel: &CancellationToken,
job: GcCompactJob,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<CompactionOutcome, CompactionError> {
// Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
let timer = Instant::now();
let begin_timer = timer;
let gc_lock = async {
tokio::select! {
guard = self.gc_lock.lock() => Ok(guard),
// TODO: refactor to CompactionError to correctly pass cancelled error
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
_ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
}
};
let time_acquire_lock = timer.elapsed();
let timer = Instant::now();
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
@@ -2644,7 +2714,7 @@ impl Timeline {
tracing::warn!(
"no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
);
return Ok(());
return Ok(CompactionOutcome::Skipped);
}
real_gc_cutoff
} else {
@@ -2682,7 +2752,7 @@ impl Timeline {
"no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
gc_cutoff
);
return Ok(());
return Ok(CompactionOutcome::Done);
};
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
// the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
@@ -2703,7 +2773,7 @@ impl Timeline {
"no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
compact_lsn_range.end
);
return Ok(());
return Ok(CompactionOutcome::Done);
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
// layers to compact.
@@ -2729,7 +2799,7 @@ impl Timeline {
"no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
gc_cutoff, compact_key_range.start, compact_key_range.end
);
return Ok(());
return Ok(CompactionOutcome::Done);
}
retain_lsns_below_horizon.sort();
GcCompactionJobDescription {
@@ -2782,6 +2852,9 @@ impl Timeline {
has_data_below,
);
let time_analyze = timer.elapsed();
let timer = Instant::now();
for layer in &job_desc.selected_layers {
debug!("read layer: {}", layer.layer_desc().key());
}
@@ -2810,10 +2883,10 @@ impl Timeline {
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!(
return Err(CompactionError::Other(anyhow!(
"gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
err
);
)));
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = job_desc
@@ -2828,11 +2901,33 @@ impl Timeline {
let mut total_downloaded_size = 0;
let mut total_layer_size = 0;
for layer in &job_desc.selected_layers {
if layer.needs_download().await?.is_some() {
if layer
.needs_download()
.await
.context("failed to check if layer needs download")
.map_err(CompactionError::Other)?
.is_some()
{
total_downloaded_size += layer.layer_desc().file_size;
}
total_layer_size += layer.layer_desc().file_size;
let resident_layer = layer.download_and_keep_resident(ctx).await?;
if cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
return Ok(CompactionOutcome::YieldForL0);
}
let resident_layer = layer
.download_and_keep_resident(ctx)
.await
.context("failed to download and keep resident layer")
.map_err(CompactionError::Other)?;
downloaded_layers.push(resident_layer);
}
info!(
@@ -2843,19 +2938,36 @@ impl Timeline {
);
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
let layer = resident_layer
.get_as_delta(ctx)
.await
.context("failed to get delta layer")
.map_err(CompactionError::Other)?;
delta_layers.push(layer);
} else {
let layer = resident_layer.get_as_image(ctx).await?;
let layer = resident_layer
.get_as_image(ctx)
.await
.context("failed to get image layer")
.map_err(CompactionError::Other)?;
image_layers.push(layer);
}
}
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
let (dense_ks, sparse_ks) = self
.collect_gc_compaction_keyspace()
.await
.context("failed to collect gc compaction keyspace")
.map_err(CompactionError::Other)?;
let mut merge_iter = FilterIterator::create(
MergeIterator::create(&delta_layers, &image_layers, ctx),
dense_ks,
sparse_ks,
)?;
)
.context("failed to create filter iterator")
.map_err(CompactionError::Other)?;
let time_download_layer = timer.elapsed();
let timer = Instant::now();
// Step 2: Produce images+deltas.
let mut accumulated_values = Vec::new();
@@ -2874,7 +2986,9 @@ impl Timeline {
self.get_compaction_target_size(),
ctx,
)
.await?,
.await
.context("failed to create image layer writer")
.map_err(CompactionError::Other)?,
)
} else {
None
@@ -2887,7 +3001,9 @@ impl Timeline {
lowest_retain_lsn..end_lsn,
self.get_compaction_target_size(),
)
.await?;
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?;
#[derive(Default)]
struct RewritingLayers {
@@ -2927,9 +3043,28 @@ impl Timeline {
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard the writer in the end.
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
let mut keys_processed = 0;
while let Some(((key, lsn, val), desc)) = merge_iter
.next_with_trace()
.await
.context("failed to get next key-value pair")
.map_err(CompactionError::Other)?
{
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
return Err(CompactionError::ShuttingDown);
}
keys_processed += 1;
if keys_processed % 1000 == 0 {
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
return Ok(CompactionOutcome::YieldForL0);
}
}
if self.shard_identity.is_key_disposable(&key) {
// If this shard does not need to store this key, simply skip it.
@@ -2960,7 +3095,9 @@ impl Timeline {
desc.lsn_range.clone(),
ctx,
)
.await?,
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?,
);
}
rewriter.before.as_mut().unwrap()
@@ -2975,14 +3112,20 @@ impl Timeline {
desc.lsn_range.clone(),
ctx,
)
.await?,
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?,
);
}
rewriter.after.as_mut().unwrap()
} else {
unreachable!()
};
rewriter.put_value(key, lsn, val, ctx).await?;
rewriter
.put_value(key, lsn, val, ctx)
.await
.context("failed to put value")
.map_err(CompactionError::Other)?;
continue;
}
match val {
@@ -3005,9 +3148,13 @@ impl Timeline {
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
.await?,
.await
.context("failed to get ancestor image")
.map_err(CompactionError::Other)?,
)
.await?;
.await
.context("failed to generate key retention")
.map_err(CompactionError::Other)?;
retention
.pipe_to(
*last_key,
@@ -3016,7 +3163,9 @@ impl Timeline {
&mut stat,
ctx,
)
.await?;
.await
.context("failed to pipe to delta layer writer")
.map_err(CompactionError::Other)?;
accumulated_values.clear();
*last_key = key;
accumulated_values.push((key, lsn, val));
@@ -3034,9 +3183,14 @@ impl Timeline {
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
.await
.context("failed to get ancestor image")
.map_err(CompactionError::Other)?,
)
.await?;
.await
.context("failed to generate key retention")
.map_err(CompactionError::Other)?;
retention
.pipe_to(
last_key,
@@ -3045,21 +3199,36 @@ impl Timeline {
&mut stat,
ctx,
)
.await?;
.await
.context("failed to pipe to delta layer writer")
.map_err(CompactionError::Other)?;
// end: move the above part to the loop body
let time_main_loop = timer.elapsed();
let timer = Instant::now();
let mut rewrote_delta_layers = Vec::new();
for (key, writers) in delta_layer_rewriters {
if let Some(delta_writer_before) = writers.before {
let (desc, path) = delta_writer_before
.finish(job_desc.compaction_key_range.start, ctx)
.await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
.await
.context("failed to finish delta layer writer")
.map_err(CompactionError::Other)?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)
.context("failed to finish creating delta layer")
.map_err(CompactionError::Other)?;
rewrote_delta_layers.push(layer);
}
if let Some(delta_writer_after) = writers.after {
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
let (desc, path) = delta_writer_after
.finish(key.key_range.end, ctx)
.await
.context("failed to finish delta layer writer")
.map_err(CompactionError::Other)?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)
.context("failed to finish creating delta layer")
.map_err(CompactionError::Other)?;
rewrote_delta_layers.push(layer);
}
}
@@ -3074,7 +3243,9 @@ impl Timeline {
let end_key = job_desc.compaction_key_range.end;
writer
.finish_with_discard_fn(self, ctx, end_key, discard)
.await?
.await
.context("failed to finish image layer writer")
.map_err(CompactionError::Other)?
} else {
drop(writer);
Vec::new()
@@ -3086,7 +3257,9 @@ impl Timeline {
let produced_delta_layers = if !dry_run {
delta_layer_writer
.finish_with_discard_fn(self, ctx, discard)
.await?
.await
.context("failed to finish delta layer writer")
.map_err(CompactionError::Other)?
} else {
drop(delta_layer_writer);
Vec::new()
@@ -3098,6 +3271,13 @@ impl Timeline {
let mut keep_layers = HashSet::new();
let produced_delta_layers_len = produced_delta_layers.len();
let produced_image_layers_len = produced_image_layers.len();
let layer_selection_by_key = job_desc
.selected_layers
.iter()
.map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
.collect::<HashMap<_, _>>();
for action in produced_delta_layers {
match action {
BatchWriterResult::Produced(layer) => {
@@ -3111,8 +3291,16 @@ impl Timeline {
if cfg!(debug_assertions) {
info!("discarded delta layer: {}", l);
}
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
stat.discard_delta_layer(layer_desc.file_size());
} else {
tracing::warn!(
"discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
l
);
stat.discard_delta_layer(0);
}
keep_layers.insert(l);
stat.discard_delta_layer();
}
}
}
@@ -3121,6 +3309,9 @@ impl Timeline {
"produced rewritten delta layer: {}",
layer.layer_desc().key()
);
// For now, we include rewritten delta layer size in the "produce_delta_layer". We could
// make it a separate statistics in the future.
stat.produce_delta_layer(layer.layer_desc().file_size());
}
compact_to.extend(rewrote_delta_layers);
for action in produced_image_layers {
@@ -3132,8 +3323,16 @@ impl Timeline {
}
BatchWriterResult::Discarded(l) => {
debug!("discarded image layer: {}", l);
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
stat.discard_image_layer(layer_desc.file_size());
} else {
tracing::warn!(
"discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
l
);
stat.discard_image_layer(0);
}
keep_layers.insert(l);
stat.discard_image_layer();
}
}
}
@@ -3166,7 +3365,9 @@ impl Timeline {
&layer.layer_desc().key_range,
&job_desc.compaction_key_range,
) {
bail!("violated constraint: image layer outside of compaction key range");
return Err(CompactionError::Other(anyhow!(
"violated constraint: image layer outside of compaction key range"
)));
}
if !fully_contains(
&job_desc.compaction_key_range,
@@ -3179,13 +3380,25 @@ impl Timeline {
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
let time_final_phase = timer.elapsed();
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
stat.time_analyze_secs = time_analyze.as_secs_f64();
stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
stat.finalize();
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
serde_json::to_string(&stat)
.context("failed to serialize gc-compaction statistics")
.map_err(CompactionError::Other)?
);
if dry_run {
return Ok(());
return Ok(CompactionOutcome::Done);
}
info!(
@@ -3220,10 +3433,10 @@ impl Timeline {
// the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
// in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
if let Some(err) = check_valid_layermap(&final_layers) {
bail!(
return Err(CompactionError::Other(anyhow!(
"gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
err
);
)));
}
// Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
@@ -3275,7 +3488,9 @@ impl Timeline {
// find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
// be batched into `schedule_compaction_update`.
let disk_consistent_lsn = self.disk_consistent_lsn.load();
self.schedule_uploads(disk_consistent_lsn, None)?;
self.schedule_uploads(disk_consistent_lsn, None)
.context("failed to schedule uploads")
.map_err(CompactionError::Other)?;
// If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
// of `compact_from`.
let compact_from = {
@@ -3302,7 +3517,7 @@ impl Timeline {
drop(gc_lock);
Ok(())
Ok(CompactionOutcome::Done)
}
}

View File

@@ -306,6 +306,7 @@ impl DeleteTimelineFlow {
CreateTimelineCause::Delete,
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
None, // doesn't matter what we put here
None, // doesn't matter what we put here
)
.context("create_timeline_struct")?;

View File

@@ -1,5 +1,4 @@
//! An efficient way to keep the timeline gate open without preventing
//! timeline shutdown for longer than a single call to a timeline method.
//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
//!
//! # Motivation
//!
@@ -19,27 +18,32 @@
//! we hold the Timeline gate open while we're invoking the method on the
//! Timeline object.
//!
//! However, we want to avoid the overhead of entering the gate for every
//! method invocation.
//!
//! Further, for shard routing, we want to avoid calling the tenant manager to
//! resolve the shard for every request. Instead, we want to cache the
//! routing result so we can bypass the tenant manager for all subsequent requests
//! that get routed to that shard.
//! We want to avoid the overhead of doing, for each incoming request,
//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
//! release the mgr rwlock before doing any request processing work
//! - re-entering the Timeline gate for each Timeline method invocation.
//!
//! Regardless of how we accomplish the above, it should not
//! prevent the Timeline from shutting down promptly.
//!
//!
//! # Design
//!
//! ## Data Structures
//!
//! There are three user-facing data structures:
//! There are two concepts expressed as associated types in the `Types` trait:
//! - `TenantManager`: the thing that performs the expensive work. It produces
//! a `Timeline` object, which is the other associated type.
//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
//!
//! There are three user-facing data structures exposed by this module:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
//! trying to ugprade back to a `Handle`. If successful, a re-upgraded Handle will always
//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
//!
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
//!
//! To dispatch a request, the page service connection calls `Cache::get`.
//!
//! A cache miss means we consult the tenant manager for shard routing,
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
//! A cache miss means we call Types::TenantManager::resolve for shard routing,
//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
//! resolve() is the object we want to cache, and return `Handle`s to for subseqent `Cache::get` calls.
//!
//! We wrap the object returned from resolve() in an `Arc` and store that inside the
//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
//! and a strong ref in the `PerTimelineState`.
//! A strong ref is returned wrapped in a `Handle`.
//! Another strong ref is returned wrapped in a `Handle`.
//!
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
//! and find the weak ref in the cache.
@@ -78,51 +85,51 @@
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
//!
//! # Performance
//!
//! Remember from the introductory section:
//!
//! > However, we want to avoid the overhead of entering the gate for every
//! > method invocation.
//! > We want to avoid the overhead of doing, for each incoming request,
//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
//! > release the mgr rwlock before doing any request processing work
//! > - re-entering the Timeline gate for each Timeline method invocation.
//!
//! Why do we want to avoid that?
//! Because the gate is a shared location in memory and entering it involves
//! bumping refcounts, which leads to cache contention if done frequently
//! from multiple cores in parallel.
//! All of these boil down to some state that is either globally shared among all shards
//! or state shared among all tasks that serve a particular timeline.
//! It is either protected by RwLock or manipulated via atomics.
//! Even atomics are costly when shared across multiple cores.
//! So, we want to avoid any permanent need for coordination between page_service tasks.
//!
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
//! That `Arc` is private to the `HandleInner` and hence to the connection.
//! The solution is to add indirection: we wrap the Types::Timeline object that is
//! returned by Types::TenantManager into an Arc that is rivate to the `HandleInner`
//! and hence to the single Cache / page_service connection.
//! (Review the "Data Structures" section if that is unclear to you.)
//!
//! A `WeakHandle` is a weak ref to the `HandleInner`.
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
//!
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
//! Again, this is cheap because the `Arc` is private to the connection.
//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
//! The Mutex is not contended because it is private to the connection.
//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
//! Arc's refcounts are private to the connection.
//!
//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
//!
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
//!
//! # Shutdown
//!
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
//! ```
//!
//! Further, there is this cycle:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
//! ```
//!
//! The former cycle is a memory leak if not broken.
@@ -135,9 +142,12 @@
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
//! - Connection shutdown (=> dropping the `Cache`).
//!
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
//! `Arc<GateGuard>`.
//! Both transition the `HandleInner` from [`HandleInner::Open`] to
//! [`HandleInner::ShutDown`], which drops the only long-lived
//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
//! is dropped, the `Types::Timeline` gets dropped and thereby
//! the `GateGuard` and the `Arc<Timeline>` that it stores,
//! thereby breaking both cycles.
//!
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
//! thereby breaking the cycle.
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
pub(crate) trait Types: Sized + std::fmt::Debug {
type TenantManagerError: Sized + std::fmt::Debug;
type TenantManager: TenantManager<Self> + Sized;
type Timeline: ArcTimeline<Self> + Sized;
type Timeline: Timeline<Self> + Sized;
}
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {
/// See module-level comment.
pub(crate) struct Handle<T: Types> {
timeline: Arc<T::Timeline>,
#[allow(dead_code)] // the field exists to keep the gate open
gate_guard: Arc<utils::sync::gate::GateGuard>,
inner: Arc<Mutex<HandleInner<T>>>,
open: Arc<T::Timeline>,
}
pub(crate) struct WeakHandle<T: Types> {
inner: Weak<Mutex<HandleInner<T>>>,
}
enum HandleInner<T: Types> {
KeepingTimelineGateOpen {
#[allow(dead_code)]
gate_guard: Arc<utils::sync::gate::GateGuard>,
timeline: Arc<T::Timeline>,
},
Open(Arc<T::Timeline>),
ShutDown,
}
@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
}
/// Abstract view of an [`Arc<Timeline>`], for testability.
pub(crate) trait ArcTimeline<T: Types>: Clone {
fn gate(&self) -> &utils::sync::gate::Gate;
pub(crate) trait Timeline<T: Types> {
fn shard_timeline_id(&self) -> ShardTimelineId;
fn get_shard_identity(&self) -> &ShardIdentity;
fn per_timeline_state(&self) -> &PerTimelineState<T>;
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
#[derive(Debug)]
pub(crate) enum GetError<T: Types> {
TenantManager(T::TenantManagerError),
TimelineGateClosed,
PerTimelineStateShutDown,
}
@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
}
trace!("creating new HandleInner");
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
gate_guard: Arc::new(
// this enter() is expensive in production code because
// it hits the global Arc<Timeline>::gate refcounts
match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
},
),
// this clone is expensive in production code because
// it hits the global Arc<Timeline>::clone refcounts
timeline: Arc::new(timeline.clone()),
}));
let timeline = Arc::new(timeline);
let handle_inner_arc =
Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
let handle_weak = WeakHandle {
inner: Arc::downgrade(&handle_inner_arc),
};
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
};
let lock_guard = inner.lock().expect("poisoned");
match &*lock_guard {
HandleInner::KeepingTimelineGateOpen {
timeline,
gate_guard,
} => {
let gate_guard = Arc::clone(gate_guard);
let timeline = Arc::clone(timeline);
HandleInner::Open(open) => {
let open = Arc::clone(open);
drop(lock_guard);
Ok(Handle {
timeline,
gate_guard,
inner,
})
Ok(Handle { open, inner })
}
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
}
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.timeline
&self.open
}
}
@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
/// to the [`Types::Timeline`] that embeds this per-timeline state.
/// Even if [`TenantManager::resolve`] would still resolve to it.
///
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive.
/// That's ok because they're short-lived. See module-level comment for details.
#[instrument(level = "trace", skip_all)]
pub(super) fn shutdown(&self) {
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
impl<T: Types> HandleInner<T> {
fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
match std::mem::replace(self, HandleInner::ShutDown) {
HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
HandleInner::Open(timeline) => Some(timeline),
HandleInner::ShutDown => {
// Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
// may do it concurrently, but locking rules disallow holding per-timeline-state lock and
@@ -631,6 +614,7 @@ mod tests {
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardStripeSize;
use utils::shard::ShardCount;
use utils::sync::gate::GateGuard;
use super::*;
@@ -641,7 +625,7 @@ mod tests {
impl Types for TestTypes {
type TenantManagerError = anyhow::Error;
type TenantManager = StubManager;
type Timeline = Arc<StubTimeline>;
type Timeline = Entered;
}
struct StubManager {
@@ -656,17 +640,19 @@ mod tests {
myself: Weak<StubTimeline>,
}
struct Entered {
timeline: Arc<StubTimeline>,
#[allow(dead_code)] // it's stored here to keep the gate open
gate_guard: Arc<GateGuard>,
}
impl StubTimeline {
fn getpage(&self) {
// do nothing
}
}
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
impl Timeline<TestTypes> for Entered {
fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: self.shard.shard_index(),
@@ -688,20 +674,34 @@ mod tests {
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> anyhow::Result<Arc<StubTimeline>> {
) -> anyhow::Result<Entered> {
for timeline in &self.shards {
if timeline.id == timeline_id {
let enter_gate = || {
let gate_guard = timeline.gate.enter()?;
let gate_guard = Arc::new(gate_guard);
anyhow::Ok(gate_guard)
};
match &shard_selector {
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
return Ok(Arc::clone(timeline));
return Ok(Entered {
timeline: Arc::clone(timeline),
gate_guard: enter_gate()?,
});
}
ShardSelector::Zero => continue,
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
return Ok(Arc::clone(timeline));
return Ok(Entered {
timeline: Arc::clone(timeline),
gate_guard: enter_gate()?,
});
}
ShardSelector::Page(_) => continue,
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
return Ok(Arc::clone(timeline));
return Ok(Entered {
timeline: Arc::clone(timeline),
gate_guard: enter_gate()?,
});
}
ShardSelector::Known(_) => continue,
}
@@ -711,6 +711,13 @@ mod tests {
}
}
impl std::ops::Deref for Entered {
type Target = StubTimeline;
fn deref(&self) -> &Self::Target {
&self.timeline
}
}
#[tokio::test(start_paused = true)]
async fn test_timeline_shutdown() {
crate::tenant::harness::setup_logging();
@@ -1038,7 +1045,6 @@ mod tests {
let key = DBDIR_KEY;
// Simulate 10 connections that's opened, used, and closed
let mut used_handles = vec![];
for _ in 0..10 {
let mut cache = Cache::<TestTypes>::default();
let handle = {
@@ -1050,7 +1056,6 @@ mod tests {
handle
};
handle.getpage();
used_handles.push(Arc::downgrade(&handle.timeline));
}
// No handles exist, thus gates are closed and don't require shutdown.

View File

@@ -61,11 +61,11 @@ impl HeatmapLayersDownloader {
tracing::info!(
resident_size=%timeline.resident_physical_size(),
heatmap_layers=%heatmap.layers.len(),
heatmap_layers=%heatmap.all_layers().count(),
"Starting heatmap layers download"
);
let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
|layer| {
let ctx = ctx.attached_child();
let tl = timeline.clone();

View File

@@ -8,14 +8,14 @@ use tracing::trace;
use utils::id::TimelineId;
use utils::lsn::{AtomicLsn, Lsn};
use super::TimelineWriterState;
use super::{ReadableLayer, TimelineWriterState};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::metrics::TimelineMetrics;
use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
use crate::tenant::storage_layer::{
AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
PersistentLayerKey, ResidentLayer,
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
};
/// Provides semantic APIs to manipulate the layer map.
@@ -37,6 +37,21 @@ impl Default for LayerManager {
}
impl LayerManager {
pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
match weak {
ReadableLayerWeak::PersistentLayer(desc) => {
ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
}
ReadableLayerWeak::InMemoryLayer(desc) => {
let inmem = self
.layer_map()
.expect("no concurrent shutdown")
.in_memory_layer(&desc);
ReadableLayer::InMemoryLayer(inmem)
}
}
}
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
@@ -470,6 +485,25 @@ impl OpenLayerManager {
mapping.remove(layer);
layer.delete_on_drop();
}
#[cfg(test)]
pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
use pageserver_api::models::InMemoryLayerInfo;
match layer.info() {
InMemoryLayerInfo::Open { .. } => {
assert!(self.layer_map.open_layer.is_none());
self.layer_map.open_layer = Some(layer);
}
InMemoryLayerInfo::Frozen { lsn_start, .. } => {
if let Some(last) = self.layer_map.frozen_layers.back() {
assert!(last.get_lsn_range().end <= lsn_start);
}
self.layer_map.frozen_layers.push_back(layer);
}
}
}
}
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

View File

@@ -1369,6 +1369,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
if (lfc_ctl)
value = lfc_ctl->limit;
break;
case 8:
key = "file_cache_chunk_size_pages";
value = BLOCKS_PER_CHUNK;
break;
default:
SRF_RETURN_DONE(funcctx);
}

View File

@@ -1026,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n
if (!neon_prefetch_response_usable(&lsns[i], slot))
continue;
/*
* Ignore errors
*/
if (slot->response->tag != T_NeonGetPageResponse)
{
if (slot->response->tag != T_NeonErrorResponse)
{
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
}
continue;
}
memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
prefetch_set_unused(ring_index);
BITMAP_SET(mask, i);

View File

@@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst);
static void MembershipConfigurationFree(MembershipConfiguration *mconf);
WalProposer *
@@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
wp->config = config;
wp->api = api;
for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep)
wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
/*
* If safekeepers list starts with g# parse generation number followed by
* :
*/
if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0)
{
char *endptr;
errno = 0;
wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10);
if (errno != 0)
{
wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
}
/* Skip past : to the first hostname. */
host = endptr + 1;
}
else
{
host = wp->config->safekeepers_list;
}
wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation);
for (; host != NULL && *host != '\0'; host = sep)
{
port = strchr(host, ':');
if (port == NULL)
@@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp)
pfree(wp);
}
static bool
WalProposerGenerationsEnabled(WalProposer *wp)
{
return wp->safekeepers_generation != 0;
}
/*
* Create new AppendRequest message and start sending it. This function is
* called from walsender every time the new WAL is available.
@@ -600,10 +632,14 @@ static void
SendStartWALPush(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
/* Forbid implicit timeline creation if generations are enabled. */
char *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true";
#define CMD_LEN 512
char cmd[CMD_LEN];
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation);
if (!wp->api.conn_send_query(sk, cmd))
{
wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s",
@@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk)
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
pfree(mconf_toml);
/*
* Adopt mconf of safekeepers if it is higher. TODO: mconf change should
* restart wp if it started voting.
*/
if (sk->greetResponse.mconf.generation > wp->mconf.generation)
{
MembershipConfigurationFree(&wp->mconf);
MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
/* full conf was just logged above */
wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
}
/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
@@ -1896,7 +1944,8 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf
pq_sendint64_le(buf, m->termHistory->entries[i].term);
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
}
/*
/*
* Removed timeline_start_lsn. Still send it as a valid
* value until safekeepers taking it from term history are
* deployed.
@@ -2162,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
}
}
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
return false; /* keep the compiler quiet */
return false; /* keep the compiler quiet */
}
/*
@@ -2570,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf)
return s.data;
}
static void
MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst)
{
dst->generation = src->generation;
dst->members.len = src->members.len;
dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len);
memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len);
dst->new_members.len = src->new_members.len;
dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len);
memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len);
}
static void
MembershipConfigurationFree(MembershipConfiguration *mconf)
{

View File

@@ -160,7 +160,10 @@ typedef struct MemberSet
SafekeeperId *m; /* ids themselves */
} MemberSet;
/* Timeline safekeeper membership configuration. */
/*
* Timeline safekeeper membership configuration as sent in the
* protocol.
*/
typedef struct MembershipConfiguration
{
Generation generation;
@@ -761,8 +764,22 @@ typedef struct WalProposer
/* (n_safekeepers / 2) + 1 */
int quorum;
/*
* Generation of the membership conf of which safekeepers[] are presumably
* members. To make cplane life a bit easier and have more control in
* tests with which sks walproposer gets connected neon.safekeepers GUC
* doesn't provide full mconf, only the list of endpoints to connect to.
* We still would like to know generation associated with it because 1) we
* need some handle to enforce using generations in walproposer, and
* non-zero value of this serves the purpose; 2) currently we don't do
* that, but in theory walproposer can update list of safekeepers to
* connect to upon receiving mconf from safekeepers, and generation number
* must be checked to see which list is newer.
*/
Generation safekeepers_generation;
/* Number of occupied slots in safekeepers[] */
int n_safekeepers;
/* Safekeepers walproposer is connecting to. */
Safekeeper safekeeper[MAX_SAFEKEEPERS];
/* WAL has been generated up to this point */

Some files were not shown because too many files have changed in this diff Show More