Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-09 06:22:57 +00:00)
.github/actionlint.yml (vendored, 1 line changed)
@@ -32,3 +32,4 @@ config-variables:
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION
- BENCHMARK_LARGE_OLTP_PROJECTID
.github/actions/neon-branch-create/action.yml (vendored, 12 lines changed)
@@ -84,7 +84,13 @@ runs:
--header "Authorization: Bearer ${API_KEY}"
)

role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
role_name=$(echo "$roles" | jq --raw-output '
(.roles | map(select(.protected == false))) as $roles |
if any($roles[]; .name == "neondb_owner")
then "neondb_owner"
else $roles[0].name
end
')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
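The new jq program above prefers `neondb_owner` when it appears among the unprotected roles and otherwise falls back to the first unprotected role. A minimal, self-contained sketch of the same selection, run against a made-up `/roles` payload (the sample JSON is an assumption for illustration, not real API output):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Hypothetical API response; only the shape matters for this illustration.
roles='{"roles":[{"name":"web_access","protected":true},{"name":"neondb_owner","protected":false}]}'

role_name=$(echo "$roles" | jq --raw-output '
  (.roles | map(select(.protected == false))) as $roles |
  if any($roles[]; .name == "neondb_owner")
  then "neondb_owner"
  else $roles[0].name
  end
')
echo "$role_name"  # prints: neondb_owner
```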
@@ -107,13 +113,13 @@ runs:
)

if [ -z "${reset_password}" ]; then
sleep 1
sleep $i
continue
fi

password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep 1
sleep $i # increasing backoff
continue
fi
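The `sleep 1` to `sleep $i` change turns the fixed one-second pause into a delay that grows with the retry counter. A standalone sketch of that retry pattern, with an invented `reset_password_once` helper standing in for the API call and an assumed bound of 10 attempts:

```bash
#!/usr/bin/env bash
set -euo pipefail

reset_password_once() { false; }  # placeholder that always fails, just to exercise the loop

for i in $(seq 1 10); do
  if reset_password_once; then
    echo "reset succeeded on attempt $i"
    break
  fi
  echo "attempt $i failed; backing off for ${i}s"
  sleep "$i"  # linearly increasing backoff, mirroring `sleep $i` in the diff
done
```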
@@ -44,6 +44,11 @@ inputs:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -59,7 +64,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

@@ -112,6 +117,7 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -280,7 +280,7 @@ jobs:
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
path: /tmp/neon
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

@@ -347,6 +347,7 @@
real_s3_region: eu-central-1
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
sanitizers: ${{ inputs.sanitizers }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
@@ -359,7 +360,6 @@
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ inputs.sanitizers }}

# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
.github/workflows/benchmarking.yml (vendored, 2 lines changed)
@@ -141,6 +141,8 @@ jobs:
--ignore test_runner/performance/test_physical_replication.py
--ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
--ignore test_runner/performance/test_cumulative_statistics_persistence.py
--ignore test_runner/performance/test_perf_many_relations.py
--ignore test_runner/performance/test_perf_oltp_large_tenant.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
.github/workflows/build_and_test.yml (vendored, 41 lines changed)
@@ -692,15 +692,15 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

vm-compute-node-image:
vm-compute-node-image-arch:
needs: [ check-permissions, meta, compute-node-image ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: [ self-hosted, large ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
strategy:
fail-fast: false
matrix:
arch: [ amd64, arm64 ]
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
debian: bullseye
- pg: v15
@@ -717,7 +717,7 @@ jobs:

- name: Downloading vm-builder
run: |
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
chmod +x vm-builder

- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -738,12 +738,37 @@ jobs:
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-target-arch=linux/amd64
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
-target-arch=linux/${{ matrix.arch }}

- name: Pushing vm-compute-node image
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}

vm-compute-node-image:
needs: [ vm-compute-node-image-arch, meta ]
if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
runs-on: ubuntu-22.04
strategy:
matrix:
version:
# see the comment for `compute-node-image-arch` job
- pg: v14
- pg: v15
- pg: v16
- pg: v17
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

- name: Create multi-arch compute-node image
run: |
docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
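The reshaped jobs build and push one arch-suffixed tag per architecture, then a follow-up job merges those tags into a single multi-arch manifest list with `docker buildx imagetools create`. A hedged sketch of the same pattern with a placeholder image name (`example/app` is not a real repository):

```bash
# Each per-arch job pushes its own tag.
docker push example/app:1.2.3-amd64
docker push example/app:1.2.3-arm64

# A follow-up job stitches the per-arch tags into one manifest list,
# so `docker pull example/app:1.2.3` resolves to the matching architecture.
docker buildx imagetools create -t example/app:1.2.3 \
  example/app:1.2.3-amd64 \
  example/app:1.2.3-arm64

# Optional sanity check of the resulting manifest list.
docker buildx imagetools inspect example/app:1.2.3
```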
test-images:
needs: [ check-permissions, meta, neon-image, compute-node-image ]
@@ -1036,7 +1061,7 @@ jobs:
exit 1

deploy:
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
permissions:
@@ -52,8 +52,9 @@ jobs:
- name: Test extension upgrade
timeout-minutes: 20
env:
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
NEW_COMPUTE_TAG: latest
OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh
.github/workflows/large_oltp_benchmark.yml (vendored, new file, 147 lines)
@@ -0,0 +1,147 @@
name: large oltp benchmark

on:
# uncomment to run on push for debugging your PR
push:
branches: [ bodobolero/synthetic_oltp_workload ]

schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
workflow_dispatch: # adds ability to run this manually

defaults:
run:
shell: bash -euxo pipefail {0}

concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: large-oltp-bench-workflow
cancel-in-progress: true

jobs:
oltp:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
include:
- target: new_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
- target: reuse_branch
custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h
TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
PG_VERSION: 16 # pre-determined by pre-determined project
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
PLATFORM: ${{ matrix.target }}

runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init

# Increase timeout to 8h, default timeout is 6h
timeout-minutes: 480

steps:
- uses: actions/checkout@v4

- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role

- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

- name: Create Neon Branch for large tenant
if: ${{ matrix.target == 'new_branch' }}
id: create-neon-branch-oltp-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Set up Connection String
id: set-up-connstr
run: |
case "${{ matrix.target }}" in
new_branch)
CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
;;
reuse_branch)
CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
;;
*)
echo >&2 "Unknown target=${{ matrix.target }}"
exit 1
;;
esac

echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

- name: Benchmark pgbench with custom-scripts
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
pg_version: ${{ env.PG_VERSION }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target == 'new_branch' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Periodic large oltp perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
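The `custom_scripts` values use pgbench's `filename@weight` syntax: with weights 2/4/4, roughly one transaction in five runs the insert script and the rest run the two select scripts. A sketch of how such a weighted mix is typically passed to pgbench directly (the connection string and duration below are placeholders, not values from the workflow):

```bash
# Weighted custom-script mix, mirroring the matrix entries above.
pgbench \
  -f insert_webhooks.sql@2 \
  -f select_any_webhook_with_skew.sql@4 \
  -f select_recent_webhook.sql@4 \
  --time 3600 --progress 60 --no-vacuum \
  "postgres://user:password@example-host/neondb"
```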
.github/workflows/periodic_pagebench.yml (vendored, 16 lines changed)
@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 */3 * * *' # Runs every 3 hours
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
@@ -78,8 +78,10 @@ jobs:
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
fi

- name: Start Bench with run_id
@@ -89,7 +91,7 @@ jobs:
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"

- name: Poll Test Status
id: poll_step
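The request body now carries both the commit hash and how it was chosen (`latest` vs `manual`). Since the body is assembled inside a double-quoted string with escaped quotes, an alternative sketch builds the same JSON with `jq -n`, which sidesteps the escaping; the endpoint URL here is a placeholder, not the real bench-runner address:

```bash
# Build the JSON payload safely instead of hand-escaping quotes.
payload=$(jq -n \
  --arg hash "$COMMIT_HASH" \
  --arg type "$COMMIT_HASH_TYPE" \
  '{neonRepoCommitHash: $hash, neonRepoCommitHashType: $type}')

curl -X POST "https://bench-runner.example.invalid/start" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -H "Authorization: Bearer $API_KEY" \
  -d "$payload"
```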
@@ -1,8 +1,9 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling

# DevProd
/.github/ @neondatabase/developer-productivity
# DevProd & PerfCorr
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
/test_runner/ @neondatabase/performance-correctness

# Compute
/pgxn/ @neondatabase/compute
Cargo.lock (generated, 172 lines changed)
@@ -783,6 +783,28 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-extra"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum",
|
||||
"axum-core",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"headers",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"serde",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure_core"
|
||||
version = "0.21.0"
|
||||
@@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.21.1"
|
||||
version = "0.21.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
|
||||
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
@@ -1305,6 +1327,7 @@ dependencies = [
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"axum-extra",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -1316,6 +1339,7 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"notify",
|
||||
@@ -2297,7 +2321,7 @@ name = "framed-websockets"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"bytemuck",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
@@ -2410,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
|
||||
|
||||
[[package]]
|
||||
name = "futures-timer"
|
||||
version = "3.0.2"
|
||||
version = "3.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
|
||||
checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
@@ -2515,6 +2539,27 @@ version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
||||
|
||||
[[package]]
|
||||
name = "governor"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"dashmap 6.1.0",
|
||||
"futures-sink",
|
||||
"futures-timer",
|
||||
"futures-util",
|
||||
"no-std-compat",
|
||||
"nonzero_ext",
|
||||
"parking_lot 0.12.1",
|
||||
"portable-atomic",
|
||||
"quanta",
|
||||
"rand 0.8.5",
|
||||
"smallvec",
|
||||
"spinning_top",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "group"
|
||||
version = "0.12.1"
|
||||
@@ -2632,7 +2677,7 @@ version = "7.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"byteorder",
|
||||
"crossbeam-channel",
|
||||
"flate2",
|
||||
@@ -2640,6 +2685,30 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "headers"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
|
||||
dependencies = [
|
||||
"base64 0.21.7",
|
||||
"bytes",
|
||||
"headers-core",
|
||||
"http 1.1.0",
|
||||
"httpdate",
|
||||
"mime",
|
||||
"sha1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "headers-core"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
|
||||
dependencies = [
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -2777,12 +2846,9 @@ name = "http-utils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"backtrace",
|
||||
"bytes",
|
||||
"fail",
|
||||
"flate2",
|
||||
"hyper 0.14.30",
|
||||
"inferno 0.12.0",
|
||||
"itertools 0.10.5",
|
||||
"jemalloc_pprof",
|
||||
"metrics",
|
||||
@@ -3281,9 +3347,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "jemalloc_pprof"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
|
||||
checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libc",
|
||||
@@ -3367,7 +3433,7 @@ version = "9.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"js-sys",
|
||||
"pem",
|
||||
"ring",
|
||||
@@ -3482,9 +3548,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "mappings"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
|
||||
checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"libc",
|
||||
@@ -3725,6 +3791,12 @@ dependencies = [
|
||||
"memoffset 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "no-std-compat"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
@@ -3735,6 +3807,12 @@ dependencies = [
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nonzero_ext"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
|
||||
|
||||
[[package]]
|
||||
name = "notify"
|
||||
version = "8.0.0"
|
||||
@@ -4225,6 +4303,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"wal_decoder",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
@@ -4307,9 +4386,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.1.8"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
|
||||
checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"seize",
|
||||
@@ -4437,7 +4516,7 @@ version = "3.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -4591,6 +4670,12 @@ dependencies = [
|
||||
"never-say-never",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
||||
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.7"
|
||||
@@ -4755,12 +4840,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pprof_util"
|
||||
version = "0.6.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
|
||||
checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"backtrace",
|
||||
"flate2",
|
||||
"inferno 0.12.0",
|
||||
"num",
|
||||
"paste",
|
||||
"prost",
|
||||
@@ -5052,6 +5139,21 @@ dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quanta"
|
||||
version = "0.12.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"raw-cpuid",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"web-sys",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.26.0"
|
||||
@@ -5182,6 +5284,15 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raw-cpuid"
|
||||
version = "11.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
@@ -5752,7 +5863,7 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5761,7 +5872,7 @@ version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
@@ -6000,9 +6111,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "seize"
|
||||
version = "0.4.9"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
|
||||
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
@@ -6395,6 +6506,15 @@ version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
|
||||
[[package]]
|
||||
name = "spinning_top"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spki"
|
||||
version = "0.6.0"
|
||||
@@ -6471,6 +6591,7 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"governor",
|
||||
"hex",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
@@ -7285,10 +7406,12 @@ version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bitflags 2.8.0",
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -7642,7 +7765,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"backtrace",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -8196,7 +8318,7 @@ dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"base64 0.13.1",
|
||||
"base64 0.21.1",
|
||||
"base64 0.21.7",
|
||||
"base64ct",
|
||||
"bytes",
|
||||
"camino",
|
||||
|
||||
@@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
backtrace = "0.3.74"
|
||||
flate2 = "1.0.26"
|
||||
assert-json-diff = "2"
|
||||
async-stream = "0.3"
|
||||
@@ -68,6 +67,7 @@ aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.8.1", features = ["ws"] }
|
||||
axum-extra = { version = "0.10.0", features = ["typed-header"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.71"
|
||||
@@ -95,6 +95,7 @@ futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
governor = "0.8"
|
||||
hashbrown = "0.14"
|
||||
hashlink = "0.9.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
@@ -113,11 +114,10 @@ hyper-util = "0.1"
|
||||
tokio-tungstenite = "0.21.0"
|
||||
indexmap = "2"
|
||||
indoc = "2"
|
||||
inferno = "0.12.0"
|
||||
ipnet = "2.10.0"
|
||||
itertools = "0.10"
|
||||
itoa = "1.0.11"
|
||||
jemalloc_pprof = "0.6"
|
||||
jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
libc = "0.2"
|
||||
@@ -192,7 +192,7 @@ toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
|
||||
# This revision uses opentelemetry 0.27. There's no tag for it.
|
||||
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
|
||||
|
||||
Makefile (7 lines changed)
@@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
WITH_SANITIZERS ?= no
|
||||
PG_CFLAGS = -fsigned-char
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
PG_CFLAGS += -O2 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
@@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
|
||||
+@echo "Compiling pageinspect $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
|
||||
+@echo "Compiling pg_trgm $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
|
||||
+@echo "Compiling amcheck $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
|
||||
+@echo "Compiling test_decoding $*"
|
||||
|
||||
@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION:?} postgres
|
||||
RUN cd postgres && \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${PG_VERSION:?}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
|
||||
COPY compute/patches/pg_duckdb_v031.patch .
|
||||
COPY compute/patches/duckdb_v120.patch .
|
||||
# pg_duckdb build requires source dir to be a git repo to get submodules
|
||||
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
|
||||
# - access to duckdb.extensions table and its sequence
|
||||
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
|
||||
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
|
||||
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src/pg_duckdb-src
|
||||
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
|
||||
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_repack"
|
||||
@@ -1758,15 +1758,15 @@ ARG TARGETARCH
|
||||
# test_runner/regress/test_compute_metrics.py
|
||||
# See comment on the top of the file regading `echo`, `-e` and `\n`
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
|
||||
postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
|
||||
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
|
||||
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
|
||||
else\
|
||||
postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
|
||||
postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
|
||||
pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
|
||||
sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
|
||||
fi\
|
||||
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
|
||||
&& curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
&& curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
|
||||
| tar xzf - --strip-components=1 -C.\
|
||||
@@ -1933,6 +1933,7 @@ RUN apt update && \
|
||||
locales \
|
||||
procps \
|
||||
ca-certificates \
|
||||
rsyslog \
|
||||
$VERSION_INSTALLS && \
|
||||
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
@@ -1978,6 +1979,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
|
||||
# Make the libraries we built available
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
# rsyslog config permissions
|
||||
# directory for rsyslogd pid file
|
||||
RUN mkdir /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /etc/rsyslog.d/
|
||||
|
||||
|
||||
ENV LANG=en_US.utf8
|
||||
USER postgres
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
|
||||
@@ -29,6 +29,7 @@
import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
import 'sql_exporter/lfc_cache_size_limit.libsonnet',
import 'sql_exporter/lfc_chunk_size.libsonnet',
import 'sql_exporter/lfc_hits.libsonnet',
import 'sql_exporter/lfc_misses.libsonnet',
import 'sql_exporter/lfc_used.libsonnet',

@@ -1 +1,5 @@
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;
compute/etc/sql_exporter/lfc_chunk_size.libsonnet (new file, 10 lines)
@@ -0,0 +1,10 @@
{
metric_name: 'lfc_chunk_size',
type: 'gauge',
help: 'LFC chunk size, measured in 8KiB pages',
key_labels: null,
values: [
'lfc_chunk_size_pages',
],
query: importstr 'sql_exporter/lfc_chunk_size.sql',
}
compute/etc/sql_exporter/lfc_chunk_size.sql (new file, 1 line)
@@ -0,0 +1 @@
SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';
@@ -1,10 +1,20 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.

SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
tup_updated AS updated, tup_deleted AS deleted, datname
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2
AND datname <> 'postgres'
AND NOT datistemplate
ORDER BY oid
LIMIT 10
);
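`datconnlimit = -2` is how recent PostgreSQL versions mark a database as invalid (for example, one left behind by an interrupted DROP DATABASE), so both exporter queries now skip such databases before calling `pg_database_size()`. A quick, hedged way to check whether any invalid databases exist, assuming connection parameters come from the usual libpq environment variables:

```bash
# List databases currently marked invalid (datconnlimit = -2).
psql -X -At -c "SELECT datname FROM pg_database WHERE datconnlimit = -2;"
```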
@@ -39,6 +39,10 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
- name: rsyslogd
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
@@ -54,7 +58,7 @@ files:
|
||||
# regardless of hostname (ALL)
|
||||
#
|
||||
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -69,6 +73,12 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
|
||||
# compute_ctl will rewrite this file with the actual configuration, if needed.
|
||||
- filename: compute_rsyslog.conf
|
||||
content: |
|
||||
*.* /dev/null
|
||||
$IncludeConfig /etc/rsyslog.d/*.conf
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -132,6 +142,12 @@ merge: |
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
@@ -39,6 +39,10 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
- name: rsyslogd
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
@@ -54,7 +58,7 @@ files:
|
||||
# regardless of hostname (ALL)
|
||||
#
|
||||
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -69,6 +73,12 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
|
||||
# compute_ctl will rewrite this file with the actual configuration, if needed.
|
||||
- filename: compute_rsyslog.conf
|
||||
content: |
|
||||
*.* /dev/null
|
||||
$IncludeConfig /etc/rsyslog.d/*.conf
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -128,6 +138,11 @@ merge: |
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
@@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
anyhow.workspace = true
|
||||
axum = { workspace = true, features = [] }
|
||||
axum-extra.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
cfg-if.workspace = true
|
||||
@@ -25,6 +26,7 @@ fail.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
|
||||
@@ -33,39 +33,27 @@
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! ```
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc};
|
||||
use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Parser;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
|
||||
use compute_api::responses::ComputeCtlConfig;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use compute_tools::compute::{
|
||||
ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal,
|
||||
};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::disk_quota::set_disk_quota;
|
||||
use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
|
||||
use compute_tools::extension_server::get_pg_version_string;
|
||||
use compute_tools::http::server::Server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::swap::resize_swap;
|
||||
use rlimit::{Resource, setrlimit};
|
||||
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
|
||||
use signal_hook::iterator::Signals;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
use utils::failpoint_support;
|
||||
|
||||
@@ -164,29 +152,41 @@ fn main() -> Result<()> {
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
|
||||
let (pg_handle, start_pg_result) = {
|
||||
// Enter startup tracing context
|
||||
let _startup_context_guard = startup_context_from_env();
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
|
||||
let cli_spec = try_spec_from_cli(&cli)?;
|
||||
let cli_spec = try_spec_from_cli(&cli)?;
|
||||
|
||||
let compute = wait_spec(build_tag, &cli, cli_spec)?;
|
||||
let compute_node = ComputeNode::new(
|
||||
ComputeNodeParams {
|
||||
compute_id: cli.compute_id,
|
||||
connstr,
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
ext_remote_storage: cli.remote_ext_config.clone(),
|
||||
resize_swap_on_bind: cli.resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
|
||||
#[cfg(target_os = "linux")]
|
||||
filecache_connstr: cli.filecache_connstr,
|
||||
#[cfg(target_os = "linux")]
|
||||
cgroup: cli.cgroup,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
build_tag,
|
||||
|
||||
start_postgres(&cli, compute)?
|
||||
live_config_allowed: cli_spec.live_config_allowed,
|
||||
},
|
||||
cli_spec.spec,
|
||||
cli_spec.compute_ctl_config,
|
||||
)?;
|
||||
|
||||
// Startup is finished, exit the startup tracing span
|
||||
};
|
||||
|
||||
// PostgreSQL is now running, if startup was successful. Wait until it exits.
|
||||
let wait_pg_result = wait_postgres(pg_handle)?;
|
||||
|
||||
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
|
||||
|
||||
maybe_delay_exit(delay_exit);
|
||||
let exit_code = compute_node.run()?;
|
||||
|
||||
scenario.teardown();
|
||||
|
||||
deinit_and_exit(wait_pg_result);
|
||||
deinit_and_exit(exit_code);
|
||||
}
|
||||
|
||||
async fn init() -> Result<String> {
|
||||
@@ -207,56 +207,6 @@ async fn init() -> Result<String> {
|
||||
Ok(build_tag)
|
||||
}
|
||||
|
||||
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
//
|
||||
// This is used to propagate the context for the 'start_compute' operation
|
||||
// from the neon control plane. This allows linking together the wider
|
||||
// 'start_compute' operation that creates the compute container, with the
|
||||
// startup actions here within the container.
|
||||
//
|
||||
// There is no standard for passing context in env variables, but a lot of
|
||||
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
|
||||
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
|
||||
//
|
||||
// Switch to the startup context here, and exit it once the startup has
|
||||
// completed and Postgres is up and running.
|
||||
//
|
||||
// If this pod is pre-created without binding it to any particular endpoint
|
||||
// yet, this isn't the right place to enter the startup context. In that
|
||||
// case, the control plane should pass the tracing context as part of the
|
||||
// /configure API call.
|
||||
//
|
||||
// NOTE: This is supposed to only cover the *startup* actions. Once
|
||||
// postgres is configured and up-and-running, we exit this span. Any other
|
||||
// actions that are performed on incoming HTTP requests, for example, are
|
||||
// performed in separate spans.
|
||||
//
|
||||
// XXX: If the pod is restarted, we perform the startup actions in the same
|
||||
// context as the original startup actions, which probably doesn't make
|
||||
// sense.
|
||||
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
|
||||
if let Ok(val) = std::env::var("TRACEPARENT") {
|
||||
startup_tracing_carrier.insert("traceparent".to_string(), val);
|
||||
}
|
||||
if let Ok(val) = std::env::var("TRACESTATE") {
|
||||
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
||||
}
|
||||
if !startup_tracing_carrier.is_empty() {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry_sdk::propagation::TraceContextPropagator;
|
||||
let guard = TraceContextPropagator::new()
|
||||
.extract(&startup_tracing_carrier)
|
||||
.attach();
|
||||
info!("startup tracing context attached");
|
||||
Some(guard)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
if let Some(ref spec_json) = cli.spec_json {
|
||||
@@ -307,357 +257,7 @@ struct CliSpecParams {
|
||||
live_config_allowed: bool,
|
||||
}
|
||||
|
||||
fn wait_spec(
|
||||
build_tag: String,
|
||||
cli: &Cli,
|
||||
CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
compute_ctl_config: _,
|
||||
}: CliSpecParams,
|
||||
) -> Result<Arc<ComputeNode>> {
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
info!("new pspec.spec: {:?}", pspec.spec);
|
||||
new_state.pspec = Some(pspec);
|
||||
spec_set = true;
|
||||
} else {
|
||||
spec_set = false;
|
||||
}
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
let conn_conf = postgres::config::Config::from_str(connstr.as_str())
|
||||
.context("cannot build postgres config from connstr")?;
|
||||
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
|
||||
.context("cannot build tokio postgres config from connstr")?;
|
||||
let compute_node = ComputeNode {
|
||||
compute_id: cli.compute_id.clone(),
|
||||
connstr,
|
||||
conn_conf,
|
||||
tokio_conn_conf,
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage: cli.remote_ext_config.clone(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps Postgres start quicker later,
|
||||
// because QEMU will already have its memory allocated from the host, and
|
||||
// the necessary binaries will already be cached.
|
||||
if !spec_set {
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Launch the external HTTP server first, so that we can serve control plane
|
||||
// requests while configuration is still in progress.
|
||||
Server::External(cli.external_http_port).launch(&compute);
|
||||
|
||||
// The internal HTTP server could be launched later, but there isn't much
|
||||
// sense in waiting.
|
||||
Server::Internal(cli.internal_http_port).launch(&compute);
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
|
||||
if state.status == ComputeStatus::ConfigurationPending {
|
||||
info!("got spec, continue configuration");
|
||||
// Spec is already set by the http server handler.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Record for how long we slept waiting for the spec.
|
||||
let now = Utc::now();
|
||||
state.metrics.wait_for_spec_ms = now
|
||||
.signed_duration_since(state.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
// Reset start time, so that the total startup time that is calculated later will
|
||||
// not include the time that we waited for the spec.
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
Ok(compute)
|
||||
}
|
||||
|
||||
fn start_postgres(
cli: &Cli,
compute: Arc<ComputeNode>,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();

// Create a tracing span for the startup operation.
//
// We could otherwise just annotate the function with #[instrument], but if
// we're being configured from a /configure HTTP request, we want the
// startup to be considered part of the /configure request.
let _this_entered = {
// Temporarily enter the /configure request's span, so that the new span
// becomes its child.
let _parent_entered = state.startup_span.take().map(|p| p.entered());

tracing::info_span!("start_postgres")
}
.entered();

state.set_status(ComputeStatus::Init, &compute.state_changed);

info!(
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
// before we release the mutex, fetch some parameters for later.
let &ComputeSpec {
swap_size_bytes,
disk_quota_bytes,
#[cfg(target_os = "linux")]
disable_lfc_resizing,
..
} = &state.pspec.as_ref().unwrap().spec;
drop(state);

// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);

let mut prestartup_failed = false;
let mut delay_exit = false;

// Resize swap to the desired size if the compute spec says so
if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
// *before* starting postgres.
//
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
error!("{err:#}");

// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}

// Set disk quota if the compute spec says so
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
(disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
{
match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");
}
Err(err) => {
let err = err.context("failed to set disk quota");
error!("{err:#}");

// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
}

// Start Postgres
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
delay_exit = true;
None
}
};
} else {
warn!("skipping postgres startup because pre-startup step failed");
}

// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;

// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();

// don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
None
} else {
Some(cli.filecache_connstr.clone())
};

let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
let vm_monitor = tokio::spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: Some(cli.cgroup.clone()),
pgconnstr,
addr: cli.vm_monitor_addr.clone(),
})),
token.clone(),
));
Some(vm_monitor)
} else {
None
};
}
}

Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
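// A minimal sketch (not part of this diff) of the span-parenting trick used
// at the top of start_postgres() above: temporarily enter an optional parent
// span so that the new span is created as its child, then keep only the child
// entered. Names here are generic placeholders.
fn child_span_of_optional_parent(parent: Option<tracing::Span>) -> tracing::span::EnteredSpan {
    // While this guard is alive, the parent span is "current", so the span
    // created below is recorded as its child.
    let _parent_entered = parent.map(|p| p.entered());
    tracing::info_span!("startup").entered()
}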

type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);

struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,

#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}

fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");

let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);

// Process has exited. Wait for the log collecting task to finish.
let _ = tokio::runtime::Handle::current()
.block_on(logs_handle)
.map_err(|e| tracing::error!("log task panicked: {:?}", e));

info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}

Ok(WaitPostgresResult { exit_code })
}

struct WaitPostgresResult {
exit_code: Option<i32>,
}

fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();
}
}
}

// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
info!("syncing safekeepers on shutdown");
let storage_auth_token = pspec.storage_auth_token.clone();
let lsn = compute.sync_safekeepers(storage_auth_token)?;
info!("synced safekeepers at lsn {lsn}");
}

let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);

if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}

Ok(delay_exit)
}

fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}

fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

@@ -58,14 +58,14 @@ pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
let pgbin = &compute.pgbin;
let pgbin = &compute.params.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");

// Replace the DB in the connection string and disable it to parts.
// This is the only option to handle DBs with special characters.
let conf =
postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
.map_err(|_| SchemaDumpError::Unexpected)?;
let host = conf
.get_hosts()
.first()

File diff suppressed because it is too large
@@ -1,12 +1,16 @@
use anyhow::Result;
use std::fmt::Write as FmtWrite;
use std::fs::{File, OpenOptions};
use std::io;
use std::io::Write;
use std::io::prelude::*;
use std::path::Path;

use anyhow::Result;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};

use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
};

/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -55,10 +59,20 @@ pub fn write_postgres_conf(
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(
"safekeepers_connstrings is not zero, gen: {:?}",
spec.safekeepers_generation
);
// If generation is given, prepend sk list with g#number:
if let Some(generation) = spec.safekeepers_generation {
write!(neon_safekeepers_value, "g#{}:", generation)?;
}
neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
writeln!(
file,
"neon.safekeepers={}",
escape_conf_value(&spec.safekeeper_connstrings.join(","))
escape_conf_value(&neon_safekeepers_value)
)?;
}
if let Some(s) = &spec.tenant_id {

@@ -126,6 +140,54 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}

// If audit logging is enabled, configure pgaudit.
//
// Note, that this is called after the settings from spec are written.
// This way we always override the settings from the spec
// and don't allow the user or the control plane admin to change them.
if let ComputeAudit::Hipaa = spec.audit_log_level {
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
writeln!(file, "pgaudit.log='all'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;
// Set log rotation to 5 minutes
// TODO: tune this after performance testing
writeln!(file, "pgaudit.log_rotation_age=5")?;

// Add audit shared_preload_libraries, if they are not present.
//
// The caller who sets the flag is responsible for ensuring that the necessary
// shared_preload_libraries are present in the compute image,
// otherwise the compute start will fail.
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
let mut extra_shared_preload_libraries = String::new();
if !libs.contains("pgaudit") {
extra_shared_preload_libraries.push_str(",pgaudit");
}
if !libs.contains("pgauditlogtofile") {
extra_shared_preload_libraries.push_str(",pgauditlogtofile");
}
writeln!(
file,
"shared_preload_libraries='{}{}'",
libs, extra_shared_preload_libraries
)?;
} else {
// Typically, this should be unreacheable,
// because we always set at least some shared_preload_libraries in the spec
// but let's handle it explicitly anyway.
writeln!(
file,
"shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
)?;
}
writeln!(file, "# Managed by compute_ctl audit settings: end")?;
}
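// Illustration (not part of the diff): for a spec with audit_log_level set to
// ComputeAudit::Hipaa and shared_preload_libraries already set to 'neon', the
// block above appends roughly the following to postgresql.conf; the pgaudit
// libraries are appended to the existing value rather than replacing it:
//
//   pgaudit.log='all'
//   pgaudit.log_parameter=on
//   pgaudit.log_catalog=off
//   pgaudit.log_rotation_age=5
//   shared_preload_libraries='neon,pgaudit,pgauditlogtofile'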

writeln!(file, "neon.extension_server_port={}", extension_server_port)?;

if spec.drop_subscriptions_before_start {

@@ -0,0 +1,10 @@
# Load imfile module to read log files
module(load="imfile")

# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
global(workDirectory="/var/log")

# Forward logs to remote syslog server
*.* @@{remote_endpoint}

@@ -253,27 +253,31 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
}

// Do request to extension storage proxy, i.e.
// Do request to extension storage proxy, e.g.,
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// using HHTP GET
// and return the response body as bytes
//
// using HTTP GET and return the response body as bytes.
async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", ext_remote_storage, ext_path);
let filename = Path::new(ext_path)
.file_name()
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
.to_str()
.unwrap_or("unknown")
.to_string();

info!("Download extension {} from uri {}", ext_path, uri);
info!("Downloading extension file '{}' from uri {}", filename, uri);

match do_extension_server_request(&uri).await {
Ok(resp) => {
info!("Successfully downloaded remote extension data {}", ext_path);
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&StatusCode::OK.to_string()])
.with_label_values(&[&StatusCode::OK.to_string(), &filename])
.inc();
Ok(resp)
}
Err((msg, status)) => {
REMOTE_EXT_REQUESTS_TOTAL
.with_label_values(&[&status])
.with_label_values(&[&status, &filename])
.inc();
bail!(msg);
}

@@ -1,7 +1,9 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) mod request_id;

pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;
pub(crate) use request_id::RequestId;

compute_tools/src/http/extract/request_id.rs
@@ -0,0 +1,86 @@
use std::{
fmt::Display,
ops::{Deref, DerefMut},
};

use axum::{extract::FromRequestParts, response::IntoResponse};
use http::{StatusCode, request::Parts};

use crate::http::{JsonResponse, headers::X_REQUEST_ID};

/// Extract the request ID from the `X-Request-Id` header.
#[derive(Debug, Clone, Default)]
pub(crate) struct RequestId(pub String);

#[derive(Debug)]
/// Rejection used for [`RequestId`].
///
/// Contains one variant for each way the [`RequestId`] extractor can
/// fail.
pub(crate) enum RequestIdRejection {
/// The request is missing the header.
MissingRequestId,

/// The value of the header is invalid UTF-8.
InvalidUtf8,
}

impl RequestIdRejection {
pub fn status(&self) -> StatusCode {
match self {
RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
}
}

pub fn message(&self) -> String {
match self {
RequestIdRejection::MissingRequestId => "request ID is missing",
RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
}
.to_string()
}
}

impl IntoResponse for RequestIdRejection {
fn into_response(self) -> axum::response::Response {
JsonResponse::error(self.status(), self.message())
}
}

impl<S> FromRequestParts<S> for RequestId
where
S: Send + Sync,
{
type Rejection = RequestIdRejection;

async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
match parts.headers.get(X_REQUEST_ID) {
Some(value) => match value.to_str() {
Ok(request_id) => Ok(Self(request_id.to_string())),
Err(_) => Err(RequestIdRejection::InvalidUtf8),
},
None => Err(RequestIdRejection::MissingRequestId),
}
}
}

impl Deref for RequestId {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl DerefMut for RequestId {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

impl Display for RequestId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}
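// Minimal usage sketch (an assumption, not part of this diff): because
// RequestId implements FromRequestParts, it can be taken directly as a
// handler argument; Deref exposes the inner String. The handler name is
// made up for illustration.
async fn echo_request_id(request_id: RequestId) -> String {
    format!("request {}", *request_id)
}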

compute_tools/src/http/headers.rs
@@ -0,0 +1,2 @@
/// Constant for `X-Request-Id` header.
pub const X_REQUEST_ID: &str = "x-request-id";

compute_tools/src/http/middleware/authorize.rs
@@ -0,0 +1,145 @@
use std::{collections::HashSet, net::SocketAddr};

use anyhow::{Result, anyhow};
use axum::{RequestExt, body::Body, extract::ConnectInfo};
use axum_extra::{
TypedHeader,
headers::{Authorization, authorization::Bearer},
};
use futures::future::BoxFuture;
use http::{Request, Response, StatusCode};
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
use serde::Deserialize;
use tower_http::auth::AsyncAuthorizeRequest;
use tracing::warn;

use crate::http::{JsonResponse, extract::RequestId};

#[derive(Clone, Debug, Deserialize)]
pub(in crate::http) struct Claims {
compute_id: String,
}

#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
compute_id: String,
jwks: JwkSet,
validation: Validation,
}

impl Authorize {
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
let mut validation = Validation::new(Algorithm::EdDSA);
// Nothing is currently required
validation.required_spec_claims = HashSet::new();
validation.validate_exp = true;
// Unused by the control plane
validation.validate_aud = false;
// Unused by the control plane
validation.validate_nbf = false;

Self {
compute_id,
jwks,
validation,
}
}
}

impl AsyncAuthorizeRequest<Body> for Authorize {
type RequestBody = Body;
type ResponseBody = Body;
type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;

fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
let compute_id = self.compute_id.clone();
let jwks = self.jwks.clone();
let validation = self.validation.clone();

Box::pin(async move {
let request_id = request.extract_parts::<RequestId>().await.unwrap();

// TODO: Remove this check after a successful rollout
if jwks.keys.is_empty() {
warn!(%request_id, "Authorization has not been configured");

return Ok(request);
}

let connect_info = request
.extract_parts::<ConnectInfo<SocketAddr>>()
.await
.unwrap();

// In the event the request is coming from the loopback interface,
// allow all requests
if connect_info.ip().is_loopback() {
warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");

return Ok(request);
}

let TypedHeader(Authorization(bearer)) = request
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
.await
.map_err(|_| {
JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
})?;

let data = match Self::verify(&jwks, bearer.token(), &validation) {
Ok(claims) => claims,
Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
};

if data.claims.compute_id != compute_id {
return Err(JsonResponse::error(
StatusCode::UNAUTHORIZED,
"invalid claims in authorization token",
));
}

// Make claims available to any subsequent middleware or request
// handlers
request.extensions_mut().insert(data.claims);

Ok(request)
})
}
}

impl Authorize {
/// Verify the token using the JSON Web Key set and return the token data.
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
debug_assert!(!jwks.keys.is_empty());

for jwk in jwks.keys.iter() {
let decoding_key = match DecodingKey::from_jwk(jwk) {
Ok(key) => key,
Err(e) => {
warn!(
"Failed to construct decoding key from {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);

continue;
}
};

match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
Ok(data) => return Ok(data),
Err(e) => {
warn!(
"Failed to decode authorization token using {}: {}",
jwk.common.key_id.as_ref().unwrap(),
e
);

continue;
}
}
}

Err(anyhow!("Failed to verify authorization token"))
}
}
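// Minimal sketch (an assumption, not part of this diff): after `authorize`
// inserts the verified Claims into the request extensions, a downstream
// handler in this module tree could read them back with axum's Extension
// extractor. The handler name is made up; Claims derives Clone and Debug
// above, which is what Extension extraction and the Debug formatting rely on.
async fn whoami(axum::Extension(claims): axum::Extension<Claims>) -> String {
    format!("{claims:?}")
}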

compute_tools/src/http/middleware/mod.rs
@@ -0,0 +1 @@
pub(in crate::http) mod authorize;

@@ -7,6 +7,8 @@ use serde::Serialize;
use tracing::error;

mod extract;
mod headers;
mod middleware;
mod routes;
pub mod server;

@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.live_config_allowed {
if !compute.params.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),

@@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
params: Query<ExtensionServerParams>,
ext_server_params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.ext_remote_storage.is_none() {
if compute.params.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
@@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension(

remote_extensions.get_ext(
&filename,
params.is_library,
&compute.build_tag,
&compute.pgversion,
ext_server_params.is_library,
&compute.params.build_tag,
&compute.params.pgversion,
)
};

@@ -10,48 +10,58 @@ use axum::middleware::{self, Next};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::{get, post};
|
||||
use http::StatusCode;
|
||||
use jsonwebtoken::jwk::JwkSet;
|
||||
use tokio::net::TcpListener;
|
||||
use tower::ServiceBuilder;
|
||||
use tower_http::request_id::PropagateRequestIdLayer;
|
||||
use tower_http::trace::TraceLayer;
|
||||
use tracing::{Span, debug, error, info};
|
||||
use tower_http::{
|
||||
auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
|
||||
};
|
||||
use tracing::{Span, error, info};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, metrics, metrics_json, status, terminate,
|
||||
use super::{
|
||||
headers::X_REQUEST_ID,
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, metrics, metrics_json, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
/// `compute_ctl` has two servers: internal and external. The internal server
|
||||
/// binds to the loopback interface and handles communication from clients on
|
||||
/// the compute. The external server is what receives communication from the
|
||||
/// control plane, the metrics scraper, etc. We make the distinction because
|
||||
/// certain routes in `compute_ctl` only need to be exposed to local processes
|
||||
/// like Postgres via the neon extension and local_proxy.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum Server {
|
||||
Internal(u16),
|
||||
External(u16),
|
||||
Internal {
|
||||
port: u16,
|
||||
},
|
||||
External {
|
||||
port: u16,
|
||||
jwks: JwkSet,
|
||||
compute_id: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl Display for Server {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Server::Internal(_) => f.write_str("internal"),
|
||||
Server::External(_) => f.write_str("external"),
|
||||
Server::Internal { .. } => f.write_str("internal"),
|
||||
Server::External { .. } => f.write_str("external"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Server> for Router<Arc<ComputeNode>> {
|
||||
fn from(server: Server) -> Self {
|
||||
impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
fn from(server: &Server) -> Self {
|
||||
let mut router = Router::<Arc<ComputeNode>>::new();
|
||||
|
||||
router = match server {
|
||||
Server::Internal(_) => {
|
||||
Server::Internal { .. } => {
|
||||
router = router
|
||||
.route(
|
||||
"/extension_server/{*filename}",
|
||||
@@ -69,59 +79,71 @@ impl From<Server> for Router<Arc<ComputeNode>> {
|
||||
|
||||
router
|
||||
}
|
||||
Server::External(_) => router
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route("/metrics", get(metrics::get_metrics))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate)),
|
||||
Server::External {
|
||||
jwks, compute_id, ..
|
||||
} => {
|
||||
let unauthenticated_router =
|
||||
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
|
||||
|
||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate))
|
||||
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
|
||||
compute_id.clone(),
|
||||
jwks.clone(),
|
||||
)));
|
||||
|
||||
router
|
||||
.merge(unauthenticated_router)
|
||||
.merge(authenticated_router)
|
||||
}
|
||||
};
|
||||
|
||||
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
|
||||
ServiceBuilder::new()
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
let request_id = request
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
match request.uri().path() {
|
||||
"/metrics" => {
|
||||
debug!(%request_id, "{} {}", request.method(), request.uri())
|
||||
}
|
||||
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
|
||||
};
|
||||
})
|
||||
.on_response(
|
||||
|response: &http::Response<_>, latency: Duration, _span: &Span| {
|
||||
let request_id = response
|
||||
router
|
||||
.fallback(Server::handle_404)
|
||||
.method_not_allowed_fallback(Server::handle_405)
|
||||
.layer(
|
||||
ServiceBuilder::new()
|
||||
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
let request_id = request
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
info!(
|
||||
%request_id,
|
||||
code = response.status().as_u16(),
|
||||
latency = latency.as_millis()
|
||||
)
|
||||
},
|
||||
),
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
.layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
|
||||
info!(%request_id, "{} {}", request.method(), request.uri());
|
||||
})
|
||||
.on_response(
|
||||
|response: &http::Response<_>, latency: Duration, _span: &Span| {
|
||||
let request_id = response
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
info!(
|
||||
%request_id,
|
||||
code = response.status().as_u16(),
|
||||
latency = latency.as_millis()
|
||||
);
|
||||
},
|
||||
),
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,15 +167,15 @@ impl Server {
|
||||
match self {
|
||||
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
|
||||
// allow binding to localhost
|
||||
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
}
|
||||
}
|
||||
|
||||
fn port(self) -> u16 {
|
||||
fn port(&self) -> u16 {
|
||||
match self {
|
||||
Server::Internal(port) => port,
|
||||
Server::External(port) => port,
|
||||
Server::Internal { port, .. } => *port,
|
||||
Server::External { port, .. } => *port,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +202,9 @@ impl Server {
|
||||
);
|
||||
}
|
||||
|
||||
let router = Router::from(self).with_state(compute);
|
||||
let router = Router::from(&self)
|
||||
.with_state(compute)
|
||||
.into_make_service_with_connect_info::<SocketAddr>();
|
||||
|
||||
if let Err(e) = axum::serve(listener, router).await {
|
||||
error!("compute_ctl {} HTTP server error: {}", self, e);
|
||||
|
||||
@@ -21,6 +21,7 @@ mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod rsyslog;
pub mod spec;
mod spec_apply;
pub mod swap;

@@ -1,3 +1,5 @@
use std::collections::HashMap;
use tracing::info;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;

@@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
pub fn inlinify(s: &str) -> String {
s.replace('\n', "\u{200B}")
}

pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
//
// This is used to propagate the context for the 'start_compute' operation
// from the neon control plane. This allows linking together the wider
// 'start_compute' operation that creates the compute container, with the
// startup actions here within the container.
//
// There is no standard for passing context in env variables, but a lot of
// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
// https://github.com/open-telemetry/opentelemetry-specification/issues/740
//
// Switch to the startup context here, and exit it once the startup has
// completed and Postgres is up and running.
//
// If this pod is pre-created without binding it to any particular endpoint
// yet, this isn't the right place to enter the startup context. In that
// case, the control plane should pass the tracing context as part of the
// /configure API call.
//
// NOTE: This is supposed to only cover the *startup* actions. Once
// postgres is configured and up-and-running, we exit this span. Any other
// actions that are performed on incoming HTTP requests, for example, are
// performed in separate spans.
//
// XXX: If the pod is restarted, we perform the startup actions in the same
// context as the original startup actions, which probably doesn't make
// sense.
let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
if let Ok(val) = std::env::var("TRACEPARENT") {
startup_tracing_carrier.insert("traceparent".to_string(), val);
}
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
info!("got startup tracing context from env variables");
Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
} else {
None
}
}
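// Minimal usage sketch (an assumption, not from this diff): attach the
// extracted context while running the startup actions so that spans recorded
// during startup are linked to the control plane's 'start_compute' trace.
fn with_startup_context<T>(startup: impl FnOnce() -> T) -> T {
    // The guard detaches the OpenTelemetry context again when it is dropped.
    let _guard = startup_context_from_env().map(|ctx| ctx.attach());
    startup()
}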

@@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
register_int_counter_vec!(
"compute_ctl_remote_ext_requests_total",
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
// Do not use any labels like extension name yet.
// We can add them later if needed.
&["http_status"]
&["http_status", "filename"]
)
.expect("failed to define a metric")
});

@@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.clone();
let connstr = compute.params.connstr.clone();
let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));

// During startup and configuration we connect to every Postgres database,

@@ -186,15 +186,40 @@ impl DatabaseExt for Database {
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
fn pg_quote_dollar(&self) -> (String, String);
}

impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
/// clause, use `escape_literal()` instead.
fn pg_quote(&self) -> String {
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
format!("\"{}\"", self.replace('"', "\"\""))
}

/// This helper is intended to be used for dollar-escaping strings for usage
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
/// it also returns a tag that is intended to be used inside the outer
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
fn pg_quote_dollar(&self) -> (String, String) {
let mut tag: String = "".to_string();
let mut outer_tag = "x".to_string();

// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}

let escaped = format!("${tag}${self}${tag}$");

(escaped, outer_tag)
}
}
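// Worked example (values mirror the unit tests later in this diff): quoting a
// name that already contains a `$$` delimiter bumps the inner tag to `$x$...$x$`
// and hands back `xx` as the outer tag.
fn escaping_example() {
    let ident = PgIdent::from("name$$");
    assert_eq!(ident.pg_quote(), "\"name$$\"");
    let (escaped, outer_tag) = ident.pg_quote_dollar();
    assert_eq!(escaped, "$x$name$$$x$");
    assert_eq!(outer_tag, "xx");
}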

@@ -226,10 +251,13 @@ pub async fn get_existing_dbs_async(
// invalid state. See:
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
let rowstream = client
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
// because the latter automatically wraps the result in double quotes,
// if the role name contains special characters.
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
datdba::regrole::text AS owner,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit = - 2 AS invalid
FROM

compute_tools/src/rsyslog.rs
@@ -0,0 +1,77 @@
use std::process::Command;
use std::{fs::OpenOptions, io::Write};

use anyhow::{Context, Result};
use tracing::info;

fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
.arg("rsyslogd")
.output()
.expect("Failed to execute pgrep");

if !output.stdout.is_empty() {
let pid = std::str::from_utf8(&output.stdout)
.expect("Invalid UTF-8 in process output")
.trim()
.to_string();
Some(pid)
} else {
None
}
}

// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
// Rsyslogd shouldn't lose any messages, because of the restart,
// because it tracks the last read position in the log files
// and will continue reading from that position.
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);

// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
.output()
.context("Failed to stop rsyslogd")?;

Ok(())
}

pub fn configure_audit_rsyslog(
log_directory: &str,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
remote_endpoint = remote_endpoint
);

info!("rsyslog config_content: {}", config_content);

let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
let mut file = OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(rsyslog_conf_path)?;

file.write_all(config_content.as_bytes())?;

info!(
"rsyslog configuration file {} added successfully. Starting rsyslogd",
rsyslog_conf_path
);

// start the service, using the configuration
restart_rsyslog()?;

Ok(())
}
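// Hypothetical call site (an assumption, not part of this diff): when the
// compute spec enables audit logging, compute_ctl could forward the pgaudit
// log files like this. All three argument values below are made-up examples.
fn enable_audit_forwarding() -> Result<()> {
    configure_audit_rsyslog(
        "/var/db/postgres/compute/pg_data/log", // directory watched by imfile
        "postgres-audit",                       // syslog tag for the stream
        "audit-collector.local:5140",           // fills {remote_endpoint} in the template
    )
}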
|
||||
@@ -6,21 +6,22 @@ use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
|
||||
use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
|
||||
use futures::future::join_all;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio_postgres::Client;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
|
||||
use crate::compute::{ComputeNode, ComputeState};
|
||||
use crate::pg_helpers::{
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
|
||||
get_existing_roles_async,
|
||||
};
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
|
||||
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
|
||||
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
|
||||
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
};
|
||||
@@ -187,7 +188,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
for phase in [
|
||||
CreateSuperUser,
|
||||
CreateNeonSuperuser,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -277,6 +278,19 @@ impl ComputeNode {
|
||||
phases.push(FinalizeDropLogicalSubscriptions);
|
||||
}
|
||||
|
||||
// Keep DisablePostgresDBPgAudit phase at the end,
|
||||
// so that all config operations are audit logged.
|
||||
match spec.audit_log_level
|
||||
{
|
||||
ComputeAudit::Hipaa => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(CreatePgauditlogtofileExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
ComputeAudit::Log => { /* not implemented yet */ }
|
||||
ComputeAudit::Disabled => {}
|
||||
}
|
||||
|
||||
for phase in phases {
|
||||
debug!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
@@ -455,7 +469,7 @@ pub enum PerDatabasePhase {
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ApplySpecPhase {
|
||||
CreateSuperUser,
|
||||
CreateNeonSuperuser,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -463,6 +477,9 @@ pub enum ApplySpecPhase {
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
|
||||
CreatePgauditExtension,
|
||||
CreatePgauditlogtofileExtension,
|
||||
DisablePostgresDBPgAudit,
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension,
|
||||
CreateAvailabilityCheck,
|
||||
@@ -579,14 +596,10 @@ async fn get_operations<'a>(
|
||||
apply_spec_phase: &'a ApplySpecPhase,
|
||||
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
|
||||
match apply_spec_phase {
|
||||
ApplySpecPhase::CreateSuperUser => {
|
||||
let query = construct_superuser_query(spec);
|
||||
|
||||
Ok(Box::new(once(Operation {
|
||||
query,
|
||||
comment: None,
|
||||
})))
|
||||
}
|
||||
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
|
||||
query: include_str!("sql/create_neon_superuser.sql").to_string(),
|
||||
comment: None,
|
||||
}))),
|
||||
ApplySpecPhase::DropInvalidDatabases => {
|
||||
let mut ctx = ctx.write().await;
|
||||
let databases = &mut ctx.dbs;
|
||||
@@ -720,14 +733,15 @@ async fn get_operations<'a>(
|
||||
// We do not check whether the DB exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_db" => {
|
||||
let (db_name, outer_tag) = op.name.pg_quote_dollar();
|
||||
// In Postgres we can't drop a database if it is a template.
|
||||
// So we need to unset the template flag first, but it could
|
||||
// be a retry, so we could've already dropped the database.
|
||||
// Check that database exists first to make it idempotent.
|
||||
let unset_template_query: String = format!(
|
||||
include_str!("sql/unset_template_for_drop_dbs.sql"),
|
||||
datname_str = escape_literal(&op.name),
|
||||
datname = &op.name.pg_quote()
|
||||
datname = db_name,
|
||||
outer_tag = outer_tag,
|
||||
);
|
||||
|
||||
// Use FORCE to drop database even if there are active connections.
|
||||
@@ -834,6 +848,8 @@ async fn get_operations<'a>(
|
||||
comment: None,
|
||||
},
|
||||
Operation {
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
query: format!(
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
|
||||
db.name.pg_quote()
|
||||
@@ -893,9 +909,11 @@ async fn get_operations<'a>(
|
||||
PerDatabasePhase::DropLogicalSubscriptions => {
|
||||
match &db {
|
||||
DB::UserDB(db) => {
|
||||
let (db_name, outer_tag) = db.name.pg_quote_dollar();
|
||||
let drop_subscription_query: String = format!(
|
||||
include_str!("sql/drop_subscriptions.sql"),
|
||||
datname_str = escape_literal(&db.name),
|
||||
datname_str = db_name,
|
||||
outer_tag = outer_tag,
|
||||
);
|
||||
|
||||
let operations = vec![Operation {
|
||||
@@ -934,6 +952,7 @@ async fn get_operations<'a>(
|
||||
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
|
||||
DB::UserDB(db) => db.owner.pg_quote(),
|
||||
};
|
||||
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
|
||||
|
||||
Some(vec![
|
||||
// This will reassign all dependent objects to the db owner
|
||||
@@ -948,7 +967,9 @@ async fn get_operations<'a>(
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
|
||||
role_name = quoted,
|
||||
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
role_name = escaped_role,
|
||||
outer_tag = outer_tag,
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
@@ -973,12 +994,14 @@ async fn get_operations<'a>(
|
||||
DB::SystemDB => return Ok(Box::new(empty())),
|
||||
DB::UserDB(db) => db,
|
||||
};
|
||||
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
|
||||
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/set_public_schema_owner.sql"),
|
||||
db_owner = db.owner.pg_quote()
|
||||
db_owner = db_owner,
|
||||
outer_tag = outer_tag,
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
@@ -1098,6 +1121,25 @@ async fn get_operations<'a>(
|
||||
}
|
||||
Ok(Box::new(empty()))
|
||||
}
|
||||
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
|
||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
|
||||
comment: Some(String::from("create pgaudit extensions")),
|
||||
}))),
|
||||
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
|
||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
|
||||
comment: Some(String::from("create pgauditlogtofile extensions")),
|
||||
}))),
|
||||
// Disable pgaudit logging for postgres database.
|
||||
// Postgres is neon system database used by monitors
|
||||
// and compute_ctl tuning functions and thus generates a lot of noise.
|
||||
// We do not consider data stored in this database as sensitive.
|
||||
ApplySpecPhase::DisablePostgresDBPgAudit => {
|
||||
let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
|
||||
Ok(Box::new(once(Operation {
|
||||
query: query.to_string(),
|
||||
comment: Some(query.to_string()),
|
||||
})))
|
||||
}
|
||||
ApplySpecPhase::HandleNeonExtension => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
|
||||
8
compute_tools/src/sql/create_neon_superuser.sql
Normal file
8
compute_tools/src/sql/create_neon_superuser.sql
Normal file
@@ -0,0 +1,8 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
@@ -1,4 +1,4 @@
|
||||
DO $$
|
||||
DO ${outer_tag}$
|
||||
DECLARE
|
||||
subname TEXT;
|
||||
BEGIN
|
||||
@@ -9,4 +9,4 @@ BEGIN
|
||||
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
SET SESSION ROLE neon_superuser;
|
||||
|
||||
DO $$
|
||||
DO ${outer_tag}$
|
||||
DECLARE
|
||||
schema TEXT;
|
||||
revoke_query TEXT;
|
||||
@@ -16,13 +16,15 @@ BEGIN
|
||||
WHERE schema_name IN ('public')
|
||||
LOOP
|
||||
revoke_query := format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
|
||||
schema
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY neon_superuser;',
|
||||
schema,
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
{role_name}
|
||||
);
|
||||
|
||||
EXECUTE revoke_query;
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
|
||||
RESET ROLE;
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
DO
|
||||
$$
|
||||
DO ${outer_tag}$
|
||||
DECLARE
|
||||
schema_owner TEXT;
|
||||
BEGIN
|
||||
@@ -16,8 +15,8 @@ $$
|
||||
|
||||
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
|
||||
THEN
|
||||
ALTER SCHEMA public OWNER TO {db_owner};
|
||||
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
@@ -1,12 +1,12 @@
|
||||
DO $$
|
||||
DO ${outer_tag}$
|
||||
BEGIN
|
||||
IF EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname = {datname_str}
|
||||
WHERE datname = {datname}
|
||||
)
|
||||
THEN
|
||||
ALTER DATABASE {datname} is_template false;
|
||||
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
${outer_tag}$;
|
||||
|
||||
@@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ident_pg_quote_dollar() {
|
||||
let test_cases = vec![
|
||||
("name", ("$$name$$", "x")),
|
||||
("name$$", ("$x$name$$$x$", "xx")),
|
||||
("name$$$", ("$x$name$$$$x$", "xx")),
|
||||
("name$$$$", ("$x$name$$$$$x$", "xx")),
|
||||
("name$x$", ("$xx$name$x$$xx$", "xxx")),
|
||||
];
|
||||
|
||||
for (input, expected) in test_cases {
|
||||
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
|
||||
assert_eq!(escaped, expected.0);
|
||||
assert_eq!(tag, expected.1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn generic_options_search() {
|
||||
let generic_options: GenericOptions = Some(vec![
|
||||
|
||||
@@ -40,6 +40,7 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
|
||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
@@ -596,7 +597,15 @@ struct EndpointStartCmdArgs {
|
||||
#[clap(long = "pageserver-id")]
|
||||
endpoint_pageserver_id: Option<NodeId>,
|
||||
|
||||
#[clap(long)]
|
||||
#[clap(
|
||||
long,
|
||||
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
|
||||
)]
|
||||
safekeepers_generation: Option<u32>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
|
||||
)]
|
||||
safekeepers: Option<String>,
|
||||
|
||||
#[clap(
|
||||
@@ -617,9 +626,9 @@ struct EndpointStartCmdArgs {
|
||||
)]
|
||||
allow_multiple: bool,
|
||||
|
||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "10s")]
|
||||
start_timeout: humantime::Duration,
|
||||
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -1350,6 +1359,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let pageserver_id = args.endpoint_pageserver_id;
|
||||
let remote_ext_config = &args.remote_ext_config;
|
||||
|
||||
let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
|
||||
@@ -1425,11 +1435,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
endpoint
|
||||
.start(
|
||||
&auth_token,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
remote_ext_config.as_ref(),
|
||||
stripe_size.0 as usize,
|
||||
args.create_test_user,
|
||||
args.start_timeout,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -42,17 +42,19 @@ use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
};
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use safekeeper_api::membership::SafekeeperGeneration;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
@@ -576,14 +578,17 @@ impl Endpoint {
|
||||
Ok(safekeeper_connstrings)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
start_timeout: Duration,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -655,6 +660,7 @@ impl Endpoint {
|
||||
timeline_id: Some(self.timeline_id),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
@@ -663,6 +669,7 @@ impl Endpoint {
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -770,17 +777,18 @@ impl Endpoint {
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
|
||||
// Wait for it to start
|
||||
let mut attempt = 0;
|
||||
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
|
||||
let start_at = Instant::now();
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self.get_status().await {
|
||||
Ok(state) => {
|
||||
match state.status {
|
||||
ComputeStatus::Init => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
bail!("compute startup timed out; still in Init state");
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
bail!(
|
||||
"compute startup timed out {:?}; still in Init state",
|
||||
start_timeout
|
||||
);
|
||||
}
|
||||
// keep retrying
|
||||
}
|
||||
@@ -807,8 +815,11 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
|
||||
if Instant::now().duration_since(start_at) > start_timeout {
|
||||
return Err(e).context(format!(
|
||||
"timed out {:?} waiting to connect to compute_ctl HTTP",
|
||||
start_timeout,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,8 +6,11 @@ generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
}
|
||||
if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then
|
||||
echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined
|
||||
echo "${OLD_COMPUTE_TAG}"
|
||||
echo "${NEW_COMPUTE_TAG}"
|
||||
echo "${TEST_EXTENSIONS_TAG}"
|
||||
if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
|
||||
echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
|
||||
exit 1
|
||||
fi
|
||||
export PG_VERSION=${PG_VERSION:-16}
|
||||
@@ -82,7 +85,7 @@ EXTENSIONS='[
|
||||
{"extname": "pg_repack", "extdir": "pg_repack-src"}
|
||||
]'
|
||||
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
|
||||
COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
wait_for_ready
|
||||
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
|
||||
@@ -90,7 +93,7 @@ create_extensions "${EXTNAMES}"
|
||||
query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
|
||||
new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
|
||||
docker compose --profile test-extensions down
|
||||
COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
|
||||
COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
|
||||
wait_for_ready
|
||||
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
|
||||
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
|
||||
|
||||
201
docs/rfcs/041-rel-sparse-keyspace.md
Normal file
201
docs/rfcs/041-rel-sparse-keyspace.md
Normal file
@@ -0,0 +1,201 @@

# Sparse Keyspace for Relation Directories

## Summary

This is an RFC describing a new storage strategy for storing relation directories.

## Motivation

Postgres maintains a directory structure for databases and relations. In Neon, we store this information
by serializing the directory data into a single key (see `pgdatadir_mapping.rs`).

```rust
// DbDir:
// 00 00000000 00000000 00000000 00 00000000

// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
```

We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.

```rust
#[derive(Debug, Serialize, Deserialize, Default)]
pub(crate) struct RelDirectory {
    // Set of relations that exist. (relfilenode, forknum)
    //
    // TODO: Store it as a btree or radix tree or something else that spans multiple
    // key-value pairs, if you have a lot of relations
    pub(crate) rels: HashSet<(Oid, u8)>,
}
```

The current codebase has the following three access patterns for the relation directory.

1. Check if a relation exists.
2. List all relations.
3. Create/drop a relation.

For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
hash set. For (2), we get the reldir key and the hash set. For (3), we first need to get
and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.

If we have 100k relations in a database, we have a 100k-entry hash set. Every
relation create and drop then deserializes and serializes this 100k-entry hash set, which makes the
relation create/drop process quadratic. When we check if a relation exists on the ingestion path,
we have to deserialize this very large key before checking a single relation.
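
To make the quadratic behavior concrete, here is a minimal sketch of a v1-style write path. The
`read_key`/`write_key` helpers and the simplified `Oid` type are hypothetical stand-ins, not the actual
`pgdatadir_mapping.rs` code: the point is that every create (or drop) round-trips the entire serialized directory.

```rust
use std::collections::HashSet;

use serde::{Deserialize, Serialize};

type Oid = u32;

#[derive(Debug, Default, Serialize, Deserialize)]
struct RelDirectory {
    // Set of relations that exist: (relfilenode, forknum).
    rels: HashSet<(Oid, u8)>,
}

// A single create costs O(n) in the number of existing relations, so creating
// n relations one by one costs O(n^2) in total.
fn create_relation_v1(
    read_key: impl Fn() -> Vec<u8>,
    write_key: impl Fn(Vec<u8>),
    relnode: Oid,
    forknum: u8,
) -> bincode::Result<()> {
    let mut dir: RelDirectory = bincode::deserialize(&read_key())?; // O(n) decode
    dir.rels.insert((relnode, forknum));
    write_key(bincode::serialize(&dir)?); // O(n) encode + write
    Ok(())
}
```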

In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how
to seamlessly migrate users to the new keyspace.

The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).

## Key Mapping

We will use the recently introduced sparse keyspace to store the actual data. Sparse keyspace was proposed in
[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`)
into the key.

```plain
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
```

Assuming all reldir data are stored in this new keyspace, the 3 reldir operations we mentioned before can be
implemented as follows.

1. Check if a relation exists: check if the key maps to "exists".
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
   be removed during image layer generation upon compaction.

Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum.
The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
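
As an illustration of the mapping, the sketch below spells out a possible byte layout and the one-byte value
enum. The prefix byte, field widths, and the body of `rel_tag_sparse_key` here are assumptions for illustration;
the actual key type and encoding live in the pageserver key module and the PoC patch.

```rust
/// One-byte value stored under each per-relation sparse key.
#[repr(u8)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RelDirExists {
    Deleted = 0, // delete tombstone, dropped at image layer generation
    Exists = 1,
}

/// Hypothetical encoding of (REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1).
fn rel_tag_sparse_key(spcnode: u32, dbnode: u32, relnode: u32, forknum: u8) -> [u8; 18] {
    const REL_DIR_KEY_PREFIX: u8 = 0xA1; // assumed prefix byte
    let mut key = [0u8; 18];
    key[0] = REL_DIR_KEY_PREFIX;
    key[1..5].copy_from_slice(&spcnode.to_be_bytes());
    key[5..9].copy_from_slice(&dbnode.to_be_bytes());
    key[9..13].copy_from_slice(&relnode.to_be_bytes());
    key[13] = forknum;
    key[17] = 1; // fixed suffix from the tuple above
    key
}
```
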
## Changes to Sparse Keyspace

Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
to be updated accordingly to accommodate such "inherited sparse keys". This is done in
[PR#10313](https://github.com/neondatabase/neon/pull/10313).

## Coexistence of the Old and New Keyspaces

Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
ingestion path will start to write to the new keyspace, and the old reldir data will be kept in the old one. The read
path needs to combine the data from both keyspaces.

Theoretically, we could do a rewrite at startup time that scans all relation directories and copies that data into the
new keyspace. However, this could take a long time, especially if thousands of tenants go through the migration
process simultaneously after a pageserver restart. Therefore, we propose the coexistence strategy so that the
migration can happen seamlessly and imposes no potential downtime for the user.

With the coexistence assumption, the 3 reldir operations will be implemented as follows (see the sketch after this list):

1. Check if a relation exists
   - Check whether the key maps to any value in the new keyspace. If it maps to "exists" or "deleted", directly
     return the result to the user.
   - Otherwise, deserialize the old reldir key and get the result.
2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
   Combine them to obtain the final result.
3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation in the new keyspace.
   - We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
   - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
   - For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
     remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` into the keyspace.
   - The delete tombstone will be removed during image layer generation upon compaction.
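
As a minimal sketch of the existence check under coexistence (item 1 above): any entry in the new keyspace,
including a delete tombstone, is authoritative, and only a miss falls back to the old serialized directory.
`v2_lookup` and `v1_contains` are hypothetical stand-ins for the real read-path helpers, not actual Neon APIs.

```rust
/// Combined existence check during the coexistence phase.
/// `v2_lookup`: what the per-relation sparse key maps to, if anything
///              (`Some(true)` = exists, `Some(false)` = delete tombstone).
/// `v1_contains`: whether the relation is in the deserialized old reldir key.
fn rel_exists(
    v2_lookup: impl Fn() -> Option<bool>,
    v1_contains: impl Fn() -> bool,
) -> bool {
    match v2_lookup() {
        Some(exists) => exists, // any v2 entry, including tombstones, wins
        None => v1_contains(),  // no v2 entry: fall back to the old reldir key
    }
}
```
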
This process ensures that the transition will not introduce any downtime and that all new updates are written to the new keyspace. The total
amount of data in storage would be `O(relations_modifications)`, and we can guarantee `O(current_relations)` after compaction.
There could be some relations that stay in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal
with them. Also, relation modifications still have `O(old_relations)` complexity until we do the full migration, which brings
us to `O(1)` complexity once the sparse keyspace is fully adopted.

The process also implies that a relation will only exist either in the old reldir key or in the new sparse keyspace. It is not possible
for a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.

We will introduce a config item and an index_part record to track the current status of the migration process.

- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.

If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update
`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still
read/write the new keyspace to avoid data inconsistency. This also means that the migration is one-way only:
once v2 is enabled, the user cannot go back to v1.
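
The status handling can be summarized with a small sketch. The enum and helper below are illustrative (the
shipped implementation exposes a similar three-state status); the helper shows why flipping
`enable_reldir_v2` back off does not revert the migration.

```rust
/// Persisted migration status, stored in `index_part.json`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum RelDirV2Status {
    Legacy,    // only the old reldir key is used
    Migrating, // both keyspaces are in use
    Migrated,  // only the new sparse keyspace is used
}

/// Should the ingestion path write reldir entries into the new keyspace?
/// Once the persisted status has left `Legacy`, the config flag no longer matters.
fn write_to_v2(enable_reldir_v2: bool, persisted: RelDirV2Status) -> bool {
    enable_reldir_v2 || persisted != RelDirV2Status::Legacy
}
```
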
## Next Steps

### Full Migration

This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
v2 in the system would force us to keep the code that deserializes the old reldir key forever. To entirely deprecate this
code path, we must ensure the timeline has no old reldir data.

We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir keys while
copying their entries into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
process discovers the following keys at this LSN.

```plain
db1/reldir_key -> (table 1, table 2, table 3)
...db1 rel keys
db2/reldir_key -> (table 4, table 5, table 6)
...db2 rel keys
sparse_reldir_db2_table7 -> exists
sparse_reldir_db1_table8 -> deleted
```

It will generate the following keys:

```plain
db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
...db1 rel keys
db2/reldir_key -> ()
...db2 rel keys

-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
sparse_reldir_db1_table1 -> exists
sparse_reldir_db1_table2 -> exists
sparse_reldir_db1_table3 -> exists
sparse_reldir_db2_table4 -> exists
sparse_reldir_db2_table5 -> exists
sparse_reldir_db2_table6 -> exists
sparse_reldir_db2_table7 -> exists
-- end image layer for the sparse keyspace at sparse_reldir_prefix+1

# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
# Note that the read path will stop reading if a key is not found in the image layer covering the key range, so there
# is no correctness issue.
```

We must verify that no pending modifications to the old reldir keys exist in the delta/image layers above the gc-horizon before
we start this process (we can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
above the gc-horizon). Otherwise, it would violate the property that "a relation will only exist either in the old reldir key or
in the new sparse keyspace". After we run this migration process, we can set `reldir_v2_status` in `index_part.json` to
`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we
no longer need to add the old reldir keys to `collect_keyspace`, and therefore they will be removed from all future image layers.

The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.

### Consolidate Relation Size Keys

We store the relation size at the end of each relation's key range.

```plain
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
```

This means that computing the logical size requires several single-key gets across the keyspace,
potentially requiring downloading many layer files. We could consolidate them into a single
keyspace, improving logical size calculation performance.

### Migrate DBDir Keys

We assume the number of databases created by users will be small, and therefore the current way
of storing the database directory is acceptable. In the future, we could also migrate the DBDir keys into
the sparse keyspace to support a large number of databases.

@@ -134,8 +134,10 @@ pub struct CatalogObjects {
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct ComputeCtlConfig {
|
||||
/// Set of JSON web keys that the compute can use to authenticate
|
||||
/// communication from the control plane.
|
||||
pub jwks: JwkSet,
|
||||
}
|
||||
|
||||
|
||||
@@ -101,6 +101,17 @@ pub struct ComputeSpec {
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
/// Safekeeper membership config generation. It is put in
|
||||
/// neon.safekeepers GUC and serves two purposes:
|
||||
/// 1) Non zero value forces walproposer to use membership configurations.
|
||||
/// 2) If walproposer wants to update list of safekeepers to connect to
|
||||
/// taking them from some safekeeper mconf, it should check what value
|
||||
/// is newer by comparing the generation.
|
||||
///
|
||||
/// Note: it could be SafekeeperGeneration, but this needs linking
|
||||
/// compute_ctl with postgres_ffi.
|
||||
#[serde(default)]
|
||||
pub safekeepers_generation: Option<u32>,
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
|
||||
@@ -144,6 +155,16 @@ pub struct ComputeSpec {
|
||||
/// over the same replication content from publisher.
|
||||
#[serde(default)] // Default false
|
||||
pub drop_subscriptions_before_start: bool,
|
||||
|
||||
/// Log level for audit logging:
|
||||
///
|
||||
/// Disabled - no audit logging. This is the default.
|
||||
/// log - log masked statements to the postgres log using pgaudit extension
|
||||
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
|
||||
///
|
||||
/// Extensions should be present in shared_preload_libraries
|
||||
#[serde(default)]
|
||||
pub audit_log_level: ComputeAudit,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -251,6 +272,17 @@ pub enum ComputeMode {
|
||||
Replica,
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
/// Disabled, log, hipaa
|
||||
/// Default is Disabled
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
#[default]
|
||||
Disabled,
|
||||
Log,
|
||||
Hipaa,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: Option<String>,
|
||||
|
||||
@@ -6,11 +6,8 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
backtrace.workspace = true
|
||||
bytes.workspace = true
|
||||
inferno.workspace = true
|
||||
fail.workspace = true
|
||||
flate2.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jemalloc_pprof.workspace = true
|
||||
|
||||
@@ -3,8 +3,6 @@ use std::io::Write as _;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use ::pprof::ProfilerGuardBuilder;
|
||||
use ::pprof::protos::Message as _;
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
|
||||
@@ -12,7 +10,8 @@ use hyper::http::HeaderValue;
|
||||
use hyper::{Body, Method, Request, Response};
|
||||
use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use pprof::ProfilerGuardBuilder;
|
||||
use pprof::protos::Message as _;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::sync::{Mutex, Notify, mpsc};
|
||||
@@ -22,7 +21,6 @@ use tracing::{Instrument, debug, info, info_span, warn};
|
||||
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
|
||||
use crate::error::{ApiError, api_error_handler, route_error_handler};
|
||||
use crate::pprof;
|
||||
use crate::request::{get_query_param, parse_query_param};
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
@@ -449,20 +447,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
|
||||
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
|
||||
};
|
||||
|
||||
// Functions and mappings to strip when symbolizing pprof profiles. If true,
|
||||
// also remove child frames.
|
||||
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
|
||||
vec![
|
||||
(Regex::new("^__rust").unwrap(), false),
|
||||
(Regex::new("^_start$").unwrap(), false),
|
||||
(Regex::new("^irallocx_prof").unwrap(), true),
|
||||
(Regex::new("^prof_alloc_prep").unwrap(), true),
|
||||
(Regex::new("^std::rt::lang_start").unwrap(), false),
|
||||
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
|
||||
]
|
||||
});
|
||||
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
|
||||
|
||||
// Obtain profiler handle.
|
||||
let mut prof_ctl = jemalloc_pprof::PROF_CTL
|
||||
.as_ref()
|
||||
@@ -495,45 +479,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
|
||||
Format::Pprof => {
|
||||
let data = tokio::task::spawn_blocking(move || {
|
||||
let bytes = prof_ctl.dump_pprof()?;
|
||||
// Symbolize the profile.
|
||||
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
|
||||
// serialization roundtrip.
|
||||
let profile = pprof::decode(&bytes)?;
|
||||
let profile = pprof::symbolize(profile)?;
|
||||
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
|
||||
pprof::encode(&profile)
|
||||
})
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, "application/octet-stream")
|
||||
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
|
||||
.header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
|
||||
.body(Body::from(data))
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))
|
||||
}
|
||||
|
||||
Format::Svg => {
|
||||
let body = tokio::task::spawn_blocking(move || {
|
||||
let bytes = prof_ctl.dump_pprof()?;
|
||||
let profile = pprof::decode(&bytes)?;
|
||||
let profile = pprof::symbolize(profile)?;
|
||||
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
|
||||
let mut opts = inferno::flamegraph::Options::default();
|
||||
opts.title = "Heap inuse".to_string();
|
||||
opts.count_name = "bytes".to_string();
|
||||
pprof::flamegraph(profile, &mut opts)
|
||||
})
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, "image/svg+xml")
|
||||
.body(Body::from(body))
|
||||
.body(Body::from(svg))
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ pub mod endpoint;
|
||||
pub mod error;
|
||||
pub mod failpoints;
|
||||
pub mod json;
|
||||
pub mod pprof;
|
||||
pub mod request;
|
||||
|
||||
extern crate hyper0 as hyper;
|
||||
|
||||
@@ -1,238 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::ffi::c_void;
|
||||
use std::io::Write as _;
|
||||
|
||||
use anyhow::bail;
|
||||
use flate2::Compression;
|
||||
use flate2::write::{GzDecoder, GzEncoder};
|
||||
use itertools::Itertools as _;
|
||||
use pprof::protos::{Function, Line, Location, Message as _, Profile};
|
||||
use regex::Regex;
|
||||
|
||||
/// Decodes a gzip-compressed Protobuf-encoded pprof profile.
|
||||
pub fn decode(bytes: &[u8]) -> anyhow::Result<Profile> {
|
||||
let mut gz = GzDecoder::new(Vec::new());
|
||||
gz.write_all(bytes)?;
|
||||
Ok(Profile::parse_from_bytes(&gz.finish()?)?)
|
||||
}
|
||||
|
||||
/// Encodes a pprof profile as gzip-compressed Protobuf.
|
||||
pub fn encode(profile: &Profile) -> anyhow::Result<Vec<u8>> {
|
||||
let mut gz = GzEncoder::new(Vec::new(), Compression::default());
|
||||
profile.write_to_writer(&mut gz)?;
|
||||
Ok(gz.finish()?)
|
||||
}
|
||||
|
||||
/// Symbolizes a pprof profile using the current binary.
|
||||
pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
|
||||
if !profile.function.is_empty() {
|
||||
return Ok(profile); // already symbolized
|
||||
}
|
||||
|
||||
// Collect function names.
|
||||
let mut functions: HashMap<String, Function> = HashMap::new();
|
||||
let mut strings: HashMap<String, i64> = profile
|
||||
.string_table
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(i, s)| (s, i as i64))
|
||||
.collect();
|
||||
|
||||
// Helper to look up or register a string.
|
||||
let mut string_id = |s: &str| -> i64 {
|
||||
// Don't use .entry() to avoid unnecessary allocations.
|
||||
if let Some(id) = strings.get(s) {
|
||||
return *id;
|
||||
}
|
||||
let id = strings.len() as i64;
|
||||
strings.insert(s.to_string(), id);
|
||||
id
|
||||
};
|
||||
|
||||
for loc in &mut profile.location {
|
||||
if !loc.line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Resolve the line and function for each location.
|
||||
backtrace::resolve(loc.address as *mut c_void, |symbol| {
|
||||
let Some(symbol_name) = symbol.name() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let function_name = format!("{symbol_name:#}");
|
||||
let functions_len = functions.len();
|
||||
let function_id = functions
|
||||
.entry(function_name)
|
||||
.or_insert_with_key(|function_name| {
|
||||
let function_id = functions_len as u64 + 1;
|
||||
let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
|
||||
let filename = symbol
|
||||
.filename()
|
||||
.map(|path| path.to_string_lossy())
|
||||
.unwrap_or(Cow::Borrowed(""));
|
||||
Function {
|
||||
id: function_id,
|
||||
name: string_id(function_name),
|
||||
system_name: string_id(&system_name),
|
||||
filename: string_id(&filename),
|
||||
..Default::default()
|
||||
}
|
||||
})
|
||||
.id;
|
||||
loc.line.push(Line {
|
||||
function_id,
|
||||
line: symbol.lineno().unwrap_or(0) as i64,
|
||||
..Default::default()
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Store the resolved functions, and mark the mapping as resolved.
|
||||
profile.function = functions.into_values().sorted_by_key(|f| f.id).collect();
|
||||
profile.string_table = strings
|
||||
.into_iter()
|
||||
.sorted_by_key(|(_, i)| *i)
|
||||
.map(|(s, _)| s)
|
||||
.collect();
|
||||
|
||||
for mapping in &mut profile.mapping {
|
||||
mapping.has_functions = true;
|
||||
mapping.has_filenames = true;
|
||||
}
|
||||
|
||||
Ok(profile)
|
||||
}
|
||||
|
||||
/// Strips locations (stack frames) matching the given mappings (substring) or function names
|
||||
/// (regex). The function bool specifies whether child frames should be stripped as well.
|
||||
///
|
||||
/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all
|
||||
/// string references.
|
||||
pub fn strip_locations(
|
||||
mut profile: Profile,
|
||||
mappings: &[&str],
|
||||
functions: &[(Regex, bool)],
|
||||
) -> Profile {
|
||||
// Strip mappings.
|
||||
let mut strip_mappings: HashSet<u64> = HashSet::new();
|
||||
|
||||
profile.mapping.retain(|mapping| {
|
||||
let Some(name) = profile.string_table.get(mapping.filename as usize) else {
|
||||
return true;
|
||||
};
|
||||
if mappings.iter().any(|substr| name.contains(substr)) {
|
||||
strip_mappings.insert(mapping.id);
|
||||
return false;
|
||||
}
|
||||
true
|
||||
});
|
||||
|
||||
// Strip functions.
|
||||
let mut strip_functions: HashMap<u64, bool> = HashMap::new();
|
||||
|
||||
profile.function.retain(|function| {
|
||||
let Some(name) = profile.string_table.get(function.name as usize) else {
|
||||
return true;
|
||||
};
|
||||
for (regex, strip_children) in functions {
|
||||
if regex.is_match(name) {
|
||||
strip_functions.insert(function.id, *strip_children);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
});
|
||||
|
||||
// Strip locations. The bool specifies whether child frames should be stripped too.
|
||||
let mut strip_locations: HashMap<u64, bool> = HashMap::new();
|
||||
|
||||
profile.location.retain(|location| {
|
||||
for line in &location.line {
|
||||
if let Some(strip_children) = strip_functions.get(&line.function_id) {
|
||||
strip_locations.insert(location.id, *strip_children);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if strip_mappings.contains(&location.mapping_id) {
|
||||
strip_locations.insert(location.id, false);
|
||||
return false;
|
||||
}
|
||||
true
|
||||
});
|
||||
|
||||
// Strip sample locations.
|
||||
for sample in &mut profile.sample {
|
||||
// First, find the uppermost function with child removal and truncate the stack.
|
||||
if let Some(truncate) = sample
|
||||
.location_id
|
||||
.iter()
|
||||
.rposition(|id| strip_locations.get(id) == Some(&true))
|
||||
{
|
||||
sample.location_id.drain(..=truncate);
|
||||
}
|
||||
// Next, strip any individual frames without child removal.
|
||||
sample
|
||||
.location_id
|
||||
.retain(|id| !strip_locations.contains_key(id));
|
||||
}
|
||||
|
||||
profile
|
||||
}
|
||||
|
||||
/// Generates an SVG flamegraph from a symbolized pprof profile.
|
||||
pub fn flamegraph(
|
||||
profile: Profile,
|
||||
opts: &mut inferno::flamegraph::Options,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
if profile.mapping.iter().any(|m| !m.has_functions) {
|
||||
bail!("profile not symbolized");
|
||||
}
|
||||
|
||||
// Index locations, functions, and strings.
|
||||
let locations: HashMap<u64, Location> =
|
||||
profile.location.into_iter().map(|l| (l.id, l)).collect();
|
||||
let functions: HashMap<u64, Function> =
|
||||
profile.function.into_iter().map(|f| (f.id, f)).collect();
|
||||
let strings = profile.string_table;
|
||||
|
||||
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
|
||||
// since inferno expects it bottom-up.
|
||||
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
|
||||
for sample in profile.sample {
|
||||
let mut stack = Vec::with_capacity(sample.location_id.len());
|
||||
for location in sample.location_id.into_iter().rev() {
|
||||
let Some(location) = locations.get(&location) else {
|
||||
bail!("missing location {location}");
|
||||
};
|
||||
for line in location.line.iter().rev() {
|
||||
let Some(function) = functions.get(&line.function_id) else {
|
||||
bail!("missing function {}", line.function_id);
|
||||
};
|
||||
let Some(name) = strings.get(function.name as usize) else {
|
||||
bail!("missing string {}", function.name);
|
||||
};
|
||||
stack.push(name.as_str());
|
||||
}
|
||||
}
|
||||
let Some(&value) = sample.value.first() else {
|
||||
bail!("missing value");
|
||||
};
|
||||
*stacks.entry(stack).or_default() += value;
|
||||
}
|
||||
|
||||
// Construct stack lines for inferno.
|
||||
let lines = stacks
|
||||
.into_iter()
|
||||
.map(|(stack, value)| (stack.into_iter().join(";"), value))
|
||||
.map(|(stack, value)| format!("{stack} {value}"))
|
||||
.sorted()
|
||||
.collect_vec();
|
||||
|
||||
// Construct the flamegraph.
|
||||
let mut bytes = Vec::new();
|
||||
let lines = lines.iter().map(|line| line.as_str());
|
||||
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
|
||||
Ok(bytes)
|
||||
}
|
||||
@@ -123,6 +123,10 @@ pub struct ConfigToml {
|
||||
pub enable_read_path_debugging: Option<bool>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub validate_wal_contiguity: Option<bool>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub load_previous_heatmap: Option<bool>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generate_unarchival_heatmap: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -523,6 +527,8 @@ impl Default for ConfigToml {
|
||||
None
|
||||
},
|
||||
validate_wal_contiguity: None,
|
||||
load_previous_heatmap: None,
|
||||
generate_unarchival_heatmap: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1165,6 +1165,21 @@ pub struct OffloadedTimelineInfo {
|
||||
pub archived_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum RelSizeMigration {
|
||||
/// The tenant is using the old rel_size format.
|
||||
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
|
||||
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
|
||||
Legacy,
|
||||
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
|
||||
/// persisted in the index part. The read path will read both formats and merge them.
|
||||
Migrating,
|
||||
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
|
||||
/// in the index part, and the read path will not read the old format.
|
||||
Migrated,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
@@ -1243,7 +1258,11 @@ pub struct TimelineInfo {
|
||||
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
|
||||
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
|
||||
// read.
|
||||
/// Whether the timeline is archived.
|
||||
pub is_archived: Option<bool>,
|
||||
|
||||
/// The status of the rel_size migration.
|
||||
pub rel_size_migration: Option<RelSizeMigration>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -34,8 +34,13 @@ where
|
||||
.make_tls_connect(hostname)
|
||||
.map_err(|e| Error::tls(e.into()))?;
|
||||
|
||||
let socket =
|
||||
connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
|
||||
let socket = connect_socket::connect_socket(
|
||||
config.host_addr,
|
||||
&config.host,
|
||||
config.port,
|
||||
config.connect_timeout,
|
||||
)
|
||||
.await?;
|
||||
|
||||
cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Duration;
|
||||
@@ -137,6 +138,7 @@ impl InnerClient {
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct SocketConfig {
|
||||
pub host_addr: Option<IpAddr>,
|
||||
pub host: Host,
|
||||
pub port: u16,
|
||||
pub connect_timeout: Option<Duration>,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! Connection configuration.
|
||||
|
||||
use std::net::IpAddr;
|
||||
use std::time::Duration;
|
||||
use std::{fmt, str};
|
||||
|
||||
@@ -65,6 +66,7 @@ pub enum AuthKeys {
|
||||
/// Connection configuration.
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct Config {
|
||||
pub(crate) host_addr: Option<IpAddr>,
|
||||
pub(crate) host: Host,
|
||||
pub(crate) port: u16,
|
||||
|
||||
@@ -83,6 +85,7 @@ impl Config {
|
||||
/// Creates a new configuration.
|
||||
pub fn new(host: String, port: u16) -> Config {
|
||||
Config {
|
||||
host_addr: None,
|
||||
host: Host::Tcp(host),
|
||||
port,
|
||||
password: None,
|
||||
@@ -163,6 +166,15 @@ impl Config {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config {
|
||||
self.host_addr = Some(addr);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn get_host_addr(&self) -> Option<IpAddr> {
|
||||
self.host_addr
|
||||
}
|
||||
|
||||
/// Sets the SSL configuration.
|
||||
///
|
||||
/// Defaults to `prefer`.
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::net::IpAddr;
|
||||
|
||||
use postgres_protocol2::message::backend::Message;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::sync::mpsc;
|
||||
@@ -25,13 +27,14 @@ where
|
||||
.make_tls_connect(hostname)
|
||||
.map_err(|e| Error::tls(e.into()))?;
|
||||
|
||||
match connect_once(&config.host, config.port, tls, config).await {
|
||||
match connect_once(config.host_addr, &config.host, config.port, tls, config).await {
|
||||
Ok((client, connection)) => Ok((client, connection)),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
async fn connect_once<T>(
|
||||
host_addr: Option<IpAddr>,
|
||||
host: &Host,
|
||||
port: u16,
|
||||
tls: T,
|
||||
@@ -40,7 +43,7 @@ async fn connect_once<T>(
|
||||
where
|
||||
T: TlsConnect<TcpStream>,
|
||||
{
|
||||
let socket = connect_socket(host, port, config.connect_timeout).await?;
|
||||
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
|
||||
let RawConnection {
|
||||
stream,
|
||||
parameters,
|
||||
@@ -50,6 +53,7 @@ where
|
||||
} = connect_raw(socket, tls, config).await?;
|
||||
|
||||
let socket_config = SocketConfig {
|
||||
host_addr,
|
||||
host: host.clone(),
|
||||
port,
|
||||
connect_timeout: config.connect_timeout,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::future::Future;
|
||||
use std::io;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::net::{self, TcpStream};
|
||||
@@ -9,15 +10,20 @@ use crate::Error;
|
||||
use crate::config::Host;
|
||||
|
||||
pub(crate) async fn connect_socket(
|
||||
host_addr: Option<IpAddr>,
|
||||
host: &Host,
|
||||
port: u16,
|
||||
connect_timeout: Option<Duration>,
|
||||
) -> Result<TcpStream, Error> {
|
||||
match host {
|
||||
Host::Tcp(host) => {
|
||||
let addrs = net::lookup_host((&**host, port))
|
||||
.await
|
||||
.map_err(Error::connect)?;
|
||||
let addrs = match host_addr {
|
||||
Some(addr) => vec![SocketAddr::new(addr, port)],
|
||||
None => net::lookup_host((&**host, port))
|
||||
.await
|
||||
.map_err(Error::connect)?
|
||||
.collect(),
|
||||
};
|
||||
|
||||
let mut last_err = None;
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
async-compression.workspace = true
|
||||
anyhow.workspace = true
|
||||
backtrace.workspace = true
|
||||
bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
|
||||
@@ -3,20 +3,24 @@ use std::env;
|
||||
|
||||
use sentry::ClientInitGuard;
|
||||
pub use sentry::release_name;
|
||||
use tracing::{error, info};
|
||||
|
||||
#[must_use]
|
||||
pub fn init_sentry(
|
||||
release_name: Option<Cow<'static, str>>,
|
||||
extra_options: &[(&str, &str)],
|
||||
) -> Option<ClientInitGuard> {
|
||||
let dsn = env::var("SENTRY_DSN").ok()?;
|
||||
let Ok(dsn) = env::var("SENTRY_DSN") else {
|
||||
info!("not initializing Sentry, no SENTRY_DSN given");
|
||||
return None;
|
||||
};
|
||||
let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into());
|
||||
|
||||
let guard = sentry::init((
|
||||
dsn,
|
||||
sentry::ClientOptions {
|
||||
release: release_name,
|
||||
environment: Some(environment.into()),
|
||||
release: release_name.clone(),
|
||||
environment: Some(environment.clone().into()),
|
||||
..Default::default()
|
||||
},
|
||||
));
|
||||
@@ -25,5 +29,19 @@ pub fn init_sentry(
|
||||
scope.set_extra(key, value.into());
|
||||
}
|
||||
});
|
||||
|
||||
if let Some(dsn) = guard.dsn() {
|
||||
info!(
|
||||
"initialized Sentry for project {}, environment {}, release {} (using API {})",
|
||||
dsn.project_id(),
|
||||
environment,
|
||||
release_name.unwrap_or(Cow::Borrowed("None")),
|
||||
dsn.envelope_api_url(),
|
||||
);
|
||||
} else {
|
||||
// This should panic during sentry::init(), but we may as well cover it.
|
||||
error!("failed to initialize Sentry, invalid DSN");
|
||||
}
|
||||
|
||||
Some(guard)
|
||||
}
|
||||
|
||||
@@ -98,6 +98,7 @@ criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
indoc.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -7,7 +7,6 @@ use std::time::Instant;
|
||||
|
||||
use criterion::measurement::WallTime;
|
||||
use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
|
||||
use pageserver_api::key::Key;
|
||||
@@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Construct a partitioning for testing get_difficulty map when we
|
||||
// don't have an exact result of `collect_keyspace` to work with.
|
||||
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
|
||||
let mut parts = Vec::new();
|
||||
|
||||
// We add a partition boundary at the start of each image layer,
|
||||
// no matter what lsn range it covers. This is just the easiest
|
||||
// thing to do. A better thing to do would be to get a real
|
||||
// partitioning from some database. Even better, remove the need
|
||||
// for key partitions by deciding where to create image layers
|
||||
// directly based on a coverage-based difficulty map.
|
||||
let mut keys: Vec<_> = layer_map
|
||||
.iter_historic_layers()
|
||||
.filter_map(|l| {
|
||||
if l.is_incremental() {
|
||||
None
|
||||
} else {
|
||||
let kr = l.get_key_range();
|
||||
Some(kr.start.next())
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
keys.sort();
|
||||
|
||||
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
for key in keys {
|
||||
parts.push(KeySpace {
|
||||
ranges: vec![current_key..key],
|
||||
});
|
||||
current_key = key;
|
||||
}
|
||||
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
// Benchmark using metadata extracted from our performance test environment, from
|
||||
// a project where we have run pgbench many times. The pgbench database was initialized
|
||||
// between each test run.
|
||||
@@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) {
|
||||
// Choose uniformly distributed queries
|
||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
|
||||
|
||||
// Choose inputs for get_difficulty_map
|
||||
let latest_lsn = layer_map
|
||||
.iter_historic_layers()
|
||||
.map(|l| l.get_lsn_range().end)
|
||||
.max()
|
||||
.unwrap();
|
||||
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
|
||||
|
||||
// Check correctness of get_difficulty_map
|
||||
// TODO put this in a dedicated test outside of this mod
|
||||
{
|
||||
println!("running correctness check");
|
||||
|
||||
let now = Instant::now();
|
||||
let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
|
||||
assert!(result_bruteforce.len() == partitioning.parts.len());
|
||||
println!("Finished bruteforce in {:?}", now.elapsed());
|
||||
|
||||
let now = Instant::now();
|
||||
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
|
||||
assert!(result_fast.len() == partitioning.parts.len());
|
||||
println!("Finished fast in {:?}", now.elapsed());
|
||||
|
||||
// Assert results are equal. Manually iterate for easier debugging.
|
||||
let zip = std::iter::zip(
|
||||
&partitioning.parts,
|
||||
std::iter::zip(result_bruteforce, result_fast),
|
||||
);
|
||||
for (_part, (bruteforce, fast)) in zip {
|
||||
assert_eq!(bruteforce, fast);
|
||||
}
|
||||
|
||||
println!("No issues found");
|
||||
}
|
||||
|
||||
// Define and name the benchmark function
|
||||
let mut group = c.benchmark_group("real_map");
|
||||
group.bench_function("uniform_queries", |b| {
|
||||
@@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) {
|
||||
}
|
||||
});
|
||||
});
|
||||
group.bench_function("get_difficulty_map", |b| {
|
||||
b.iter(|| {
|
||||
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
|
||||
});
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
|
||||
@@ -33,8 +33,9 @@ use utils::lsn::Lsn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BasebackupError {
|
||||
@@ -42,6 +43,26 @@ pub enum BasebackupError {
|
||||
Server(#[from] anyhow::Error),
|
||||
#[error("basebackup client error {0:#} when {1}")]
|
||||
Client(#[source] io::Error, &'static str),
|
||||
#[error("basebackup during shutdown")]
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for BasebackupError {
|
||||
fn from(value: PageReconstructError) -> Self {
|
||||
match value {
|
||||
PageReconstructError::Cancelled => BasebackupError::Shutdown,
|
||||
err => BasebackupError::Server(err.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetVectoredError> for BasebackupError {
|
||||
fn from(value: GetVectoredError) -> Self {
|
||||
match value {
|
||||
GetVectoredError::Cancelled => BasebackupError::Shutdown,
|
||||
err => BasebackupError::Server(err.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create basebackup with non-rel data in it.
|
||||
@@ -127,7 +148,7 @@ where
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?,
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
),
|
||||
};
|
||||
basebackup
|
||||
@@ -323,8 +344,7 @@ where
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.await?
|
||||
.partition(
|
||||
self.timeline.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
@@ -336,11 +356,10 @@ where
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
let block = block?;
|
||||
slru_builder.add_block(&key, block).await?;
|
||||
}
|
||||
}
|
||||
@@ -349,11 +368,8 @@ where
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in self
|
||||
.timeline
|
||||
.list_dbdirs(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
{
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
@@ -362,8 +378,7 @@ where
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
// contents of UNLOGGED relations. Postgres copies it in
|
||||
@@ -391,8 +406,7 @@ where
|
||||
let aux_files = self
|
||||
.timeline
|
||||
.list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
let aux_scan_time = start_time.elapsed();
|
||||
let aux_estimated_size = aux_files
|
||||
.values()
|
||||
@@ -451,16 +465,14 @@ where
|
||||
for xid in self
|
||||
.timeline
|
||||
.list_twophase_files(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.await?
|
||||
{
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
let repl_origins = self
|
||||
.timeline
|
||||
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
let n_origins = repl_origins.len();
|
||||
if n_origins != 0 {
|
||||
//
|
||||
@@ -505,8 +517,7 @@ where
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
if nblocks == 0 {
|
||||
@@ -532,8 +543,7 @@ where
|
||||
// TODO: investigate using get_vectored for the entire startblk..endblk range.
|
||||
// But this code path is not on the critical path for most basebackups (?).
|
||||
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
@@ -567,8 +577,7 @@ where
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
if img.len()
|
||||
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
|
||||
@@ -622,8 +631,7 @@ where
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
return Ok(());
|
||||
@@ -674,8 +682,7 @@ where
|
||||
let img = self
|
||||
.timeline
|
||||
.get_twophase_file(xid, self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
|
||||
@@ -194,6 +194,13 @@ pub struct PageServerConf {
|
||||
/// Interpreted protocol feature: if enabled, validate that the logical WAL received from
|
||||
/// safekeepers does not have gaps.
|
||||
pub validate_wal_contiguity: bool,
|
||||
|
||||
/// When set, the previously written to disk heatmap is loaded on tenant attach and used
|
||||
/// to avoid clobbering the heatmap from new, cold, attached locations.
|
||||
pub load_previous_heatmap: bool,
|
||||
|
||||
/// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
|
||||
pub generate_unarchival_heatmap: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -358,6 +365,8 @@ impl PageServerConf {
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
validate_wal_contiguity,
|
||||
load_previous_heatmap,
|
||||
generate_unarchival_heatmap,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -447,6 +456,8 @@ impl PageServerConf {
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
|
||||
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
|
||||
load_previous_heatmap: load_previous_heatmap.unwrap_or(false),
|
||||
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
@@ -480,7 +491,9 @@ impl PageServerConf {
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
|
||||
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
|
||||
|
||||
let test_id = uuid::Uuid::new_v4();
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
|
||||
}
|
||||
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
@@ -493,6 +506,8 @@ impl PageServerConf {
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
load_previous_heatmap: Some(true),
|
||||
generate_unarchival_heatmap: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
|
||||
@@ -842,6 +842,12 @@ paths:
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
- name: recurse
|
||||
description: When set, will recurse with the downloads into ancestor timelines
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
post:
|
||||
description: |
|
||||
Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
|
||||
|
||||
@@ -481,6 +481,7 @@ async fn build_timeline_info_common(
|
||||
|
||||
state,
|
||||
is_archived: Some(is_archived),
|
||||
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
|
||||
|
||||
walreceiver_status,
|
||||
};
|
||||
|
||||
@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_per_read_batch_global",
|
||||
"Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
|
||||
vec![
|
||||
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_per_read_amortized_global",
|
||||
"Layers visited to serve a single read (read amplification). Amortized across a batch: \
|
||||
all visited layers are divided by number of reads.",
|
||||
vec![
|
||||
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
// We expect this to be low because of Postgres checkpoints. Let's see if that holds.
|
||||
register_histogram!(
|
||||
@@ -4074,6 +4097,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
// histograms
|
||||
[
|
||||
&LAYERS_PER_READ_GLOBAL,
|
||||
&LAYERS_PER_READ_BATCH_GLOBAL,
|
||||
&LAYERS_PER_READ_AMORTIZED_GLOBAL,
|
||||
&DELTAS_PER_READ_GLOBAL,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
|
||||
@@ -392,10 +392,6 @@ impl TimelineHandles {
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
timeline::handle::GetError::TenantManager(e) => e,
|
||||
timeline::handle::GetError::TimelineGateClosed => {
|
||||
trace!("timeline gate closed");
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
}
|
||||
timeline::handle::GetError::PerTimelineStateShutDown => {
|
||||
trace!("per-timeline state shut down");
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
@@ -422,24 +418,33 @@ pub(crate) struct TenantManagerTypes;
impl timeline::handle::Types for TenantManagerTypes {
    type TenantManagerError = GetActiveTimelineError;
    type TenantManager = TenantManagerWrapper;
    type Timeline = Arc<Timeline>;
    type Timeline = TenantManagerCacheItem;
}

impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
    fn gate(&self) -> &utils::sync::gate::Gate {
        &self.gate
    }
pub(crate) struct TenantManagerCacheItem {
    pub(crate) timeline: Arc<Timeline>,
    #[allow(dead_code)] // we store it to keep the gate open
    pub(crate) gate_guard: GateGuard,
}

impl std::ops::Deref for TenantManagerCacheItem {
    type Target = Arc<Timeline>;
    fn deref(&self) -> &Self::Target {
        &self.timeline
    }
}

impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
    fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
        Timeline::shard_timeline_id(self)
        Timeline::shard_timeline_id(&self.timeline)
    }

    fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
        &self.handles
        &self.timeline.handles
    }

    fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
        Timeline::get_shard_identity(self)
        Timeline::get_shard_identity(&self.timeline)
    }
}
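The cache item bundles the `Arc<Timeline>` with a gate guard so the timeline cannot complete shutdown while a cached handle exists, and the `Deref` impl keeps existing call sites unchanged. A hedged, self-contained sketch of the same RAII-plus-Deref pattern (the `Gate`/`GateGuard` types below are simplified stand-ins, not the real `utils::sync::gate` API):

```rust
// Sketch of "bundle a resource with an RAII guard and expose Deref", as in
// TenantManagerCacheItem. All types here are illustrative stand-ins.
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};

struct Gate(Arc<AtomicUsize>); // counts open guards
struct GateGuard(Arc<AtomicUsize>);

impl Gate {
    fn enter(&self) -> GateGuard {
        self.0.fetch_add(1, Ordering::SeqCst);
        GateGuard(self.0.clone())
    }
}
impl Drop for GateGuard {
    fn drop(&mut self) {
        self.0.fetch_sub(1, Ordering::SeqCst);
    }
}

struct Timeline {
    gate: Gate,
    name: String,
}

/// Cached handle: keeps the gate open for as long as the cache entry lives.
struct CacheItem {
    timeline: Arc<Timeline>,
    _gate_guard: GateGuard,
}

impl std::ops::Deref for CacheItem {
    type Target = Arc<Timeline>;
    fn deref(&self) -> &Self::Target {
        &self.timeline
    }
}

fn main() {
    let timeline = Arc::new(Timeline {
        gate: Gate(Arc::new(AtomicUsize::new(0))),
        name: "tl".into(),
    });
    let guard = timeline.gate.enter();
    let item = CacheItem { timeline: timeline.clone(), _gate_guard: guard };
    // Deref lets callers keep treating the cache item as the Arc<Timeline>.
    println!("{}", item.name);
}
```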
@@ -448,7 +453,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        &self,
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
    ) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
        let tenant_id = self.tenant_id.get().expect("we set this in get()");
        let timeout = ACTIVE_TENANT_TIMEOUT;
        let wait_start = Instant::now();
@@ -491,7 +496,20 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        let timeline = tenant_shard
            .get_timeline(timeline_id, true)
            .map_err(GetActiveTimelineError::Timeline)?;
        Ok(timeline)

        let gate_guard = match timeline.gate.enter() {
            Ok(guard) => guard,
            Err(_) => {
                return Err(GetActiveTimelineError::Timeline(
                    GetTimelineError::ShuttingDown,
                ));
            }
        };

        Ok(TenantManagerCacheItem {
            timeline,
            gate_guard,
        })
    }
}
@@ -2095,6 +2113,7 @@ impl PageServerHandler {
            // TODO: passthrough the error site to the final error message?
            BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
            BasebackupError::Server(e) => QueryError::Other(e),
            BasebackupError::Shutdown => QueryError::Shutdown,
        }
    }
@@ -21,6 +21,7 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
        // Otherwise, read the old reldir keyspace.
        // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.

        if self.get_rel_size_v2_enabled() {
        if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
            self.get_rel_size_v2_status()
        {
            // fetch directory listing (new)
            let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
            let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
            forknum: *forknum,
        }));

        if !self.get_rel_size_v2_enabled() {
        if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
            return Ok(rels_v1);
        }
@@ -599,28 +602,36 @@ impl Timeline {
|
||||
let n_blocks = self
|
||||
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for blkno in 0..n_blocks {
|
||||
let block = self
|
||||
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
|
||||
.await?;
|
||||
segment.extend_from_slice(&block[..BLCKSZ as usize]);
|
||||
}
|
||||
Ok(segment.freeze())
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
pub(crate) async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn, ctx).await
|
||||
let keyspace = KeySpace::single(
|
||||
slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
|
||||
);
|
||||
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
);
|
||||
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for batch in batches.parts {
|
||||
let blocks = self
|
||||
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, block) in blocks {
|
||||
let block = block?;
|
||||
segment.extend_from_slice(&block[..BLCKSZ as usize]);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(segment.freeze())
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
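The SLRU hunk above replaces per-block `get_slru_page_at_lsn` calls with one keyspace that is partitioned into batches of at most `MAX_GET_VECTORED_KEYS * BLCKSZ` bytes and served by `get_vectored`. A hedged sketch of just the batching arithmetic, with plain stand-in types rather than pageserver APIs:

```rust
// Illustrative sketch of the vectored SLRU read's batching idea: split a
// contiguous block range into fixed-size batches and stitch the results into
// one buffer. Constants and the read function are assumptions for the sketch.
const BLCKSZ: usize = 8192;
const MAX_GET_VECTORED_KEYS: usize = 32; // assumed batch size

fn read_block(blkno: u32) -> Vec<u8> {
    vec![blkno as u8; BLCKSZ] // stand-in for a real vectored read
}

fn read_segment(n_blocks: u32) -> Vec<u8> {
    let mut segment = Vec::with_capacity(n_blocks as usize * BLCKSZ);
    // Each batch covers at most MAX_GET_VECTORED_KEYS blocks, mirroring
    // `keyspace.partition(.., MAX_GET_VECTORED_KEYS * BLCKSZ)` in the diff.
    for batch_start in (0..n_blocks).step_by(MAX_GET_VECTORED_KEYS) {
        let batch_end = (batch_start + MAX_GET_VECTORED_KEYS as u32).min(n_blocks);
        for blkno in batch_start..batch_end {
            segment.extend_from_slice(&read_block(blkno)[..BLCKSZ]);
        }
    }
    segment
}

fn main() {
    assert_eq!(read_segment(70).len(), 70 * BLCKSZ);
}
```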
@@ -829,19 +840,41 @@ impl Timeline {
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = self
|
||||
.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
|
||||
|
||||
let keyspace = KeySpace::single(
|
||||
slru_block_to_key(SlruKind::Clog, segno, 0)
|
||||
..slru_block_to_key(SlruKind::Clog, segno, nblocks),
|
||||
);
|
||||
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
);
|
||||
|
||||
for batch in batches.parts.into_iter().rev() {
|
||||
let blocks = self
|
||||
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
for (_key, clog_page) in blocks.into_iter().rev() {
|
||||
let clog_page = clog_page?;
|
||||
|
||||
match f(timestamp) {
|
||||
ControlFlow::Break(b) => return Ok(b),
|
||||
ControlFlow::Continue(()) => (),
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
|
||||
match f(timestamp) {
|
||||
ControlFlow::Break(b) => return Ok(b),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
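The CLOG scan above now walks the segment's batches newest-first and, for pages that carry an appended commit timestamp (length `BLCKSZ + 8`), decodes a big-endian value from the trailing eight bytes. A hedged sketch of that decoding step in isolation (BLCKSZ and the `i64` timestamp type are assumptions matching the surrounding code):

```rust
// Sketch of the trailing-timestamp extraction used in the CLOG scan: pages with
// a timestamp are BLCKSZ + 8 bytes long, big-endian timestamp in the last 8 bytes.
const BLCKSZ: usize = 8192;

fn trailing_timestamp(clog_page: &[u8]) -> Option<i64> {
    if clog_page.len() == BLCKSZ + 8 {
        let mut timestamp_bytes = [0u8; 8];
        timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ..]);
        Some(i64::from_be_bytes(timestamp_bytes))
    } else {
        None
    }
}

fn main() {
    let mut page = vec![0u8; BLCKSZ + 8];
    page[BLCKSZ..].copy_from_slice(&42i64.to_be_bytes());
    assert_eq!(trailing_timestamp(&page), Some(42));
    assert_eq!(trailing_timestamp(&vec![0u8; BLCKSZ]), None); // plain page: no timestamp
}
```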
@@ -1720,6 +1753,35 @@ impl DatadirModification<'_> {
        Ok(())
    }

    /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
    /// we enable it, we also need to persist it in `index_part.json`.
    pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
        let status = self.tline.get_rel_size_v2_status();
        let config = self.tline.get_rel_size_v2_enabled();
        match (config, status) {
            (false, RelSizeMigration::Legacy) => {
                // tenant config didn't enable it and we didn't write any reldir_v2 key yet
                Ok(false)
            }
            (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
                // index_part already persisted that the timeline has enabled rel_size_v2
                Ok(true)
            }
            (true, RelSizeMigration::Legacy) => {
                // The first time we enable it, we need to persist it in `index_part.json`
                self.tline
                    .update_rel_size_v2_status(RelSizeMigration::Migrating)?;
                tracing::info!("enabled rel_size_v2");
                Ok(true)
            }
            (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
                // index_part already persisted that the timeline has enabled rel_size_v2
                // and we don't need to do anything
                Ok(true)
            }
        }
    }
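The function is effectively a two-input state machine over the tenant config flag and the persisted migration status. A standalone sketch of that decision table, with the persistence side effect modeled as a returned value so the example compiles on its own (the enum is re-declared here for illustration only):

```rust
// Sketch of the (config, persisted status) decision table implemented by
// `maybe_enable_rel_size_v2` above; "persist Migrating" is modeled as a return value.
#[derive(Clone, Copy, Debug, PartialEq)]
enum RelSizeMigration {
    Legacy,
    Migrating,
    Migrated,
}

/// Returns (v2 write path enabled, new status to persist if any).
fn decide(config_enabled: bool, status: RelSizeMigration) -> (bool, Option<RelSizeMigration>) {
    use RelSizeMigration::*;
    match (config_enabled, status) {
        (false, Legacy) => (false, None),
        (false, Migrating | Migrated) => (true, None), // already persisted: keep using v2
        (true, Legacy) => (true, Some(Migrating)),     // first enablement: persist it
        (true, Migrating | Migrated) => (true, None),
    }
}

fn main() {
    assert_eq!(decide(true, RelSizeMigration::Legacy), (true, Some(RelSizeMigration::Migrating)));
    assert_eq!(decide(false, RelSizeMigration::Migrated), (true, None));
    assert_eq!(decide(false, RelSizeMigration::Legacy), (false, None));
}
```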
    /// Store a relmapper file (pg_filenode.map) in the repository
    pub async fn put_relmap_file(
        &mut self,
@@ -1728,6 +1790,8 @@ impl DatadirModification<'_> {
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
let mut dbdir = DbDirectory::des(&buf)?;
|
||||
@@ -1748,7 +1812,7 @@ impl DatadirModification<'_> {
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
|
||||
if self.tline.get_rel_size_v2_enabled() {
|
||||
if v2_enabled {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
|
||||
}
|
||||
@@ -1900,12 +1964,12 @@ impl DatadirModification<'_> {
|
||||
.context("deserialize db")?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
|
||||
if self.tline.get_rel_size_v2_enabled() {
|
||||
if v2_enabled {
|
||||
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
@@ -1940,6 +2004,10 @@ impl DatadirModification<'_> {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
|
||||
} else {
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
|
||||
@@ -1953,6 +2021,7 @@ impl DatadirModification<'_> {
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let buf = nblocks.to_le_bytes();
|
||||
@@ -2031,6 +2100,7 @@ impl DatadirModification<'_> {
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2043,7 +2113,7 @@ impl DatadirModification<'_> {
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
|
||||
dirty = true;
|
||||
true
|
||||
} else if self.tline.get_rel_size_v2_enabled() {
|
||||
} else if v2_enabled {
|
||||
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
|
||||
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
|
||||
// logic).
|
||||
@@ -2074,7 +2144,7 @@ impl DatadirModification<'_> {
|
||||
// Remove entry from relation size cache
|
||||
self.tline.remove_cached_rel_size(&rel_tag);
|
||||
|
||||
// Delete size entry, as well as all blocks
|
||||
// Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
|
||||
self.delete(rel_key_range(rel_tag));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,8 +31,8 @@ use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use itertools::Itertools as _;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models;
|
||||
pub use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{self, RelSizeMigration};
|
||||
use pageserver_api::models::{
|
||||
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
WalRedoManagerStatus,
|
||||
@@ -1123,6 +1123,7 @@ impl Tenant {
|
||||
CreateTimelineCause::Load,
|
||||
idempotency.clone(),
|
||||
index_part.gc_compaction.clone(),
|
||||
index_part.rel_size_migration.clone(),
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
anyhow::ensure!(
|
||||
@@ -1149,7 +1150,7 @@ impl Tenant {
|
||||
// a previous heatmap which contains all visible layers in the layer map.
|
||||
// This previous heatmap will be used whenever a fresh heatmap is generated
|
||||
// for the timeline.
|
||||
if matches!(cause, LoadTimelineCause::Unoffload) {
|
||||
if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
|
||||
let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
|
||||
while let Some((tline, end_lsn)) = tline_ending_at {
|
||||
let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
|
||||
@@ -1581,6 +1582,10 @@ impl Tenant {
|
||||
}
|
||||
|
||||
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
|
||||
if !self.conf.load_previous_heatmap {
|
||||
return None;
|
||||
}
|
||||
|
||||
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
|
||||
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
|
||||
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
|
||||
@@ -2446,6 +2451,7 @@ impl Tenant {
|
||||
create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -2501,6 +2507,7 @@ impl Tenant {
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
|
||||
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
|
||||
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
|
||||
end_lsn: Lsn,
|
||||
@@ -2522,6 +2529,11 @@ impl Tenant {
|
||||
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
for in_memory in in_memory_layer_desc {
|
||||
tline
|
||||
.force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
@@ -2771,6 +2783,7 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -4122,6 +4135,7 @@ impl Tenant {
|
||||
cause: CreateTimelineCause,
|
||||
create_idempotency: CreateTimelineIdempotency,
|
||||
gc_compaction_state: Option<GcCompactionState>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let state = match cause {
|
||||
CreateTimelineCause::Load => {
|
||||
@@ -4154,6 +4168,7 @@ impl Tenant {
|
||||
self.attach_wal_lag_cooldown.clone(),
|
||||
create_idempotency,
|
||||
gc_compaction_state,
|
||||
rel_size_v2_status,
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
|
||||
@@ -4856,6 +4871,7 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
Some(src_timeline.get_rel_size_v2_status()),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -5129,6 +5145,7 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -5207,13 +5224,14 @@ impl Tenant {
|
||||
create_guard: TimelineCreateGuard,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
) -> anyhow::Result<UninitializedTimeline<'a>> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let resources = self.build_timeline_resources(new_timeline_id);
|
||||
resources
|
||||
.remote_client
|
||||
.init_upload_queue_for_empty_remote(new_metadata)?;
|
||||
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
|
||||
|
||||
let timeline_struct = self
|
||||
.create_timeline_struct(
|
||||
@@ -5225,6 +5243,7 @@ impl Tenant {
|
||||
CreateTimelineCause::Load,
|
||||
create_guard.idempotency.clone(),
|
||||
None,
|
||||
rel_size_v2_status,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
|
||||
@@ -5913,6 +5932,8 @@ mod tests {
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::GcInfo;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::InMemoryLayerTestDesc;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc};
|
||||
use utils::id::TenantId;
|
||||
@@ -7925,6 +7946,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
Vec::new(), // delta layers
|
||||
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
@@ -8012,6 +8034,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
Vec::new(), // delta layers
|
||||
vec![(
|
||||
Lsn(0x20),
|
||||
@@ -8227,6 +8250,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
// delta layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
@@ -8307,6 +8331,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
// delta layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
@@ -8380,6 +8405,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
// delta layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
@@ -8512,6 +8538,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
@@ -8705,6 +8732,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x40),
|
||||
delta1,
|
||||
@@ -8761,6 +8789,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
Vec::new(),
|
||||
image_layers,
|
||||
end_lsn,
|
||||
@@ -8967,6 +8996,7 @@ mod tests {
|
||||
Lsn(0x08),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x08)..Lsn(0x10),
|
||||
@@ -8985,7 +9015,7 @@ mod tests {
|
||||
delta3,
|
||||
),
|
||||
], // delta layers
|
||||
vec![], // image layers
|
||||
vec![], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?
|
||||
@@ -8996,6 +9026,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x48),
|
||||
@@ -9546,6 +9577,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
|
||||
@@ -9793,6 +9825,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
// delta1 and delta 2 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
||||
@@ -10028,6 +10061,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![], // delta layers
|
||||
vec![(Lsn(0x18), img_layer)], // image layers
|
||||
Lsn(0x18),
|
||||
@@ -10274,6 +10308,7 @@ mod tests {
|
||||
baseline_image_layer_lsn,
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
delta_layer_start_lsn..delta_layer_end_lsn,
|
||||
delta_layer_spec,
|
||||
@@ -10305,6 +10340,158 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
|
||||
let harness =
|
||||
TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let will_init_keys = [2, 6];
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let mut expected_key_values = HashMap::new();
|
||||
|
||||
let baseline_image_layer_lsn = Lsn(0x10);
|
||||
let mut baseline_img_layer = Vec::new();
|
||||
for i in 0..5 {
|
||||
let key = get_key(i);
|
||||
let value = format!("value {i}@{baseline_image_layer_lsn}");
|
||||
|
||||
let removed = expected_key_values.insert(key, value.clone());
|
||||
assert!(removed.is_none());
|
||||
|
||||
baseline_img_layer.push((key, Bytes::from(value)));
|
||||
}
|
||||
|
||||
let nested_image_layer_lsn = Lsn(0x50);
|
||||
let mut nested_img_layer = Vec::new();
|
||||
for i in 5..10 {
|
||||
let key = get_key(i);
|
||||
let value = format!("value {i}@{nested_image_layer_lsn}");
|
||||
|
||||
let removed = expected_key_values.insert(key, value.clone());
|
||||
assert!(removed.is_none());
|
||||
|
||||
nested_img_layer.push((key, Bytes::from(value)));
|
||||
}
|
||||
|
||||
let frozen_layer = {
|
||||
let lsn_range = Lsn(0x40)..Lsn(0x60);
|
||||
let mut data = Vec::new();
|
||||
for i in 0..10 {
|
||||
let key = get_key(i);
|
||||
let key_in_nested = nested_img_layer
|
||||
.iter()
|
||||
.any(|(key_with_img, _)| *key_with_img == key);
|
||||
let lsn = {
|
||||
if key_in_nested {
|
||||
Lsn(nested_image_layer_lsn.0 + 5)
|
||||
} else {
|
||||
lsn_range.start
|
||||
}
|
||||
};
|
||||
|
||||
let will_init = will_init_keys.contains(&i);
|
||||
if will_init {
|
||||
data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
|
||||
|
||||
expected_key_values.insert(key, "".to_string());
|
||||
} else {
|
||||
let delta = format!("@{lsn}");
|
||||
data.push((
|
||||
key,
|
||||
lsn,
|
||||
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
|
||||
));
|
||||
|
||||
expected_key_values
|
||||
.get_mut(&key)
|
||||
.expect("An image exists for each key")
|
||||
.push_str(delta.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
InMemoryLayerTestDesc {
|
||||
lsn_range,
|
||||
is_open: false,
|
||||
data,
|
||||
}
|
||||
};
|
||||
|
||||
let (open_layer, last_record_lsn) = {
|
||||
let start_lsn = Lsn(0x70);
|
||||
let mut data = Vec::new();
|
||||
let mut end_lsn = Lsn(0);
|
||||
for i in 0..10 {
|
||||
let key = get_key(i);
|
||||
let lsn = Lsn(start_lsn.0 + i as u64);
|
||||
let delta = format!("@{lsn}");
|
||||
data.push((
|
||||
key,
|
||||
lsn,
|
||||
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
|
||||
));
|
||||
|
||||
expected_key_values
|
||||
.get_mut(&key)
|
||||
.expect("An image exists for each key")
|
||||
.push_str(delta.as_str());
|
||||
|
||||
end_lsn = std::cmp::max(end_lsn, lsn);
|
||||
}
|
||||
|
||||
(
|
||||
InMemoryLayerTestDesc {
|
||||
lsn_range: start_lsn..Lsn::MAX,
|
||||
is_open: true,
|
||||
data,
|
||||
},
|
||||
end_lsn,
|
||||
)
|
||||
};
|
||||
|
||||
assert!(
|
||||
nested_image_layer_lsn > frozen_layer.lsn_range.start
|
||||
&& nested_image_layer_lsn < frozen_layer.lsn_range.end
|
||||
);
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
baseline_image_layer_lsn,
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![open_layer, frozen_layer], // in-memory layers
|
||||
Vec::new(), // delta layers
|
||||
vec![
|
||||
(baseline_image_layer_lsn, baseline_img_layer),
|
||||
(nested_image_layer_lsn, nested_img_layer),
|
||||
], // image layers
|
||||
last_record_lsn,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let results = tline
|
||||
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
let value = res.expect("No key errors");
|
||||
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
|
||||
assert_eq!(value, Bytes::from(expected_value.clone()));
|
||||
|
||||
tracing::info!("key={key} value={expected_value}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
|
||||
(
|
||||
k1.is_delta,
|
||||
@@ -10420,6 +10607,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
@@ -10804,6 +10992,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![
|
||||
// delta1/2/4 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
|
||||
@@ -11055,6 +11244,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![
|
||||
// delta1/2/4 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
|
||||
|
||||
File diff suppressed because it is too large
@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
    /// The latest state
    head: LayerCoverageTuple<Value>,

    /// TODO: this could be an ordered vec using binary search.
    /// We push into this map every time we add a layer, so might see some benefit
    /// All previous states
    historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
}
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
    buffer: BTreeMap<LayerKey, Option<Value>>,

    /// All current layers. This is not used for search. Only to make rebuilds easier.
    // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
    // [`Self::historic_coverage`] instead of doubling memory usage.
    // [`Self::len`]: can require rebuild and serve from latest historic
    // [`Self::iter`]: already requires rebuild => can serve from latest historic
    layers: BTreeMap<LayerKey, Value>,
}
@@ -194,7 +194,7 @@ pub(crate) use download::{
|
||||
};
|
||||
use index::GcCompactionState;
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use regex::Regex;
|
||||
use remote_storage::{
|
||||
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {
|
||||
|
||||
/// Initialize the upload queue for the case where the remote storage is empty,
|
||||
/// i.e., it doesn't have an `IndexPart`.
|
||||
///
|
||||
/// `rel_size_v2_status` needs to be carried over during branching, and that's why
|
||||
/// it's passed in here.
|
||||
pub fn init_upload_queue_for_empty_remote(
|
||||
&self,
|
||||
local_metadata: &TimelineMetadata,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
|
||||
.as_ref()
|
||||
.map_or(0, |r| r.concurrency_limit());
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
let initialized_queue =
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
|
||||
/// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
|
||||
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
|
||||
self: &Arc<Self>,
|
||||
gc_compaction_state: GcCompactionState,
|
||||
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
|
||||
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
|
||||
self: &Arc<Self>,
|
||||
rel_size_v2_status: RelSizeMigration,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
|
||||
// TODO: allow this operation to bypass the validation check because we might upload the index part
|
||||
// with no layers but the flag updated. For now, we just modify the index part in memory and the next
|
||||
// upload will include the flag.
|
||||
// self.schedule_index_upload(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::collections::HashMap;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TimelineId;
|
||||
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
|
||||
pub(crate) last_completed_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum RelSizeMigration {
|
||||
/// The tenant is using the old rel_size format.
|
||||
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
|
||||
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
|
||||
Legacy,
|
||||
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
|
||||
/// persisted in the index part. The read path will read both formats and merge them.
|
||||
Migrating,
|
||||
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
|
||||
/// in the index part, and the read path will not read the old format.
|
||||
Migrated,
|
||||
}
|
||||
|
||||
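The `RelSizeMigration` enum above is serialized with `rename_all = "camelCase"` and stored as an `Option` in the index, with a missing value treated the same as `Legacy`. A hedged round-trip sketch of that convention (assuming `serde`/`serde_json`; this mirrors the enum shape but is not the pageserver's actual persisted index format):

```rust
// Sketch of camelCase unit-variant serialization and the None == Legacy convention.
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
enum RelSizeMigration {
    Legacy,
    Migrating,
    Migrated,
}

fn effective_status(persisted: Option<RelSizeMigration>) -> RelSizeMigration {
    persisted.unwrap_or(RelSizeMigration::Legacy)
}

fn main() -> serde_json::Result<()> {
    // Unit variants serialize to their camelCase names.
    assert_eq!(serde_json::to_string(&RelSizeMigration::Migrating)?, "\"migrating\"");
    let parsed: RelSizeMigration = serde_json::from_str("\"migrated\"")?;
    assert_eq!(parsed, RelSizeMigration::Migrated);
    // An absent field is interpreted as the legacy format.
    assert_eq!(effective_status(None), RelSizeMigration::Legacy);
    Ok(())
}
```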
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
|
||||
@@ -869,8 +869,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.layers
|
||||
.iter()
|
||||
.hot_layers()
|
||||
.map(|l| (&l.name, l.metadata.generation))
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
@@ -1015,7 +1014,8 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
for layer in timeline.layers {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
for layer in timeline.into_hot_layers() {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return (Err(UpdateError::Cancelled), touched);
|
||||
@@ -1040,7 +1040,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.download_layer(tenant_shard_id, &timeline_id, layer, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
@@ -1148,7 +1148,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
|
||||
|
||||
let (result, touched) = self
|
||||
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
|
||||
@@ -1316,11 +1316,11 @@ async fn init_timeline_state(
|
||||
// As we iterate through layers found on disk, we will look up their metadata from this map.
|
||||
// Layers not present in metadata will be discarded.
|
||||
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
|
||||
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
|
||||
heatmap.hot_layers().map(|l| (&l.name, l)).collect();
|
||||
|
||||
let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
|
||||
if let Some(last_heatmap) = last_heatmap {
|
||||
last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
|
||||
last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
|
||||
} else {
|
||||
HashMap::new()
|
||||
};
|
||||
|
||||
@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(crate) access_time: SystemTime,
|
||||
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
}
|
||||
|
||||
impl HeatMapLayer {
|
||||
@@ -62,11 +64,13 @@ impl HeatMapLayer {
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
cold: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
metadata,
|
||||
access_time,
|
||||
cold,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
            layers,
        }
    }

    pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
        self.layers.into_iter().filter(|l| !l.cold)
    }

    pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
        self.layers.iter().filter(|l| !l.cold)
    }

    pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
        self.layers.iter()
    }
}
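With the layer list made private, callers only see filtered views, and the stats hunk that follows counts hot layers only. A hedged sketch of the same accessor pattern on a simplified struct (field names mirror the heatmap code, types are stand-ins):

```rust
// Simplified sketch of the hot/cold accessor pattern: the layer list stays
// private and callers get filtered iterators.
struct Layer {
    name: String,
    file_size: u64,
    cold: bool,
}

struct HeatmapTimeline {
    layers: Vec<Layer>,
}

impl HeatmapTimeline {
    fn hot_layers(&self) -> impl Iterator<Item = &Layer> {
        self.layers.iter().filter(|l| !l.cold)
    }

    fn hot_bytes(&self) -> u64 {
        // Same shape as HeatMapStats: only hot layers contribute.
        self.hot_layers().map(|l| l.file_size).sum()
    }
}

fn main() {
    let tl = HeatmapTimeline {
        layers: vec![
            Layer { name: "fresh-image".into(), file_size: 100, cold: false },
            Layer { name: "archived".into(), file_size: 900, cold: true },
        ],
    };
    assert_eq!(tl.hot_layers().next().unwrap().name, "fresh-image");
    assert_eq!(tl.hot_bytes(), 100);
}
```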
pub(crate) struct HeatMapStats {
|
||||
@@ -92,7 +108,7 @@ impl HeatMapTenant {
|
||||
layers: 0,
|
||||
};
|
||||
for timeline in &self.timelines {
|
||||
for layer in &timeline.layers {
|
||||
for layer in timeline.hot_layers() {
|
||||
stats.layers += 1;
|
||||
stats.bytes += layer.metadata.file_size;
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
use super::PageReconstructError;
|
||||
use super::layer_map::InMemoryLayerDesc;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
@@ -721,6 +722,12 @@ struct LayerToVisitId {
|
||||
lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash)]
|
||||
pub enum ReadableLayerWeak {
|
||||
PersistentLayer(Arc<PersistentLayerDesc>),
|
||||
InMemoryLayer(InMemoryLayerDesc),
|
||||
}
|
||||
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
@@ -873,7 +880,7 @@ impl ReadableLayer {
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -416,7 +416,7 @@ impl InMemoryLayer {
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
self: &Arc<InMemoryLayer>,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
@@ -433,8 +433,6 @@ impl InMemoryLayer {
|
||||
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
|
||||
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
|
||||
|
||||
let lsn_range = self.start_lsn..end_lsn;
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner
|
||||
.index
|
||||
|
||||
@@ -1563,10 +1563,10 @@ impl LayerInner {
|
||||
|
||||
self.access_stats.record_residence_event();
|
||||
|
||||
self.status.as_ref().unwrap().send_replace(Status::Evicted);
|
||||
|
||||
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
|
||||
|
||||
self.status.as_ref().unwrap().send_replace(Status::Evicted);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ async fn smoke_test() {
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Default::default(), // in-memory layers
|
||||
Default::default(),
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
|
||||
@@ -473,21 +473,15 @@ async fn wait_for_active_tenant(
    }

    let mut update_rx = tenant.subscribe_for_state_updates();
    loop {
        tokio::select! {
            _ = cancel.cancelled() => return ControlFlow::Break(()),
            result = update_rx.changed() => if result.is_err() {
    tokio::select! {
        result = update_rx.wait_for(|s| s == &TenantState::Active) => {
            if result.is_err() {
                return ControlFlow::Break(());
            }

        match &*update_rx.borrow() {
            TenantState::Active => {
                debug!("Tenant state changed to active, continuing the task loop");
                return ControlFlow::Continue(());
            }
            state => debug!("Not running the task loop, tenant is not active: {state:?}"),
        }
            debug!("Tenant state changed to active, continuing the task loop");
            ControlFlow::Continue(())
        },
        _ = cancel.cancelled() => ControlFlow::Break(()),
    }
}
|
||||
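This hunk swaps a manual `changed()` plus `borrow()` loop for `tokio::sync::watch::Receiver::wait_for`, which resolves once the observed value satisfies a predicate. A hedged, self-contained sketch of that idiom (assumes the `tokio` and `tokio-util` crates; the state enum is a stand-in):

```rust
// Minimal sketch: wait for a watch channel to reach a target state with
// Receiver::wait_for, raced against a cancellation token.
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;

#[derive(Debug, Clone, PartialEq)]
enum TenantState {
    Attaching,
    Active,
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(TenantState::Attaching);
    let cancel = CancellationToken::new();

    // Simulate the tenant becoming active a moment later.
    tokio::spawn(async move {
        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        let _ = tx.send(TenantState::Active);
    });

    tokio::select! {
        // Resolves as soon as the predicate holds, or errors if the sender is dropped.
        result = rx.wait_for(|s| s == &TenantState::Active) => {
            match result {
                Ok(_) => println!("tenant is active, continuing"),
                Err(_) => println!("state channel closed, breaking"),
            }
        }
        _ = cancel.cancelled() => println!("cancelled"),
    }
}
```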
@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
|
||||
use pageserver_api::models::{
|
||||
CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
|
||||
};
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
|
||||
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::l0_flush::{self, L0FlushGlobalState};
|
||||
use crate::metrics::{
|
||||
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
|
||||
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
|
||||
LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
|
||||
};
|
||||
use crate::page_service::TenantManagerTypes;
|
||||
use crate::pgdatadir_mapping::{
|
||||
@@ -436,6 +437,8 @@ pub struct Timeline {
|
||||
/// May host a background Tokio task which downloads all the layers from the current
|
||||
/// heatmap on demand.
|
||||
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
|
||||
|
||||
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
@@ -1328,10 +1331,6 @@ impl Timeline {
|
||||
// (this is a requirement, not a bug). Skip updating the metric in these cases
|
||||
// to avoid infinite results.
|
||||
if !results.is_empty() {
|
||||
// Record the total number of layers visited towards each key in the batch. While some
|
||||
// layers may not intersect with a given read, and the cost of layer visits are
|
||||
// amortized across the batch, each visited layer contributes directly to the observed
|
||||
// latency for every read in the batch, which is what we care about.
|
||||
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
|
||||
static LOG_PACER: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
|
||||
@@ -1346,9 +1345,23 @@ impl Timeline {
|
||||
});
|
||||
}
|
||||
|
||||
// Records the number of layers visited in a few different ways:
|
||||
//
|
||||
// * LAYERS_PER_READ: all layers count towards every read in the batch, because each
|
||||
// layer directly affects its observed latency.
|
||||
//
|
||||
// * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
|
||||
// layer visits and access cost.
|
||||
//
|
||||
// * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
|
||||
// read amplification after batching.
|
||||
let layers_visited = layers_visited as f64;
|
||||
let avg_layers_visited = layers_visited / results.len() as f64;
|
||||
LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
|
||||
for _ in &results {
|
||||
self.metrics.layers_per_read.observe(layers_visited as f64);
|
||||
LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
|
||||
self.metrics.layers_per_read.observe(layers_visited);
|
||||
LAYERS_PER_READ_GLOBAL.observe(layers_visited);
|
||||
LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
|
||||
}
|
||||
}
|
||||
|
||||
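The observation block above records the same `layers_visited` figure three ways; a short worked example of the arithmetic makes the distinction concrete (the helper and numbers below are purely illustrative):

```rust
// Worked example of the three read-amplification observations for one batch:
// per-read (every read charged the full layer count), per-batch (one sample per
// batch), and amortized (layers divided by reads).
fn observations(layers_visited: usize, reads_in_batch: usize) -> (Vec<f64>, f64, f64) {
    let layers = layers_visited as f64;
    let per_read = vec![layers; reads_in_batch]; // one sample per read, full cost each
    let per_batch = layers;                      // one sample per batch
    let amortized = layers / reads_in_batch as f64;
    (per_read, per_batch, amortized)
}

fn main() {
    // A batch of 4 reads that visited 8 layers in total:
    let (per_read, per_batch, amortized) = observations(8, 4);
    assert_eq!(per_read, vec![8.0, 8.0, 8.0, 8.0]);
    assert_eq!(per_batch, 8.0);
    assert_eq!(amortized, 2.0);
}
```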
@@ -2368,6 +2381,9 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
/// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
|
||||
/// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
|
||||
/// possible that the index part persists the state while the config doesn't get persisted.
|
||||
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2376,6 +2392,14 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
|
||||
}
|
||||
|
||||
pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
|
||||
self.rel_size_v2_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.map(|s| s.as_ref().clone())
|
||||
.unwrap_or(RelSizeMigration::Legacy)
|
||||
}
|
||||
|
||||
fn get_compaction_upper_limit(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2636,6 +2660,7 @@ impl Timeline {
|
||||
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||
create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
gc_compaction_state: Option<GcCompactionState>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -2794,6 +2819,8 @@ impl Timeline {
|
||||
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
|
||||
|
||||
heatmap_layers_downloader: Mutex::new(None),
|
||||
|
||||
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -2870,6 +2897,16 @@ impl Timeline {
|
||||
.schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
|
||||
}
|
||||
|
||||
pub(crate) fn update_rel_size_v2_status(
|
||||
&self,
|
||||
rel_size_v2_status: RelSizeMigration,
|
||||
) -> anyhow::Result<()> {
|
||||
self.rel_size_v2_status
|
||||
.store(Some(Arc::new(rel_size_v2_status.clone())));
|
||||
self.remote_client
|
||||
.schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
|
||||
}
|
||||
|
||||
pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
|
||||
self.gc_compaction_state.load_full().as_ref().clone()
|
||||
}
|
||||
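As shown above, `rel_size_v2_status` is held in an `ArcSwapOption`: readers take a cheap snapshot and fall back to `Legacy`, writers swap in a new `Arc`. A hedged sketch of that load-with-default / store pattern using the `arc_swap` crate (the surrounding types are stand-ins):

```rust
// Sketch of the ArcSwapOption read/update pattern used for rel_size_v2_status.
use std::sync::Arc;
use arc_swap::ArcSwapOption;

#[derive(Debug, Clone, PartialEq)]
enum RelSizeMigration {
    Legacy,
    Migrating,
}

struct Timeline {
    rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
}

impl Timeline {
    fn get_status(&self) -> RelSizeMigration {
        self.rel_size_v2_status
            .load()
            .as_ref()
            .map(|s| s.as_ref().clone())
            .unwrap_or(RelSizeMigration::Legacy)
    }

    fn set_status(&self, status: RelSizeMigration) {
        self.rel_size_v2_status.store(Some(Arc::new(status)));
    }
}

fn main() {
    let tl = Timeline { rel_size_v2_status: ArcSwapOption::new(None) };
    assert_eq!(tl.get_status(), RelSizeMigration::Legacy); // unset reads as Legacy
    tl.set_status(RelSizeMigration::Migrating);
    assert_eq!(tl.get_status(), RelSizeMigration::Migrating);
}
```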
@@ -3611,7 +3648,7 @@ impl Timeline {
|
||||
let visible_non_resident = match previous_heatmap.as_deref() {
|
||||
Some(PreviousHeatmap::Active {
|
||||
heatmap, read_at, ..
|
||||
}) => Some(heatmap.layers.iter().filter_map(|hl| {
|
||||
}) => Some(heatmap.all_layers().filter_map(|hl| {
|
||||
let desc: PersistentLayerDesc = hl.name.clone().into();
|
||||
let layer = guard.try_get_from_key(&desc.key())?;
|
||||
|
||||
@@ -3627,7 +3664,7 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((desc, hl.metadata.clone(), hl.access_time))
|
||||
Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
|
||||
})),
|
||||
Some(PreviousHeatmap::Obsolete) => None,
|
||||
None => None,
|
||||
@@ -3643,6 +3680,7 @@ impl Timeline {
|
||||
layer.layer_desc().clone(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
false, // these layers are not cold
|
||||
))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
@@ -3669,12 +3707,14 @@ impl Timeline {
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be in the resident many minutes
|
||||
// or hours later:
|
||||
// - Cold layers go last for convenience when a human inspects the heatmap.
|
||||
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
|
||||
// only exist for a few minutes before being compacted into L1s.
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
layers.sort_by_key(|(desc, _meta, _atime, cold)| {
|
||||
std::cmp::Reverse((
|
||||
*cold,
|
||||
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
|
||||
desc.lsn_range.end,
|
||||
))
|
||||
@@ -3682,7 +3722,9 @@ impl Timeline {
|
||||
|
||||
let layers = layers
|
||||
.into_iter()
|
||||
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
|
||||
.map(|(desc, meta, atime, cold)| {
|
||||
HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
|
||||
})
|
||||
.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
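The hunk above extends the heatmap download ordering's sort key with the new `cold` flag. As a generic illustration of how `sort_by_key` with `std::cmp::Reverse` over a tuple produces a multi-criteria priority (made-up data; not a claim about the pageserver's exact ordering):

```rust
// Tuple keys compare field by field, and std::cmp::Reverse flips the direction:
// ascending sort by Reverse(key) is descending sort by key.
use std::cmp::Reverse;

fn main() {
    // (flag_a, flag_b, end_lsn) stand-ins for a few layers.
    let mut layers = vec![
        ("l0-new", (false, true, 300u64)),
        ("image-old", (false, false, 100u64)),
        ("image-new", (false, false, 200u64)),
    ];

    // Within equal leading fields, the next field breaks the tie.
    layers.sort_by_key(|(_, key)| Reverse(*key));

    let order: Vec<&str> = layers.iter().map(|(name, _)| *name).collect();
    assert_eq!(order, vec!["l0-new", "image-new", "image-old"]);
}
```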
@@ -3702,6 +3744,7 @@ impl Timeline {
|
||||
name: vl.layer_desc().layer_name(),
|
||||
metadata: vl.metadata(),
|
||||
access_time: now,
|
||||
cold: true,
|
||||
};
|
||||
heatmap_layers.push(hl);
|
||||
}
|
||||
@@ -3914,39 +3957,22 @@ impl Timeline {
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
||||
let start_lsn = l.get_lsn_range().start;
|
||||
cont_lsn > start_lsn
|
||||
});
|
||||
for range in unmapped_keyspace.ranges.iter() {
|
||||
let results = layers.range_search(range.clone(), cont_lsn);
|
||||
|
||||
match in_memory_layer {
|
||||
Some(l) => {
|
||||
let lsn_range = l.get_lsn_range().start..cont_lsn;
|
||||
fringe.update(
|
||||
ReadableLayer::InMemoryLayer(l),
|
||||
unmapped_keyspace.clone(),
|
||||
lsn_range,
|
||||
);
|
||||
}
|
||||
None => {
|
||||
for range in unmapped_keyspace.ranges.iter() {
|
||||
let results = layers.range_search(range.clone(), cont_lsn);
|
||||
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
}
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
guard.upgrade(layer),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
@@ -5555,6 +5581,14 @@ pub struct DeltaLayerTestDesc {
|
||||
pub data: Vec<(Key, Lsn, Value)>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Clone)]
|
||||
pub struct InMemoryLayerTestDesc {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub data: Vec<(Key, Lsn, Value)>,
|
||||
pub is_open: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl DeltaLayerTestDesc {
|
||||
pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
|
||||
@@ -6567,6 +6601,92 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Force create an in-memory layer and place them into the layer map.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn force_create_in_memory_layer(
|
||||
self: &Arc<Timeline>,
|
||||
mut in_memory: InMemoryLayerTestDesc,
|
||||
check_start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
// Validate LSNs
|
||||
if let Some(check_start_lsn) = check_start_lsn {
|
||||
assert!(in_memory.lsn_range.start >= check_start_lsn);
|
||||
}
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let layer_end_lsn = if in_memory.is_open {
|
||||
in_memory
|
||||
.data
|
||||
.iter()
|
||||
.map(|(_key, lsn, _value)| lsn)
|
||||
.max()
|
||||
.cloned()
|
||||
} else {
|
||||
Some(in_memory.lsn_range.end)
|
||||
};
|
||||
|
||||
if let Some(end) = layer_end_lsn {
|
||||
assert!(
|
||||
end <= last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
|
||||
end,
|
||||
last_record_lsn,
|
||||
);
|
||||
}
|
||||
|
||||
in_memory.data.iter().for_each(|(_key, lsn, _value)| {
|
||||
assert!(*lsn >= in_memory.lsn_range.start);
|
||||
assert!(*lsn < in_memory.lsn_range.end);
|
||||
});
|
||||
|
||||
// Build the batch
|
||||
in_memory
|
||||
.data
|
||||
.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
|
||||
|
||||
let data = in_memory
|
||||
.data
|
||||
.into_iter()
|
||||
.map(|(key, lsn, value)| {
|
||||
let value_size = value.serialized_size().unwrap() as usize;
|
||||
(key.to_compact(), lsn, value_size, value)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let batch = SerializedValueBatch::from_values(data);
|
||||
|
||||
// Create the in-memory layer and write the batch into it
|
||||
let layer = InMemoryLayer::create(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
in_memory.lsn_range.start,
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
layer.put_batch(batch, ctx).await.unwrap();
|
||||
if !in_memory.is_open {
|
||||
layer.freeze(in_memory.lsn_range.end).await;
|
||||
}
|
||||
|
||||
info!("force created in-memory layer {:?}", in_memory.lsn_range);
|
||||
|
||||
// Link the layer to the layer map
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let layer_map = guard.open_mut().unwrap();
|
||||
layer_map.force_insert_in_memory_layer(Arc::new(layer));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return all keys at the LSN in the image layers
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_image_layers(
|
||||
@@ -6926,6 +7046,7 @@ mod tests {
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use std::iter::Iterator;
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -6939,8 +7060,8 @@ mod tests {
|
||||
use crate::tenant::{PreviousHeatmap, Timeline};
|
||||
|
||||
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
|
||||
assert_eq!(lhs.layers.len(), rhs.layers.len());
|
||||
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
|
||||
assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
|
||||
let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
|
||||
for (l, r) in lhs_rhs {
|
||||
assert_eq!(l.name, r.name);
|
||||
assert_eq!(l.metadata, r.metadata);
|
||||
@@ -6999,6 +7120,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
@@ -7017,10 +7139,11 @@ mod tests {
|
||||
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
|
||||
|
||||
// L0 should come last
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
|
||||
assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
let mut last_lsn = Lsn::MAX;
|
||||
for layer in &heatmap.layers {
|
||||
for layer in heatmap_layers {
|
||||
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
@@ -7132,6 +7255,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
@@ -7148,7 +7272,7 @@ mod tests {
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
// Both layers should be in the heatmap
|
||||
assert!(!heatmap.layers.is_empty());
|
||||
assert!(heatmap.all_layers().count() > 0);
|
||||
|
||||
// Now simulate a migration.
|
||||
timeline
|
||||
@@ -7174,7 +7298,7 @@ mod tests {
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert!(post_eviction_heatmap.layers.is_empty());
|
||||
assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
|
||||
assert!(matches!(
|
||||
timeline.previous_heatmap.load().as_deref(),
|
||||
Some(PreviousHeatmap::Obsolete)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
@@ -15,10 +16,11 @@ use super::{
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{Context, anyhow, bail};
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
|
||||
@@ -234,6 +236,12 @@ impl GcCompactionQueue {
|
||||
// it enough in staging yet.
|
||||
return Ok(());
|
||||
}
|
||||
if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
|
||||
// If the gc watermark is not set, we don't need to trigger auto compaction.
|
||||
// This check is the same as in `gc_compaction_split_jobs` but we don't log
|
||||
// here and we can also skip the computation of the trigger condition earlier.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
|
||||
// Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
|
||||
@@ -321,7 +329,7 @@ impl GcCompactionQueue {
|
||||
l1_size, l2_size, l2_lsn, gc_cutoff
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
debug!(
|
||||
"did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
|
||||
l1_size, l2_size, l2_lsn, gc_cutoff
|
||||
);
|
||||
@@ -357,8 +365,7 @@ impl GcCompactionQueue {
|
||||
GcCompactJob::from_compact_options(options.clone()),
|
||||
options.sub_compaction_max_job_size_mb,
|
||||
)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
.await?;
|
||||
if jobs.is_empty() {
|
||||
info!("no jobs to run, skipping scheduled compaction task");
|
||||
self.notify_and_unblock(id);
|
||||
@@ -437,6 +444,7 @@ impl GcCompactionQueue {
|
||||
));
|
||||
};
|
||||
let has_pending_tasks;
|
||||
let mut yield_for_l0 = false;
|
||||
let Some((id, item)) = ({
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
if let Some((id, item)) = guard.queued.pop_front() {
|
||||
@@ -486,13 +494,23 @@ impl GcCompactionQueue {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
|
||||
}
|
||||
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
let compaction_result =
|
||||
timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
self.notify_and_unblock(id);
|
||||
if compaction_result == CompactionOutcome::YieldForL0 {
|
||||
yield_for_l0 = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
GcCompactionQueueItem::SubCompactionJob(options) => {
|
||||
// TODO: error handling, clear the queue if any task fails?
|
||||
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
if compaction_result == CompactionOutcome::YieldForL0 {
|
||||
// We will permanently give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
|
||||
// again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
|
||||
// we need to clean things up before returning from the function.
|
||||
yield_for_l0 = true;
|
||||
}
|
||||
}
|
||||
GcCompactionQueueItem::Notify(id, l2_lsn) => {
|
||||
self.notify_and_unblock(id);
|
||||
@@ -521,7 +539,10 @@ impl GcCompactionQueue {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.running = None;
|
||||
}
|
||||
Ok(if has_pending_tasks {
|
||||
Ok(if yield_for_l0 {
|
||||
tracing::info!("give up gc-compaction: yield for L0 compaction");
|
||||
CompactionOutcome::YieldForL0
|
||||
} else if has_pending_tasks {
|
||||
CompactionOutcome::Pending
|
||||
} else {
|
||||
CompactionOutcome::Done
|
||||
@@ -719,17 +740,41 @@ struct CompactionStatisticsNumSize {
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
pub struct CompactionStatistics {
|
||||
/// Delta layer visited (maybe compressed, physical size)
|
||||
delta_layer_visited: CompactionStatisticsNumSize,
|
||||
/// Image layer visited (maybe compressed, physical size)
|
||||
image_layer_visited: CompactionStatisticsNumSize,
|
||||
/// Delta layer produced (maybe compressed, physical size)
|
||||
delta_layer_produced: CompactionStatisticsNumSize,
|
||||
/// Image layer produced (maybe compressed, physical size)
|
||||
image_layer_produced: CompactionStatisticsNumSize,
|
||||
num_delta_layer_discarded: usize,
|
||||
num_image_layer_discarded: usize,
|
||||
/// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
|
||||
delta_layer_discarded: CompactionStatisticsNumSize,
|
||||
/// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
|
||||
image_layer_discarded: CompactionStatisticsNumSize,
|
||||
num_unique_keys_visited: usize,
|
||||
/// Delta visited (uncompressed, original size)
|
||||
wal_keys_visited: CompactionStatisticsNumSize,
|
||||
/// Image visited (uncompressed, original size)
|
||||
image_keys_visited: CompactionStatisticsNumSize,
|
||||
/// Delta produced (uncompressed, original size)
|
||||
wal_produced: CompactionStatisticsNumSize,
|
||||
/// Image produced (uncompressed, original size)
|
||||
image_produced: CompactionStatisticsNumSize,
|
||||
|
||||
// Time spent in each phase
|
||||
time_acquire_lock_secs: f64,
|
||||
time_analyze_secs: f64,
|
||||
time_download_layer_secs: f64,
|
||||
time_main_loop_secs: f64,
|
||||
time_final_phase_secs: f64,
|
||||
time_total_secs: f64,
|
||||
|
||||
// Summary
|
||||
/// Ratio of the key-value size before/after gc-compaction.
|
||||
uncompressed_size_ratio: f64,
|
||||
/// Ratio of the physical size before/after gc-compaction.
|
||||
physical_size_ratio: f64,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
@@ -779,11 +824,13 @@ impl CompactionStatistics {
|
||||
self.image_produced.num += 1;
|
||||
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn discard_delta_layer(&mut self) {
|
||||
self.num_delta_layer_discarded += 1;
|
||||
fn discard_delta_layer(&mut self, original_size: u64) {
|
||||
self.delta_layer_discarded.num += 1;
|
||||
self.delta_layer_discarded.size += original_size;
|
||||
}
|
||||
fn discard_image_layer(&mut self) {
|
||||
self.num_image_layer_discarded += 1;
|
||||
fn discard_image_layer(&mut self, original_size: u64) {
|
||||
self.image_layer_discarded.num += 1;
|
||||
self.image_layer_discarded.size += original_size;
|
||||
}
|
||||
fn produce_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_produced.num += 1;
|
||||
@@ -793,6 +840,19 @@ impl CompactionStatistics {
|
||||
self.image_layer_produced.num += 1;
|
||||
self.image_layer_produced.size += size;
|
||||
}
|
||||
fn finalize(&mut self) {
|
||||
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
|
||||
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
|
||||
self.uncompressed_size_ratio =
|
||||
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
|
||||
let produced_physical_size = self.image_layer_produced.size
|
||||
+ self.delta_layer_produced.size
|
||||
+ self.image_layer_discarded.size
|
||||
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
|
||||
self.physical_size_ratio =
|
||||
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
|
||||
}
|
||||
}
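For intuition, `finalize` boils the statistics down to two before/after quotients, with a `+ 1.0` in the denominator guarding against an empty output. Below is a standalone sketch of the same arithmetic using made-up byte counts; the variable names only mirror the fields above and are not the pageserver code itself.

```rust
fn main() {
    // Made-up byte counts standing in for the statistics collected during a run.
    let (wal_keys_visited, image_keys_visited) = (800u64 * 1024 * 1024, 200u64 * 1024 * 1024);
    let (wal_produced, image_produced) = (150u64 * 1024 * 1024, 100u64 * 1024 * 1024);

    // Same shape as `CompactionStatistics::finalize`: original / (produced + 1),
    // where the +1 avoids dividing by zero when nothing was produced.
    let uncompressed_size_ratio = (wal_keys_visited + image_keys_visited) as f64
        / ((wal_produced + image_produced) as f64 + 1.0);

    // ~4.0 here means gc-compaction shrank the key-value payload about 4x.
    println!("uncompressed_size_ratio = {uncompressed_size_ratio:.2}");
}
```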
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -825,9 +885,7 @@ impl Timeline {
|
||||
.flags
|
||||
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
|
||||
{
|
||||
self.compact_with_gc(cancel, options, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
self.compact_with_gc(cancel, options, ctx).await?;
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
|
||||
@@ -2345,12 +2403,19 @@ impl Timeline {
|
||||
async fn check_compaction_space(
|
||||
self: &Arc<Self>,
|
||||
layer_selection: &[Layer],
|
||||
) -> anyhow::Result<()> {
|
||||
let available_space = self.check_available_space().await?;
|
||||
) -> Result<(), CompactionError> {
|
||||
let available_space = self
|
||||
.check_available_space()
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let mut remote_layer_size = 0;
|
||||
let mut all_layer_size = 0;
|
||||
for layer in layer_selection {
|
||||
let needs_download = layer.needs_download().await?;
|
||||
let needs_download = layer
|
||||
.needs_download()
|
||||
.await
|
||||
.context("failed to check if layer needs download")
|
||||
.map_err(CompactionError::Other)?;
|
||||
if needs_download.is_some() {
|
||||
remote_layer_size += layer.layer_desc().file_size;
|
||||
}
|
||||
@@ -2359,14 +2424,14 @@ impl Timeline {
|
||||
let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
|
||||
if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
|
||||
{
|
||||
return Err(anyhow!(
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
|
||||
available_space,
|
||||
allocated_space,
|
||||
all_layer_size,
|
||||
remote_layer_size,
|
||||
all_layer_size + remote_layer_size
|
||||
));
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
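The recurring change in this file is the switch from `anyhow::Result` to `Result<_, CompactionError>`, with helper failures folded in via `.context(..).map_err(CompactionError::Other)` so that cancellation stays distinguishable from real errors. The following is a minimal sketch of that idiom; the enum is a simplified stand-in for the real `CompactionError`, and the helper names are hypothetical.

```rust
use anyhow::{Context, anyhow};

// Simplified stand-in for the pageserver's CompactionError; the real enum has
// more variants and different derives.
#[derive(Debug)]
enum CompactionError {
    ShuttingDown,
    Other(anyhow::Error),
}

// A fallible helper that keeps returning anyhow::Result, as before the refactor.
fn check_available_space() -> anyhow::Result<u64> {
    Ok(42)
}

// The refactored caller: structured error type, anyhow context preserved.
fn check_compaction_space(required: u64, cancelled: bool) -> Result<(), CompactionError> {
    if cancelled {
        return Err(CompactionError::ShuttingDown);
    }
    let available = check_available_space()
        .context("failed to check available space")
        .map_err(CompactionError::Other)?;
    if required > available {
        return Err(CompactionError::Other(anyhow!(
            "not enough space for compaction: required={required}, available={available}"
        )));
    }
    Ok(())
}

fn main() {
    assert!(check_compaction_space(10, false).is_ok());
    assert!(matches!(
        check_compaction_space(100, false),
        Err(CompactionError::Other(_))
    ));
    assert!(matches!(
        check_compaction_space(10, true),
        Err(CompactionError::ShuttingDown)
    ));
}
```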
|
||||
@@ -2397,7 +2462,7 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
job: GcCompactJob,
|
||||
sub_compaction_max_job_size_mb: Option<u64>,
|
||||
) -> anyhow::Result<Vec<GcCompactJob>> {
|
||||
) -> Result<Vec<GcCompactJob>, CompactionError> {
|
||||
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
|
||||
job.compact_lsn_range.end
|
||||
} else {
|
||||
@@ -2548,7 +2613,7 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let sub_compaction = options.sub_compaction;
|
||||
let job = GcCompactJob::from_compact_options(options.clone());
|
||||
if sub_compaction {
|
||||
@@ -2570,7 +2635,7 @@ impl Timeline {
|
||||
if jobs_len == 0 {
|
||||
info!("no jobs to run, skipping gc bottom-most compaction");
|
||||
}
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
self.compact_with_gc_inner(cancel, job, ctx).await
|
||||
}
|
||||
@@ -2580,19 +2645,24 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
job: GcCompactJob,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
|
||||
let timer = Instant::now();
|
||||
let begin_timer = timer;
|
||||
|
||||
let gc_lock = async {
|
||||
tokio::select! {
|
||||
guard = self.gc_lock.lock() => Ok(guard),
|
||||
// TODO: refactor to CompactionError to correctly pass cancelled error
|
||||
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
|
||||
_ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
|
||||
}
|
||||
};
|
||||
|
||||
let time_acquire_lock = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
@@ -2644,7 +2714,7 @@ impl Timeline {
|
||||
tracing::warn!(
|
||||
"no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Skipped);
|
||||
}
|
||||
real_gc_cutoff
|
||||
} else {
|
||||
@@ -2682,7 +2752,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
|
||||
gc_cutoff
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
};
|
||||
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
|
||||
// the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
|
||||
@@ -2703,7 +2773,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
|
||||
compact_lsn_range.end
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
@@ -2729,7 +2799,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
|
||||
gc_cutoff, compact_key_range.start, compact_key_range.end
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
GcCompactionJobDescription {
|
||||
@@ -2782,6 +2852,9 @@ impl Timeline {
|
||||
has_data_below,
|
||||
);
|
||||
|
||||
let time_analyze = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
for layer in &job_desc.selected_layers {
|
||||
debug!("read layer: {}", layer.layer_desc().key());
|
||||
}
|
||||
@@ -2810,10 +2883,10 @@ impl Timeline {
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
bail!(
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
|
||||
err
|
||||
);
|
||||
)));
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = job_desc
|
||||
@@ -2828,11 +2901,33 @@ impl Timeline {
|
||||
let mut total_downloaded_size = 0;
|
||||
let mut total_layer_size = 0;
|
||||
for layer in &job_desc.selected_layers {
|
||||
if layer.needs_download().await?.is_some() {
|
||||
if layer
|
||||
.needs_download()
|
||||
.await
|
||||
.context("failed to check if layer needs download")
|
||||
.map_err(CompactionError::Other)?
|
||||
.is_some()
|
||||
{
|
||||
total_downloaded_size += layer.layer_desc().file_size;
|
||||
}
|
||||
total_layer_size += layer.layer_desc().file_size;
|
||||
let resident_layer = layer.download_and_keep_resident(ctx).await?;
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
let should_yield = self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some();
|
||||
if should_yield {
|
||||
tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
let resident_layer = layer
|
||||
.download_and_keep_resident(ctx)
|
||||
.await
|
||||
.context("failed to download and keep resident layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
}
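The yield check inside this loop relies on polling the L0 compaction trigger without blocking: `Notified::now_or_never()` is `Some` only if a notification has already fired. Here is a self-contained sketch of that pattern; the names are illustrative, not the pageserver's API.

```rust
use futures::FutureExt; // for now_or_never()
use tokio::sync::Notify;

#[tokio::main]
async fn main() {
    let l0_compaction_trigger = Notify::new();

    // Nothing has asked for L0 compaction yet: the check is a cheap no-op.
    assert!(l0_compaction_trigger.notified().now_or_never().is_none());

    // Pretend the L0 path signalled that it wants to run.
    l0_compaction_trigger.notify_one();

    // The next non-blocking poll observes the stored permit and we yield.
    let should_yield = l0_compaction_trigger.notified().now_or_never().is_some();
    assert!(should_yield);
    println!("preempting gc-compaction: yield for L0");
}
```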
|
||||
info!(
|
||||
@@ -2843,19 +2938,36 @@ impl Timeline {
|
||||
);
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
let layer = resident_layer
|
||||
.get_as_delta(ctx)
|
||||
.await
|
||||
.context("failed to get delta layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
delta_layers.push(layer);
|
||||
} else {
|
||||
let layer = resident_layer.get_as_image(ctx).await?;
|
||||
let layer = resident_layer
|
||||
.get_as_image(ctx)
|
||||
.await
|
||||
.context("failed to get image layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
|
||||
let (dense_ks, sparse_ks) = self
|
||||
.collect_gc_compaction_keyspace()
|
||||
.await
|
||||
.context("failed to collect gc compaction keyspace")
|
||||
.map_err(CompactionError::Other)?;
|
||||
let mut merge_iter = FilterIterator::create(
|
||||
MergeIterator::create(&delta_layers, &image_layers, ctx),
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
)
|
||||
.context("failed to create filter iterator")
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
let time_download_layer = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
// Step 2: Produce images+deltas.
|
||||
let mut accumulated_values = Vec::new();
|
||||
@@ -2874,7 +2986,9 @@ impl Timeline {
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to create image layer writer")
|
||||
.map_err(CompactionError::Other)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
@@ -2887,7 +3001,9 @@ impl Timeline {
|
||||
lowest_retain_lsn..end_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RewritingLayers {
|
||||
@@ -2927,9 +3043,28 @@ impl Timeline {
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
|
||||
let mut keys_processed = 0;
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter
|
||||
.next_with_trace()
|
||||
.await
|
||||
.context("failed to get next key-value pair")
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
keys_processed += 1;
|
||||
if keys_processed % 1000 == 0 {
|
||||
let should_yield = self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some();
|
||||
if should_yield {
|
||||
tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
}
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
// If this shard does not need to store this key, simply skip it.
|
||||
@@ -2960,7 +3095,9 @@ impl Timeline {
|
||||
desc.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
}
|
||||
rewriter.before.as_mut().unwrap()
|
||||
@@ -2975,14 +3112,20 @@ impl Timeline {
|
||||
desc.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
}
|
||||
rewriter.after.as_mut().unwrap()
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
rewriter.put_value(key, lsn, val, ctx).await?;
|
||||
rewriter
|
||||
.put_value(key, lsn, val, ctx)
|
||||
.await
|
||||
.context("failed to put value")
|
||||
.map_err(CompactionError::Other)?;
|
||||
continue;
|
||||
}
|
||||
match val {
|
||||
@@ -3005,9 +3148,13 @@ impl Timeline {
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
.map_err(CompactionError::Other)?;
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
@@ -3016,7 +3163,9 @@ impl Timeline {
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to pipe to delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
accumulated_values.clear();
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
@@ -3034,9 +3183,14 @@ impl Timeline {
|
||||
job_desc.gc_cutoff,
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
|
||||
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
.map_err(CompactionError::Other)?;
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
@@ -3045,21 +3199,36 @@ impl Timeline {
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to pipe to delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
// end: move the above part to the loop body
|
||||
|
||||
let time_main_loop = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
let mut rewrote_delta_layers = Vec::new();
|
||||
for (key, writers) in delta_layer_rewriters {
|
||||
if let Some(delta_writer_before) = writers.before {
|
||||
let (desc, path) = delta_writer_before
|
||||
.finish(job_desc.compaction_key_range.start, ctx)
|
||||
.await?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
.await
|
||||
.context("failed to finish delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.context("failed to finish creating delta layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
rewrote_delta_layers.push(layer);
|
||||
}
|
||||
if let Some(delta_writer_after) = writers.after {
|
||||
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
let (desc, path) = delta_writer_after
|
||||
.finish(key.key_range.end, ctx)
|
||||
.await
|
||||
.context("failed to finish delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.context("failed to finish creating delta layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
rewrote_delta_layers.push(layer);
|
||||
}
|
||||
}
|
||||
@@ -3074,7 +3243,9 @@ impl Timeline {
|
||||
let end_key = job_desc.compaction_key_range.end;
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, end_key, discard)
|
||||
.await?
|
||||
.await
|
||||
.context("failed to finish image layer writer")
|
||||
.map_err(CompactionError::Other)?
|
||||
} else {
|
||||
drop(writer);
|
||||
Vec::new()
|
||||
@@ -3086,7 +3257,9 @@ impl Timeline {
|
||||
let produced_delta_layers = if !dry_run {
|
||||
delta_layer_writer
|
||||
.finish_with_discard_fn(self, ctx, discard)
|
||||
.await?
|
||||
.await
|
||||
.context("failed to finish delta layer writer")
|
||||
.map_err(CompactionError::Other)?
|
||||
} else {
|
||||
drop(delta_layer_writer);
|
||||
Vec::new()
|
||||
@@ -3098,6 +3271,13 @@ impl Timeline {
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
let produced_image_layers_len = produced_image_layers.len();
|
||||
|
||||
let layer_selection_by_key = job_desc
|
||||
.selected_layers
|
||||
.iter()
|
||||
.map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
for action in produced_delta_layers {
|
||||
match action {
|
||||
BatchWriterResult::Produced(layer) => {
|
||||
@@ -3111,8 +3291,16 @@ impl Timeline {
|
||||
if cfg!(debug_assertions) {
|
||||
info!("discarded delta layer: {}", l);
|
||||
}
|
||||
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
|
||||
stat.discard_delta_layer(layer_desc.file_size());
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
|
||||
l
|
||||
);
|
||||
stat.discard_delta_layer(0);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_delta_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3121,6 +3309,9 @@ impl Timeline {
|
||||
"produced rewritten delta layer: {}",
|
||||
layer.layer_desc().key()
|
||||
);
|
||||
// For now, we include rewritten delta layer size in the "produce_delta_layer". We could
|
||||
// make it a separate statistics in the future.
|
||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||
}
|
||||
compact_to.extend(rewrote_delta_layers);
|
||||
for action in produced_image_layers {
|
||||
@@ -3132,8 +3323,16 @@ impl Timeline {
|
||||
}
|
||||
BatchWriterResult::Discarded(l) => {
|
||||
debug!("discarded image layer: {}", l);
|
||||
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
|
||||
stat.discard_image_layer(layer_desc.file_size());
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
|
||||
l
|
||||
);
|
||||
stat.discard_image_layer(0);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_image_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3166,7 +3365,9 @@ impl Timeline {
|
||||
&layer.layer_desc().key_range,
|
||||
&job_desc.compaction_key_range,
|
||||
) {
|
||||
bail!("violated constraint: image layer outside of compaction key range");
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"violated constraint: image layer outside of compaction key range"
|
||||
)));
|
||||
}
|
||||
if !fully_contains(
|
||||
&job_desc.compaction_key_range,
|
||||
@@ -3179,13 +3380,25 @@ impl Timeline {
|
||||
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
|
||||
let time_final_phase = timer.elapsed();
|
||||
|
||||
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
|
||||
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
|
||||
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
|
||||
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
|
||||
stat.time_analyze_secs = time_analyze.as_secs_f64();
|
||||
stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
|
||||
stat.finalize();
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)?
|
||||
serde_json::to_string(&stat)
|
||||
.context("failed to serialize gc-compaction statistics")
|
||||
.map_err(CompactionError::Other)?
|
||||
);
|
||||
|
||||
if dry_run {
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -3220,10 +3433,10 @@ impl Timeline {
|
||||
// the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
|
||||
// in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
|
||||
if let Some(err) = check_valid_layermap(&final_layers) {
|
||||
bail!(
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
|
||||
err
|
||||
);
|
||||
)));
|
||||
}
|
||||
|
||||
// Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
|
||||
@@ -3275,7 +3488,9 @@ impl Timeline {
|
||||
// find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
|
||||
// be batched into `schedule_compaction_update`.
|
||||
let disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
self.schedule_uploads(disk_consistent_lsn, None)?;
|
||||
self.schedule_uploads(disk_consistent_lsn, None)
|
||||
.context("failed to schedule uploads")
|
||||
.map_err(CompactionError::Other)?;
|
||||
// If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
|
||||
// of `compact_from`.
|
||||
let compact_from = {
|
||||
@@ -3302,7 +3517,7 @@ impl Timeline {
|
||||
|
||||
drop(gc_lock);
|
||||
|
||||
Ok(())
|
||||
Ok(CompactionOutcome::Done)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -306,6 +306,7 @@ impl DeleteTimelineFlow {
|
||||
CreateTimelineCause::Delete,
|
||||
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
|
||||
None, // doesn't matter what we put here
|
||||
None, // doesn't matter what we put here
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//! An efficient way to keep the timeline gate open without preventing
|
||||
//! timeline shutdown for longer than a single call to a timeline method.
|
||||
//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
|
||||
//!
|
||||
//! # Motivation
|
||||
//!
|
||||
@@ -19,27 +18,32 @@
|
||||
//! we hold the Timeline gate open while we're invoking the method on the
|
||||
//! Timeline object.
|
||||
//!
|
||||
//! However, we want to avoid the overhead of entering the gate for every
|
||||
//! method invocation.
|
||||
//!
|
||||
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||
//! resolve the shard for every request. Instead, we want to cache the
|
||||
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||
//! that get routed to that shard.
|
||||
//! We want to avoid the overhead of doing, for each incoming request,
|
||||
//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
|
||||
//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
|
||||
//! release the mgr rwlock before doing any request processing work
|
||||
//! - re-entering the Timeline gate for each Timeline method invocation.
|
||||
//!
|
||||
//! Regardless of how we accomplish the above, it should not
|
||||
//! prevent the Timeline from shutting down promptly.
|
||||
//!
|
||||
//!
|
||||
//! # Design
//!
//! ## Data Structures
//!
//! There are three user-facing data structures:
//! There are two concepts expressed as associated types in the `Types` trait:
//! - `TenantManager`: the thing that performs the expensive work. It produces
//!   a `Timeline` object, which is the other associated type.
//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
//!
//! There are three user-facing data structures exposed by this module:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
//!   trying to upgrade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
//!   trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always
//!   point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
//!
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the
|
||||
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
|
||||
//! A cache miss means we call Types::TenantManager::resolve for shard routing,
|
||||
//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
|
||||
//! resolve() is the object we want to cache, and return `Handle`s to for subsequent `Cache::get` calls.
|
||||
//!
|
||||
//! We wrap the object returned from resolve() in an `Arc` and store that inside the
|
||||
//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
|
||||
//! and a strong ref in the `PerTimelineState`.
|
||||
//! A strong ref is returned wrapped in a `Handle`.
|
||||
//! Another strong ref is returned wrapped in a `Handle`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the weak ref in the cache.
|
||||
@@ -78,51 +85,51 @@
|
||||
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
|
||||
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
|
||||
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
|
||||
//!
|
||||
//! # Performance
//!
//! Remember from the introductory section:
//!
//! > However, we want to avoid the overhead of entering the gate for every
//! > method invocation.
//! > We want to avoid the overhead of doing, for each incoming request,
//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
//! >   release the mgr rwlock before doing any request processing work
//! > - re-entering the Timeline gate for each Timeline method invocation.
//!
//! Why do we want to avoid that?
//! Because the gate is a shared location in memory and entering it involves
//! bumping refcounts, which leads to cache contention if done frequently
//! from multiple cores in parallel.
//! All of these boil down to some state that is either globally shared among all shards
//! or state shared among all tasks that serve a particular timeline.
//! It is either protected by RwLock or manipulated via atomics.
//! Even atomics are costly when shared across multiple cores.
//! So, we want to avoid any permanent need for coordination between page_service tasks.
//!
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
//! That `Arc` is private to the `HandleInner` and hence to the connection.
//! The solution is to add indirection: we wrap the Types::Timeline object that is
//! returned by Types::TenantManager into an Arc that is private to the `HandleInner`
//! and hence to the single Cache / page_service connection.
//! (Review the "Data Structures" section if that is unclear to you.)
//!
//! A `WeakHandle` is a weak ref to the `HandleInner`.
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
//! Again, this manipulation of ref counts is cheap because `Arc` is private to the connection.
//!
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
//! Again, this is cheap because the `Arc` is private to the connection.
//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
//! The Mutex is not contended because it is private to the connection.
//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
//! Arc's refcounts are private to the connection.
//!
//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
//!
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
//! on the shared memory location that tracks the refcount of the `Arc<Timeline>`.
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
//!
//! # Shutdown
|
||||
//!
|
||||
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! Further, there is this cycle:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! The former cycle is a memory leak if not broken.
|
||||
@@ -135,9 +142,12 @@
|
||||
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
|
||||
//! - Connection shutdown (=> dropping the `Cache`).
|
||||
//!
|
||||
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
|
||||
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
|
||||
//! `Arc<GateGuard>`.
|
||||
//! Both transition the `HandleInner` from [`HandleInner::Open`] to
|
||||
//! [`HandleInner::ShutDown`], which drops the only long-lived
|
||||
//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
|
||||
//! is dropped, the `Types::Timeline` gets dropped and thereby
|
||||
//! the `GateGuard` and the `Arc<Timeline>` that it stores,
|
||||
//! thereby breaking both cycles.
|
||||
//!
|
||||
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
|
||||
//! thereby breaking the cycle.
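A toy model of the upgrade/downgrade mechanics described above, with a plain `String` standing in for `Types::Timeline` and none of the real gate or shard-routing plumbing; this is purely a sketch, not the pageserver API.

```rust
use std::sync::{Arc, Mutex, Weak};

// Open holds the cached object; ShutDown is what PerTimelineState::shutdown
// (or Cache::drop) leaves behind, breaking the reference cycle.
enum HandleInner {
    Open(Arc<String>),
    ShutDown,
}

struct Handle {
    open: Arc<String>,
    _inner: Arc<Mutex<HandleInner>>,
}

struct WeakHandle {
    inner: Weak<Mutex<HandleInner>>,
}

impl WeakHandle {
    fn upgrade(&self) -> Option<Handle> {
        let inner = self.inner.upgrade()?; // gone once the cache dropped it
        let open = match &*inner.lock().expect("poisoned") {
            // Cheap: this Arc's refcount is private to the connection.
            HandleInner::Open(o) => Arc::clone(o),
            HandleInner::ShutDown => return None,
        };
        Some(Handle { open, _inner: inner })
    }
}

fn main() {
    let timeline = Arc::new("timeline".to_owned());
    let inner = Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
    let weak = WeakHandle { inner: Arc::downgrade(&inner) };

    let handle = weak.upgrade().expect("still open");
    println!("deref target: {}", handle.open);

    // Simulate shutdown: drop the long-lived Arc standing in for Types::Timeline.
    *inner.lock().expect("poisoned") = HandleInner::ShutDown;
    assert!(weak.upgrade().is_none());
}
```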
|
||||
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
|
||||
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||
type TenantManagerError: Sized + std::fmt::Debug;
|
||||
type TenantManager: TenantManager<Self> + Sized;
|
||||
type Timeline: ArcTimeline<Self> + Sized;
|
||||
type Timeline: Timeline<Self> + Sized;
|
||||
}
|
||||
|
||||
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
|
||||
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types> {
|
||||
timeline: Arc<T::Timeline>,
|
||||
#[allow(dead_code)] // the field exists to keep the gate open
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
inner: Arc<Mutex<HandleInner<T>>>,
|
||||
open: Arc<T::Timeline>,
|
||||
}
|
||||
pub(crate) struct WeakHandle<T: Types> {
|
||||
inner: Weak<Mutex<HandleInner<T>>>,
|
||||
}
|
||||
|
||||
enum HandleInner<T: Types> {
|
||||
KeepingTimelineGateOpen {
|
||||
#[allow(dead_code)]
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
timeline: Arc<T::Timeline>,
|
||||
},
|
||||
Open(Arc<T::Timeline>),
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
|
||||
}
|
||||
|
||||
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||
pub(crate) trait Timeline<T: Types> {
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum GetError<T: Types> {
|
||||
TenantManager(T::TenantManagerError),
|
||||
TimelineGateClosed,
|
||||
PerTimelineStateShutDown,
|
||||
}
|
||||
|
||||
@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
|
||||
}
|
||||
|
||||
trace!("creating new HandleInner");
|
||||
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
|
||||
gate_guard: Arc::new(
|
||||
// this enter() is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::gate refcounts
|
||||
match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
},
|
||||
),
|
||||
// this clone is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::clone refcounts
|
||||
timeline: Arc::new(timeline.clone()),
|
||||
}));
|
||||
let timeline = Arc::new(timeline);
|
||||
let handle_inner_arc =
|
||||
Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
|
||||
let handle_weak = WeakHandle {
|
||||
inner: Arc::downgrade(&handle_inner_arc),
|
||||
};
|
||||
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
|
||||
};
|
||||
let lock_guard = inner.lock().expect("poisoned");
|
||||
match &*lock_guard {
|
||||
HandleInner::KeepingTimelineGateOpen {
|
||||
timeline,
|
||||
gate_guard,
|
||||
} => {
|
||||
let gate_guard = Arc::clone(gate_guard);
|
||||
let timeline = Arc::clone(timeline);
|
||||
HandleInner::Open(open) => {
|
||||
let open = Arc::clone(open);
|
||||
drop(lock_guard);
|
||||
Ok(Handle {
|
||||
timeline,
|
||||
gate_guard,
|
||||
inner,
|
||||
})
|
||||
Ok(Handle { open, inner })
|
||||
}
|
||||
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
|
||||
}
|
||||
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
&self.open
|
||||
}
|
||||
}
|
||||
|
||||
@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||
///
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive.
|
||||
/// That's ok because they're short-lived. See module-level comment for details.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(super) fn shutdown(&self) {
|
||||
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
|
||||
impl<T: Types> HandleInner<T> {
|
||||
fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
|
||||
match std::mem::replace(self, HandleInner::ShutDown) {
|
||||
HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
|
||||
HandleInner::Open(timeline) => Some(timeline),
|
||||
HandleInner::ShutDown => {
|
||||
// Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
|
||||
// may do it concurrently, but locking rules disallow holding per-timeline-state lock and
|
||||
@@ -631,6 +614,7 @@ mod tests {
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use utils::shard::ShardCount;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -641,7 +625,7 @@ mod tests {
|
||||
impl Types for TestTypes {
|
||||
type TenantManagerError = anyhow::Error;
|
||||
type TenantManager = StubManager;
|
||||
type Timeline = Arc<StubTimeline>;
|
||||
type Timeline = Entered;
|
||||
}
|
||||
|
||||
struct StubManager {
|
||||
@@ -656,17 +640,19 @@ mod tests {
|
||||
myself: Weak<StubTimeline>,
|
||||
}
|
||||
|
||||
struct Entered {
|
||||
timeline: Arc<StubTimeline>,
|
||||
#[allow(dead_code)] // it's stored here to keep the gate open
|
||||
gate_guard: Arc<GateGuard>,
|
||||
}
|
||||
|
||||
impl StubTimeline {
|
||||
fn getpage(&self) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
||||
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate {
|
||||
&self.gate
|
||||
}
|
||||
|
||||
impl Timeline<TestTypes> for Entered {
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||
ShardTimelineId {
|
||||
shard_index: self.shard.shard_index(),
|
||||
@@ -688,20 +674,34 @@ mod tests {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> anyhow::Result<Arc<StubTimeline>> {
|
||||
) -> anyhow::Result<Entered> {
|
||||
for timeline in &self.shards {
|
||||
if timeline.id == timeline_id {
|
||||
let enter_gate = || {
|
||||
let gate_guard = timeline.gate.enter()?;
|
||||
let gate_guard = Arc::new(gate_guard);
|
||||
anyhow::Ok(gate_guard)
|
||||
};
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Zero => continue,
|
||||
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Page(_) => continue,
|
||||
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Known(_) => continue,
|
||||
}
|
||||
@@ -711,6 +711,13 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for Entered {
|
||||
type Target = StubTimeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_timeline_shutdown() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
@@ -1038,7 +1045,6 @@ mod tests {
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
// Simulate 10 connections that are opened, used, and closed
|
||||
let mut used_handles = vec![];
|
||||
for _ in 0..10 {
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
let handle = {
|
||||
@@ -1050,7 +1056,6 @@ mod tests {
|
||||
handle
|
||||
};
|
||||
handle.getpage();
|
||||
used_handles.push(Arc::downgrade(&handle.timeline));
|
||||
}
|
||||
|
||||
// No handles exist, thus gates are closed and don't require shutdown.
|
||||
|
||||
@@ -61,11 +61,11 @@ impl HeatmapLayersDownloader {
|
||||
|
||||
tracing::info!(
|
||||
resident_size=%timeline.resident_physical_size(),
|
||||
heatmap_layers=%heatmap.layers.len(),
|
||||
heatmap_layers=%heatmap.all_layers().count(),
|
||||
"Starting heatmap layers download"
|
||||
);
|
||||
|
||||
let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
|
||||
let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
|
||||
|layer| {
|
||||
let ctx = ctx.attached_child();
|
||||
let tl = timeline.clone();
|
||||
|
||||
@@ -8,14 +8,14 @@ use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::{AtomicLsn, Lsn};
|
||||
|
||||
use super::TimelineWriterState;
|
||||
use super::{ReadableLayer, TimelineWriterState};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
|
||||
PersistentLayerKey, ResidentLayer,
|
||||
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
@@ -37,6 +37,21 @@ impl Default for LayerManager {
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
|
||||
match weak {
|
||||
ReadableLayerWeak::PersistentLayer(desc) => {
|
||||
ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
|
||||
}
|
||||
ReadableLayerWeak::InMemoryLayer(desc) => {
|
||||
let inmem = self
|
||||
.layer_map()
|
||||
.expect("no concurrent shutdown")
|
||||
.in_memory_layer(&desc);
|
||||
ReadableLayer::InMemoryLayer(inmem)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
@@ -470,6 +485,25 @@ impl OpenLayerManager {
|
||||
mapping.remove(layer);
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
|
||||
match layer.info() {
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
assert!(self.layer_map.open_layer.is_none());
|
||||
self.layer_map.open_layer = Some(layer);
|
||||
}
|
||||
InMemoryLayerInfo::Frozen { lsn_start, .. } => {
|
||||
if let Some(last) = self.layer_map.frozen_layers.back() {
|
||||
assert!(last.get_lsn_range().end <= lsn_start);
|
||||
}
|
||||
|
||||
self.layer_map.frozen_layers.push_back(layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
|
||||
@@ -1369,6 +1369,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->limit;
|
||||
break;
|
||||
case 8:
|
||||
key = "file_cache_chunk_size_pages";
|
||||
value = BLOCKS_PER_CHUNK;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
@@ -1026,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n
|
||||
if (!neon_prefetch_response_usable(&lsns[i], slot))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Ignore errors
|
||||
*/
|
||||
if (slot->response->tag != T_NeonGetPageResponse)
|
||||
{
|
||||
if (slot->response->tag != T_NeonErrorResponse)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
|
||||
T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
|
||||
prefetch_set_unused(ring_index);
|
||||
BITMAP_SET(mask, i);
|
||||
|
||||
@@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
static char *FormatEvents(WalProposer *wp, uint32 events);
|
||||
static void UpdateDonorShmem(WalProposer *wp);
|
||||
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
|
||||
static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst);
|
||||
static void MembershipConfigurationFree(MembershipConfiguration *mconf);
|
||||
|
||||
WalProposer *
|
||||
@@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
wp->config = config;
|
||||
wp->api = api;
|
||||
|
||||
for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep)
|
||||
wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
|
||||
|
||||
/*
|
||||
* If the safekeepers list starts with "g#", parse the generation number
* followed by ':'.
|
||||
*/
|
||||
if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0)
|
||||
{
|
||||
char *endptr;
|
||||
|
||||
errno = 0;
|
||||
wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10);
|
||||
if (errno != 0)
|
||||
{
|
||||
wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
|
||||
}
|
||||
/* Skip past : to the first hostname. */
|
||||
host = endptr + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
host = wp->config->safekeepers_list;
|
||||
}
|
||||
wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation);
|
||||
|
||||
for (; host != NULL && *host != '\0'; host = sep)
|
||||
{
|
||||
port = strchr(host, ':');
|
||||
if (port == NULL)
|
||||
@@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp)
|
||||
pfree(wp);
|
||||
}
|
||||
|
||||
static bool
|
||||
WalProposerGenerationsEnabled(WalProposer *wp)
|
||||
{
|
||||
return wp->safekeepers_generation != 0;
|
||||
}
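The accepted list format above appears to be `g#<generation>:host1:port1,host2:port2,...`, with the `g#` prefix optional (its absence means generations are disabled). Below is a small sketch of that parse, written in Rust purely for illustration and using hypothetical endpoint names; the real walproposer does this in C with `strtoul`.

```rust
fn parse_safekeepers_list(list: &str) -> Result<(u32, Vec<&str>), String> {
    // Split off an optional "g#<generation>:" prefix, then the comma-separated
    // host:port endpoints. Mirrors the logic above only loosely.
    let (generation, hosts) = match list.strip_prefix("g#") {
        Some(rest) => {
            let (gen_str, hosts) = rest
                .split_once(':')
                .ok_or("missing ':' after generation number")?;
            let generation = gen_str
                .parse::<u32>()
                .map_err(|e| format!("bad generation number: {e}"))?;
            (generation, hosts)
        }
        None => (0, list), // no generation prefix: generations disabled
    };
    Ok((generation, hosts.split(',').collect()))
}

fn main() {
    // Hypothetical GUC values, not taken from a real deployment.
    assert_eq!(
        parse_safekeepers_list("g#3:sk-1:6500,sk-2:6500").unwrap(),
        (3, vec!["sk-1:6500", "sk-2:6500"])
    );
    assert_eq!(
        parse_safekeepers_list("sk-1:6500").unwrap(),
        (0, vec!["sk-1:6500"])
    );
}
```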
|
||||
|
||||
/*
|
||||
* Create new AppendRequest message and start sending it. This function is
|
||||
* called from walsender every time the new WAL is available.
|
||||
@@ -600,10 +632,14 @@ static void
|
||||
SendStartWALPush(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
/* Forbid implicit timeline creation if generations are enabled. */
|
||||
char *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true";
|
||||
#define CMD_LEN 512
|
||||
char cmd[CMD_LEN];
|
||||
|
||||
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
|
||||
|
||||
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation);
|
||||
if (!wp->api.conn_send_query(sk, cmd))
|
||||
{
|
||||
wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s",
|
||||
@@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
|
||||
pfree(mconf_toml);
|
||||
|
||||
/*
|
||||
* Adopt mconf of safekeepers if it is higher. TODO: mconf change should
|
||||
* restart wp if it started voting.
|
||||
*/
|
||||
if (sk->greetResponse.mconf.generation > wp->mconf.generation)
|
||||
{
|
||||
MembershipConfigurationFree(&wp->mconf);
|
||||
MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
|
||||
/* full conf was just logged above */
|
||||
wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
|
||||
}
|
||||
|
||||
/* Protocol is all good, move to voting. */
|
||||
sk->state = SS_VOTING;
|
||||
|
||||
@@ -1896,7 +1944,8 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf
|
||||
pq_sendint64_le(buf, m->termHistory->entries[i].term);
|
||||
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
|
||||
}
|
||||
/*
|
||||
|
||||
/*
|
||||
* Removed timeline_start_lsn. Still send it as a valid
|
||||
* value until safekeepers taking it from term history are
|
||||
* deployed.
|
||||
@@ -2162,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
}
|
||||
}
|
||||
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
|
||||
return false; /* keep the compiler quiet */
|
||||
return false; /* keep the compiler quiet */
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2570,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf)
|
||||
return s.data;
|
||||
}
|
||||
|
||||
static void
|
||||
MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst)
|
||||
{
|
||||
dst->generation = src->generation;
|
||||
dst->members.len = src->members.len;
|
||||
dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len);
|
||||
memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len);
|
||||
dst->new_members.len = src->new_members.len;
|
||||
dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len);
|
||||
memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len);
|
||||
}
|
||||
|
||||
static void
|
||||
MembershipConfigurationFree(MembershipConfiguration *mconf)
|
||||
{
|
||||
|
||||
@@ -160,7 +160,10 @@ typedef struct MemberSet
|
||||
SafekeeperId *m; /* ids themselves */
|
||||
} MemberSet;
|
||||
|
||||
/* Timeline safekeeper membership configuration. */
|
||||
/*
|
||||
* Timeline safekeeper membership configuration as sent in the
|
||||
* protocol.
|
||||
*/
|
||||
typedef struct MembershipConfiguration
|
||||
{
|
||||
Generation generation;
|
||||
@@ -761,8 +764,22 @@ typedef struct WalProposer
|
||||
/* (n_safekeepers / 2) + 1 */
|
||||
int quorum;
|
||||
|
||||
/*
|
||||
* Generation of the membership conf of which safekeepers[] are presumably
* members. To make cplane life a bit easier and to give tests more control
* over which safekeepers walproposer gets connected to, the neon.safekeepers
* GUC doesn't provide the full mconf, only the list of endpoints to connect
* to. We still would like to know the generation associated with it because
* 1) we need some handle to enforce using generations in walproposer, and a
* non-zero value of this serves that purpose; 2) currently we don't do
* that, but in theory walproposer can update the list of safekeepers to
* connect to upon receiving mconf from safekeepers, and the generation
* number must be checked to see which list is newer.
|
||||
*/
|
||||
Generation safekeepers_generation;
|
||||
/* Number of occupied slots in safekeepers[] */
|
||||
int n_safekeepers;
|
||||
/* Safekeepers walproposer is connecting to. */
|
||||
Safekeeper safekeeper[MAX_SAFEKEEPERS];
|
||||
|
||||
/* WAL has been generated up to this point */
Some files were not shown because too many files have changed in this diff.