Merge pull request #10903 from neondatabase/rc/release-proxy/2025-02-20

Proxy release 2025-02-20
This commit is contained in:
Folke Behrens
2025-02-20 10:37:47 +01:00
committed by GitHub
147 changed files with 4803 additions and 1442 deletions

View File

@@ -28,3 +28,7 @@ config-variables:
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
- SLACK_ON_CALL_STORAGE_STAGING_STREAM
- SLACK_CICD_CHANNEL_ID
- SLACK_STORAGE_CHANNEL_ID
- NEON_DEV_AWS_ACCOUNT_ID
- NEON_PROD_AWS_ACCOUNT_ID
- AWS_ECR_REGION

View File

@@ -19,7 +19,11 @@ inputs:
default: '[1, 1]'
# settings below only needed if you want the project to be sharded from the beginning
shard_split_project:
description: 'by default new projects are not shard-split, specify true to shard-split'
description: 'by default new projects are not shard-split initiailly, but only when shard-split threshold is reached, specify true to explicitly shard-split initially'
required: false
default: 'false'
disable_sharding:
description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding'
required: false
default: 'false'
admin_api_key:
@@ -107,6 +111,21 @@ runs:
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}"
fi
if [ "${DISABLE_SHARDING}" = "true" ]; then
# determine tenant ID
TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}"
echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy"
echo "with body {\"scheduling\": \"Essential\"}"
# we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
curl -X PUT \
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-d "{\"scheduling\": \"Essential\"}"
fi
env:
API_HOST: ${{ inputs.api_host }}
@@ -116,6 +135,7 @@ runs:
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }}
DISABLE_SHARDING: ${{ inputs.disable_sharding }}
ADMIN_API_KEY: ${{ inputs.admin_api_key }}
SHARD_COUNT: ${{ inputs.shard_count }}
STRIPE_SIZE: ${{ inputs.stripe_size }}

View File

@@ -348,6 +348,10 @@ jobs:
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
# until they are forcibly stopped by the stricter `timeout-minutes` limit.
extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

View File

@@ -2,7 +2,7 @@ name: Push images to Container Registry
on:
workflow_call:
inputs:
# Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
# Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
image-map:
description: JSON map of images, mapping from a source image to an array of target images that should be pushed.
required: true

View File

@@ -68,7 +68,7 @@ jobs:
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -263,8 +263,9 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs
if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled())
needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -858,14 +859,17 @@ jobs:
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
AWS_REGION: "${{ vars.AWS_ECR_REGION }}"
push-neon-image-dev:
needs: [ generate-image-maps, neon-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -880,8 +884,8 @@ jobs:
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -897,8 +901,8 @@ jobs:
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -914,8 +918,8 @@ jobs:
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -1028,7 +1032,7 @@ jobs:
statuses: write
contents: write
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest
steps:
- uses: actions/checkout@v4
@@ -1177,6 +1181,22 @@ jobs:
exit 1
fi
notify-storage-release-deploy-failure:
needs: [ deploy ]
# We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs.
if: github.ref_name == 'release' && needs.deploy.result != 'success' && always()
runs-on: ubuntu-22.04
steps:
- name: Post release-deploy failure to team-storage slack channel
uses: slackapi/slack-github-action@v2
with:
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
text: |
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]
@@ -1273,7 +1293,7 @@ jobs:
done
pin-build-tools-image:
needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ]
needs: [ build-build-tools-image, test-images, build-and-test-locally ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:

View File

@@ -27,7 +27,7 @@ env:
jobs:
tag:
runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}

View File

@@ -0,0 +1,76 @@
name: Force Test Upgrading of Extension
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '45 2 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow
group: ${{ github.workflow }}
cancel-in-progress: true
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
jobs:
regress:
strategy:
fail-fast: false
matrix:
pg-version: [16, 17]
runs-on: small
steps:
- uses: actions/checkout@v4
with:
submodules: false
- name: Get the last compute release tag
id: get-last-compute-release-tag
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/releases")
echo tag=${tag} >> ${GITHUB_OUTPUT}
- name: Test extension upgrade
timeout-minutes: 20
env:
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh
- name: Print logs and clean up
if: always()
run: |
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
- name: Post to the Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
slack-message: |
Test upgrading of extensions: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -32,18 +32,27 @@ jobs:
- target_project: new_empty_project_stripe_size_2048
stripe_size: 2048 # 16 MiB
postgres_version: 16
disable_sharding: false
- target_project: new_empty_project_stripe_size_32768
stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold
# while here it is sharded from the beginning with a shard size of 256 MiB
disable_sharding: false
postgres_version: 16
- target_project: new_empty_project
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
disable_sharding: false
postgres_version: 16
- target_project: new_empty_project
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
disable_sharding: false
postgres_version: 17
- target_project: large_existing_project
stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project
disable_sharding: false
postgres_version: 16
- target_project: new_empty_project_unsharded
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
disable_sharding: true
postgres_version: 16
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
@@ -96,6 +105,7 @@ jobs:
admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}
shard_count: 8
stripe_size: ${{ matrix.stripe_size }}
disable_sharding: ${{ matrix.disable_sharding }}
- name: Initialize Neon project
if: ${{ startsWith(matrix.target_project, 'new_empty_project') }}

View File

@@ -33,10 +33,6 @@ concurrency:
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
jobs:
check-manifests:
runs-on: ubuntu-22.04
@@ -46,11 +42,14 @@ jobs:
steps:
- name: Check if we really need to pin the image
id: check-manifests
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
run: |
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json"
docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json"
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then
skip=true
else
skip=false
@@ -64,55 +63,34 @@ jobs:
# use format(..) to catch both inputs.force = true AND inputs.force = 'true'
if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
runs-on: ubuntu-22.04
permissions:
id-token: write # for `azure/login` and aws auth
id-token: write # Required for aws/azure login
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
- name: Login to ACR
run: |
az acr login --name=neoneastus2
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
env:
DEFAULT_DEBIAN_VERSION: bookworm
run: |
for debian_version in bullseye bookworm; do
tags=()
tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}")
tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}")
tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}")
if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
tags+=("-t" "neondatabase/build-tools:${TO_TAG}")
tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}")
tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}")
fi
docker buildx imagetools create "${tags[@]}" \
neondatabase/build-tools:${FROM_TAG}-${debian_version}
done
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [
"docker.io/neondatabase/build-tools:pinned-bullseye",
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye",
"${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye"
],
"docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [
"docker.io/neondatabase/build-tools:pinned-bookworm",
"docker.io/neondatabase/build-tools:pinned",
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm",
"${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned",
"${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm",
"${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned"
]
}
aws-region: ${{ vars.AWS_ECR_REGION }}
aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

View File

@@ -0,0 +1,41 @@
name: Regenerate Postgres Settings
on:
pull_request:
types:
- opened
- synchronize
- reopened
paths:
- pgxn/neon/**.c
- vendor/postgres-v*
- vendor/revisions.json
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
pull-requests: write
jobs:
regenerate-pg-settings:
runs-on: ubuntu-22.04
steps:
- name: Add comment
uses: thollander/actions-comment-pull-request@v3
with:
comment-tag: ${{ github.job }}
pr-number: ${{ github.event.number }}
message: |
If this PR added a GUC in the Postgres fork or `neon` extension,
please regenerate the Postgres settings in the `cloud` repo:
```
make NEON_WORKDIR=path/to/neon/checkout \
-C goapp/internal/shareddomain/postgres generate
```
If you're an external contributor, a Neon employee will assist in
making sure this step is done.

View File

@@ -88,7 +88,7 @@ jobs:
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json
if [ $(jq '[.[] | select(.conclusion == "success")]' jobs.json) -eq 2 ]; then
if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then
break
fi
jq -c '.[]' jobs.json | while read -r job; do

33
Cargo.lock generated
View File

@@ -786,7 +786,7 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -815,7 +815,7 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
dependencies = [
"async-lock",
"async-trait",
@@ -834,7 +834,7 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
dependencies = [
"RustyXML",
"async-lock",
@@ -852,7 +852,7 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
dependencies = [
"RustyXML",
"azure_core",
@@ -872,7 +872,7 @@ dependencies = [
[[package]]
name = "azure_svc_blobstorage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
dependencies = [
"azure_core",
"bytes",
@@ -1029,12 +1029,6 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxcar"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
[[package]]
name = "bstr"
version = "1.5.0"
@@ -1293,6 +1287,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"jsonwebtoken",
"regex",
"remote_storage",
"serde",
@@ -1308,6 +1303,7 @@ dependencies = [
"aws-config",
"aws-sdk-kms",
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"base64 0.13.1",
"bytes",
@@ -1329,7 +1325,6 @@ dependencies = [
"opentelemetry_sdk",
"postgres",
"postgres_initdb",
"prometheus",
"regex",
"remote_storage",
"reqwest",
@@ -1348,13 +1343,13 @@ dependencies = [
"tower 0.5.2",
"tower-http",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"url",
"utils",
"uuid",
"vm_monitor",
"walkdir",
"workspace_hack",
"zstd",
]
@@ -4927,7 +4922,6 @@ dependencies = [
"aws-sdk-iam",
"aws-sigv4",
"base64 0.13.1",
"boxcar",
"bstr",
"bytes",
"camino",
@@ -4979,7 +4973,6 @@ dependencies = [
"postgres-protocol2",
"postgres_backend",
"pq_proto",
"prometheus",
"rand 0.8.5",
"rand_distr",
"rcgen",
@@ -5004,7 +4997,6 @@ dependencies = [
"smallvec",
"smol_str",
"socket2",
"strum",
"strum_macros",
"subtle",
"thiserror 1.0.69",
@@ -5019,7 +5011,6 @@ dependencies = [
"tracing",
"tracing-log",
"tracing-opentelemetry",
"tracing-serde",
"tracing-subscriber",
"tracing-utils",
"try-lock",
@@ -6460,10 +6451,13 @@ dependencies = [
"pageserver_client",
"postgres_connection",
"rand 0.8.5",
"regex",
"reqwest",
"routerify",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"safekeeper_api",
"safekeeper_client",
"scoped-futures",
"scopeguard",
"serde",
@@ -6471,6 +6465,7 @@ dependencies = [
"strum",
"strum_macros",
"thiserror 1.0.69",
"tikv-jemallocator",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -7024,14 +7019,11 @@ dependencies = [
name = "tokio-postgres2"
version = "0.1.0"
dependencies = [
"async-trait",
"byteorder",
"bytes",
"fallible-iterator",
"futures-util",
"log",
"parking_lot 0.12.1",
"percent-encoding",
"phf",
"pin-project-lite",
"postgres-protocol2",
@@ -7618,7 +7610,6 @@ dependencies = [
"hex",
"hex-literal",
"humantime",
"inferno 0.12.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",

View File

@@ -50,6 +50,14 @@ RUN set -e \
&& rm -rf pg_install/build \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
# Prepare cargo-chef recipe
FROM $REPOSITORY/$IMAGE:$TAG AS plan
WORKDIR /home/nonroot
COPY --chown=nonroot . .
RUN cargo chef prepare --recipe-path recipe.json
# Build neon binaries
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
@@ -63,9 +71,15 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
COPY --from=plan /home/nonroot/recipe.json recipe.json
ARG ADDITIONAL_RUSTFLAGS=""
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json
COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \

View File

@@ -300,6 +300,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33
ARG CARGO_DENY_VERSION=0.16.2
ARG CARGO_HACK_VERSION=0.6.33
ARG CARGO_NEXTEST_VERSION=0.9.85
ARG CARGO_CHEF_VERSION=0.1.71
ARG CARGO_DIESEL_CLI_VERSION=2.2.6
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
@@ -314,6 +315,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \
--features postgres-bundled --no-default-features && \
rm -rf /home/nonroot/.cargo/registry && \

View File

@@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \
apt install --no-install-recommends --no-install-suggests -y \
ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
$VERSION_INSTALLS \
&& apt clean && rm -rf /var/lib/apt/lists/*
@@ -1464,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
#########################################################################################
#
# Layer "pg-duckdb-pg-build"
# compile pg_duckdb extension
#
#########################################################################################
FROM build-deps AS pg_duckdb-src
WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
cd pg_duckdb-src && \
git submodule update --init --recursive && \
patch -p1 < /ext-src/pg_duckdb_v031.patch
FROM pg-build AS pg_duckdb-build
ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
#########################################################################################
#
# Layer "pg_repack"
@@ -1484,6 +1509,73 @@ WORKDIR /ext-src/pg_repack-src
RUN make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pgaudit"
# compile pgaudit extension
#
#########################################################################################
FROM build-deps AS pgaudit-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION}" in \
"v14") \
export PGAUDIT_VERSION=1.6.2 \
export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \
;; \
"v15") \
export PGAUDIT_VERSION=1.7.0 \
export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \
;; \
"v16") \
export PGAUDIT_VERSION=16.0 \
export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \
;; \
"v17") \
export PGAUDIT_VERSION=17.0 \
export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \
;; \
*) \
echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
esac && \
wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \
echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \
mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C .
FROM pg-build AS pgaudit-build
COPY --from=pgaudit-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgaudit-src
RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
#########################################################################################
#
# Layer "pgauditlogtofile"
# compile pgauditlogtofile extension
#
#########################################################################################
FROM build-deps AS pgauditlogtofile-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PGAUDITLOGTOFILE_VERSION=v1.6.4 \
export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \
;; \
*) \
echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \
esac && \
wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \
echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \
mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C .
FROM pg-build AS pgauditlogtofile-build
COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/
WORKDIR /ext-src/pgauditlogtofile-src
RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
#########################################################################################
#
# Layer "neon-ext-build"
@@ -1577,7 +1669,10 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/
#########################################################################################
#
@@ -1669,29 +1764,6 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\
&& echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
&& echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c -
#########################################################################################
#
# Layer "awscli"
#
#########################################################################################
FROM build-deps AS awscli
ARG TARGETARCH
RUN set -ex; \
if [ "${TARGETARCH}" = "amd64" ]; then \
TARGETARCH_ALT="x86_64"; \
CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \
elif [ "${TARGETARCH}" = "arm64" ]; then \
TARGETARCH_ALT="aarch64"; \
CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \
else \
echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
fi; \
curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \
unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \
/tmp/awscliv2/aws/install; \
rm -rf /tmp/awscliv2.zip /tmp/awscliv2
#########################################################################################
#
# Clean up postgres folder before inclusion
@@ -1750,7 +1822,7 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/
COPY --from=hypopg-src /ext-src/ /ext-src/
COPY --from=pg_hashids-src /ext-src/ /ext-src/
COPY --from=rum-src /ext-src/ /ext-src/
#COPY --from=pgtap-src /ext-src/ /ext-src/
COPY --from=pgtap-src /ext-src/ /ext-src/
COPY --from=ip4r-src /ext-src/ /ext-src/
COPY --from=prefix-src /ext-src/ /ext-src/
COPY --from=hll-src /ext-src/ /ext-src/
@@ -1775,11 +1847,14 @@ COPY --from=pg_partman-src /ext-src/ /ext-src/
#COPY --from=pg_repack-src /ext-src/ /ext-src/
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl\
&& apt clean && rm -rf /ext-src/*.tar.gz /var/lib/apt/lists/*
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
ENV PGPORT=55433
ENV PGUSER=cloud_admin
ENV PGDATABASE=postgres
ENV PG_VERSION=${PG_VERSION:?}
#########################################################################################
#
@@ -1861,9 +1936,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
mkdir /usr/local/download_extensions && \
chown -R postgres:postgres /usr/local/download_extensions
# aws cli is used by fast_import
COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli
# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini

View File

@@ -0,0 +1,11 @@
diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
index d777d76..af60106 100644
--- a/sql/pg_duckdb--0.2.0--0.3.0.sql
+++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;

View File

@@ -47,7 +47,9 @@ files:
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes

View File

@@ -14,6 +14,7 @@ base64.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-sdk-kms.workspace = true
aws-smithy-types.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
camino.workspace = true
@@ -46,13 +47,12 @@ tokio-postgres.workspace = true
tokio-util.workspace = true
tokio-stream.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
uuid.workspace = true
prometheus.workspace = true
walkdir.workspace = true
postgres_initdb.workspace = true
compute_api.workspace = true

View File

@@ -55,7 +55,7 @@ use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
@@ -281,6 +281,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: false,
});
}
@@ -290,6 +291,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: true,
});
}
@@ -299,8 +301,9 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
};
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(spec) => Ok(CliSpecParams {
spec,
Ok(resp) => Ok(CliSpecParams {
spec: resp.0,
compute_ctl_config: resp.1,
live_config_allowed: true,
}),
Err(e) => {
@@ -317,6 +320,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
#[allow(dead_code)]
compute_ctl_config: ComputeCtlConfig,
live_config_allowed: bool,
}
@@ -326,6 +331,7 @@ fn wait_spec(
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();

View File

@@ -25,10 +25,10 @@
//! docker push localhost:3030/localregistry/compute-node-v14:latest
//! ```
use anyhow::Context;
use anyhow::{bail, Context};
use aws_config::BehaviorVersion;
use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use clap::{Parser, Subcommand};
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{error, info, info_span, warn, Instrument};
@@ -44,32 +44,59 @@ mod s3_uri;
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
#[derive(Subcommand, Debug)]
enum Command {
/// Runs local postgres (neon binary), restores into it,
/// uploads pgdata to s3 to be consumed by pageservers
Pgdata {
/// Raw connection string to the source database. Used only in tests,
/// real scenario uses encrypted connection string in spec.json from s3.
#[clap(long)]
source_connection_string: Option<String>,
/// If specified, will not shut down the local postgres after the import. Used in local testing
#[clap(short, long)]
interactive: bool,
/// Port to run postgres on. Default is 5432.
#[clap(long, default_value_t = 5432)]
pg_port: u16, // port to run postgres on, 5432 is default
/// Number of CPUs in the system. This is used to configure # of
/// parallel worker processes, for index creation.
#[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
num_cpus: Option<usize>,
/// Amount of RAM in the system. This is used to configure shared_buffers
/// and maintenance_work_mem.
#[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
memory_mb: Option<usize>,
},
/// Runs pg_dump-pg_restore from source to destination without running local postgres.
DumpRestore {
/// Raw connection string to the source database. Used only in tests,
/// real scenario uses encrypted connection string in spec.json from s3.
#[clap(long)]
source_connection_string: Option<String>,
/// Raw connection string to the destination database. Used only in tests,
/// real scenario uses encrypted connection string in spec.json from s3.
#[clap(long)]
destination_connection_string: Option<String>,
},
}
#[derive(clap::Parser)]
struct Args {
#[clap(long)]
#[clap(long, env = "NEON_IMPORTER_WORKDIR")]
working_directory: Utf8PathBuf,
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
s3_prefix: Option<s3_uri::S3Uri>,
#[clap(long)]
source_connection_string: Option<String>,
#[clap(short, long)]
interactive: bool,
#[clap(long)]
#[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")]
pg_bin_dir: Utf8PathBuf,
#[clap(long)]
#[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")]
pg_lib_dir: Utf8PathBuf,
#[clap(long)]
pg_port: Option<u16>, // port to run postgres on, 5432 is default
/// Number of CPUs in the system. This is used to configure # of
/// parallel worker processes, for index creation.
#[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
num_cpus: Option<usize>,
/// Amount of RAM in the system. This is used to configure shared_buffers
/// and maintenance_work_mem.
#[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
memory_mb: Option<usize>,
#[clap(subcommand)]
command: Command,
}
#[serde_with::serde_as]
@@ -78,6 +105,8 @@ struct Spec {
encryption_secret: EncryptionSecret,
#[serde_as(as = "serde_with::base64::Base64")]
source_connstring_ciphertext_base64: Vec<u8>,
#[serde_as(as = "Option<serde_with::base64::Base64>")]
destination_connstring_ciphertext_base64: Option<Vec<u8>>,
}
#[derive(serde::Deserialize)]
@@ -93,192 +122,150 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
"C.UTF-8"
};
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
utils::logging::LogFormat::Plain,
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::Output::Stdout,
)?;
info!("starting");
let args = Args::parse();
// Validate arguments
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
}
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
}
let working_directory = args.working_directory;
let pg_bin_dir = args.pg_bin_dir;
let pg_lib_dir = args.pg_lib_dir;
let pg_port = args.pg_port.unwrap_or_else(|| {
info!("pg_port not specified, using default 5432");
5432
});
// Initialize AWS clients only if s3_prefix is specified
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms = aws_sdk_kms::Client::new(&config);
(Some(config), Some(kms))
} else {
(None, None)
};
// Get source connection string either from S3 spec or direct argument
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
};
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.unwrap()
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
} else {
args.source_connection_string.unwrap()
};
match tokio::fs::create_dir(&working_directory).await {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
if !is_directory_empty(&working_directory)
.await
.context("check if working directory is empty")?
{
anyhow::bail!("working directory is not empty");
} else {
// ok
}
}
Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
}
let pgdata_dir = working_directory.join("pgdata");
tokio::fs::create_dir(&pgdata_dir)
async fn decode_connstring(
kms_client: &aws_sdk_kms::Client,
key_id: &String,
connstring_ciphertext_base64: Vec<u8>,
) -> Result<String, anyhow::Error> {
let mut output = kms_client
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
connstring_ciphertext_base64,
))
.send()
.await
.context("create pgdata directory")?;
.context("decrypt connection string")?;
let pgbin = pg_bin_dir.join("postgres");
let pg_version = match get_pg_version(pgbin.as_ref()) {
PostgresMajorVersion::V14 => 14,
PostgresMajorVersion::V15 => 15,
PostgresMajorVersion::V16 => 16,
PostgresMajorVersion::V17 => 17,
};
let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser,
locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
pg_version,
initdb_bin: pg_bin_dir.join("initdb").as_ref(),
library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
pgdata: &pgdata_dir,
})
.await
.context("initdb")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext connection string")?;
// If the caller didn't specify CPU / RAM to use for sizing, default to
// number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM.
let nproc = args.num_cpus.unwrap_or_else(num_cpus::get);
let memory_mb = args.memory_mb.unwrap_or(256);
String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8")
}
// Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
// maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
// available for misc other stuff that PostgreSQL uses memory for.
let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
struct PostgresProcess {
pgdata_dir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pgbin: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
postgres_proc: Option<tokio::process::Child>,
}
//
// Launch postgres process
//
let mut postgres_proc = tokio::process::Command::new(pgbin)
.arg("-D")
.arg(&pgdata_dir)
.args(["-p", &format!("{pg_port}")])
.args(["-c", "wal_level=minimal"])
.args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
.args(["-c", "max_wal_senders=0"])
.args(["-c", "fsync=off"])
.args(["-c", "full_page_writes=off"])
.args(["-c", "synchronous_commit=off"])
.args([
"-c",
&format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
])
.args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args([
"-c",
&format!(
"effective_io_concurrency={}",
if cfg!(target_os = "macos") { 0 } else { 100 }
),
])
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
impl PostgresProcess {
fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self {
Self {
pgdata_dir,
pgbin: pg_bin_dir.join("postgres"),
pg_bin_dir,
pg_lib_dir,
postgres_proc: None,
}
}
async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> {
tokio::fs::create_dir(&self.pgdata_dir)
.await
.context("create pgdata directory")?;
let pg_version = match get_pg_version(self.pgbin.as_ref()) {
PostgresMajorVersion::V14 => 14,
PostgresMajorVersion::V15 => 15,
PostgresMajorVersion::V16 => 16,
PostgresMajorVersion::V17 => 17,
};
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser: initdb_user,
locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
pg_version,
initdb_bin: self.pg_bin_dir.join("initdb").as_ref(),
library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
pgdata: &self.pgdata_dir,
})
.await
.context("initdb")
}
async fn start(
&mut self,
initdb_user: &str,
port: u16,
nproc: usize,
memory_mb: usize,
) -> Result<&tokio::process::Child, anyhow::Error> {
self.prepare(initdb_user).await?;
// Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
// maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
// available for misc other stuff that PostgreSQL uses memory for.
let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
//
// Launch postgres process
//
let mut proc = tokio::process::Command::new(&self.pgbin)
.arg("-D")
.arg(&self.pgdata_dir)
.args(["-p", &format!("{port}")])
.args(["-c", "wal_level=minimal"])
.args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
.args(["-c", "max_wal_senders=0"])
.args(["-c", "fsync=off"])
.args(["-c", "full_page_writes=off"])
.args(["-c", "synchronous_commit=off"])
.args([
"-c",
&format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
])
.args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args(["-c", "effective_io_concurrency=100"])
.env_clear()
.env("LD_LIBRARY_PATH", &self.pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn postgres")?;
info!("spawned postgres, waiting for it to become ready");
tokio::spawn(
child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take())
.instrument(info_span!("postgres")),
);
self.postgres_proc = Some(proc);
Ok(self.postgres_proc.as_ref().unwrap())
}
async fn shutdown(&mut self) -> Result<(), anyhow::Error> {
let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap();
info!("shutdown postgres");
nix::sys::signal::kill(
Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")),
nix::sys::signal::SIGTERM,
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn postgres")?;
info!("spawned postgres, waiting for it to become ready");
tokio::spawn(
child_stdio_to_log::relay_process_output(
postgres_proc.stdout.take(),
postgres_proc.stderr.take(),
)
.instrument(info_span!("postgres")),
);
.context("signal postgres to shut down")?;
proc.wait()
.await
.context("wait for postgres to shut down")
.map(|_| ())
}
}
async fn wait_until_ready(connstring: String, create_dbname: String) {
// Create neondb database in the running postgres
let restore_pg_connstring =
format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
let start_time = std::time::Instant::now();
loop {
@@ -289,7 +276,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
std::process::exit(1);
}
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
match tokio_postgres::connect(
&connstring.replace("dbname=neondb", "dbname=postgres"),
tokio_postgres::NoTls,
)
.await
{
Ok((client, connection)) => {
// Spawn the connection handling task to maintain the connection
tokio::spawn(async move {
@@ -298,9 +290,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
}
});
match client.simple_query("CREATE DATABASE neondb;").await {
match client
.simple_query(format!("CREATE DATABASE {create_dbname};").as_str())
.await
{
Ok(_) => {
info!("created neondb database");
info!("created {} database", create_dbname);
break;
}
Err(e) => {
@@ -324,10 +319,16 @@ pub(crate) async fn main() -> anyhow::Result<()> {
}
}
}
}
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
let dumpdir = working_directory.join("dumpdir");
async fn run_dump_restore(
workdir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
source_connstring: String,
destination_connstring: String,
) -> Result<(), anyhow::Error> {
let dumpdir = workdir.join("dumpdir");
let common_args = [
// schema mapping (prob suffices to specify them on one side)
@@ -356,10 +357,18 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.arg("--no-sync")
// POSITIONAL args
// source db (db name included in connection string)
.arg(&source_connection_string)
.arg(&source_connstring)
// how we run it
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.kill_on_drop(true)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
@@ -376,24 +385,31 @@ pub(crate) async fn main() -> anyhow::Result<()> {
let st = pg_dump.wait().await.context("wait for pg_dump")?;
info!(status=?st, "pg_dump exited");
if !st.success() {
warn!(status=%st, "pg_dump failed, restore will likely fail as well");
error!(status=%st, "pg_dump failed, restore will likely fail as well");
bail!("pg_dump failed");
}
}
// TODO: do it in a streaming way, plenty of internal research done on this already
// TODO: maybe do it in a streaming way, plenty of internal research done on this already
// TODO: do the unlogged table trick
info!("restore from working directory into vanilla postgres");
{
let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore"))
.args(&common_args)
.arg("-d")
.arg(&restore_pg_connstring)
.arg(&destination_connstring)
// POSITIONAL args
.arg(&dumpdir)
// how we run it
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.kill_on_drop(true)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
@@ -411,48 +427,259 @@ pub(crate) async fn main() -> anyhow::Result<()> {
let st = pg_restore.wait().await.context("wait for pg_restore")?;
info!(status=?st, "pg_restore exited");
if !st.success() {
warn!(status=%st, "pg_restore failed, restore will likely fail as well");
}
}
// If interactive mode, wait for Ctrl+C
if args.interactive {
info!("Running in interactive mode. Press Ctrl+C to shut down.");
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
}
info!("shutdown postgres");
{
nix::sys::signal::kill(
Pid::from_raw(
i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"),
),
nix::sys::signal::SIGTERM,
)
.context("signal postgres to shut down")?;
postgres_proc
.wait()
.await
.context("wait for postgres to shut down")?;
}
// Only sync if s3_prefix was specified
if let Some(s3_prefix) = args.s3_prefix {
info!("upload pgdata");
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
.await
.context("sync dump directory to destination")?;
info!("write status");
{
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
.await
.context("sync status directory to destination")?;
error!(status=%st, "pg_restore failed, restore will likely fail as well");
bail!("pg_restore failed");
}
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
async fn cmd_pgdata(
s3_client: Option<aws_sdk_s3::Client>,
kms_client: Option<aws_sdk_kms::Client>,
maybe_s3_prefix: Option<s3_uri::S3Uri>,
maybe_spec: Option<Spec>,
source_connection_string: Option<String>,
interactive: bool,
pg_port: u16,
workdir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
num_cpus: Option<usize>,
memory_mb: Option<usize>,
) -> Result<(), anyhow::Error> {
if maybe_spec.is_none() && source_connection_string.is_none() {
bail!("spec must be provided for pgdata command");
}
if maybe_spec.is_some() && source_connection_string.is_some() {
bail!("only one of spec or source_connection_string can be provided");
}
let source_connection_string = if let Some(spec) = maybe_spec {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
decode_connstring(
kms_client.as_ref().unwrap(),
&key_id,
spec.source_connstring_ciphertext_base64,
)
.await?
}
}
} else {
source_connection_string.unwrap()
};
let superuser = "cloud_admin";
let destination_connstring = format!(
"host=localhost port={} user={} dbname=neondb",
pg_port, superuser
);
let pgdata_dir = workdir.join("pgdata");
let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone());
let nproc = num_cpus.unwrap_or_else(num_cpus::get);
let memory_mb = memory_mb.unwrap_or(256);
proc.start(superuser, pg_port, nproc, memory_mb).await?;
wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await;
run_dump_restore(
workdir.clone(),
pg_bin_dir,
pg_lib_dir,
source_connection_string,
destination_connstring,
)
.await?;
// If interactive mode, wait for Ctrl+C
if interactive {
info!("Running in interactive mode. Press Ctrl+C to shut down.");
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
}
proc.shutdown().await?;
// Only sync if s3_prefix was specified
if let Some(s3_prefix) = maybe_s3_prefix {
info!("upload pgdata");
aws_s3_sync::upload_dir_recursive(
s3_client.as_ref().unwrap(),
Utf8Path::new(&pgdata_dir),
&s3_prefix.append("/pgdata/"),
)
.await
.context("sync dump directory to destination")?;
info!("write status");
{
let status_dir = workdir.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
aws_s3_sync::upload_dir_recursive(
s3_client.as_ref().unwrap(),
&status_dir,
&s3_prefix.append("/status/"),
)
.await
.context("sync status directory to destination")?;
}
}
Ok(())
}
async fn cmd_dumprestore(
kms_client: Option<aws_sdk_kms::Client>,
maybe_spec: Option<Spec>,
source_connection_string: Option<String>,
destination_connection_string: Option<String>,
workdir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
) -> Result<(), anyhow::Error> {
let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let source = decode_connstring(
kms_client.as_ref().unwrap(),
&key_id,
spec.source_connstring_ciphertext_base64,
)
.await?;
let dest = if let Some(dest_ciphertext) =
spec.destination_connstring_ciphertext_base64
{
decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext)
.await?
} else {
bail!("destination connection string must be provided in spec for dump_restore command");
};
(source, dest)
}
}
} else {
(
source_connection_string.unwrap(),
if let Some(val) = destination_connection_string {
val
} else {
bail!("destination connection string must be provided for dump_restore command");
},
)
};
run_dump_restore(
workdir,
pg_bin_dir,
pg_lib_dir,
source_connstring,
destination_connstring,
)
.await
}
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
utils::logging::LogFormat::Json,
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::Output::Stdout,
)?;
info!("starting");
let args = Args::parse();
// Initialize AWS clients only if s3_prefix is specified
let (s3_client, kms_client) = if args.s3_prefix.is_some() {
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let s3_client = aws_sdk_s3::Client::new(&config);
let kms = aws_sdk_kms::Client::new(&config);
(Some(s3_client), Some(kms))
} else {
(None, None)
};
let spec: Option<Spec> = if let Some(s3_prefix) = &args.s3_prefix {
let spec_key = s3_prefix.append("/spec.json");
let object = s3_client
.as_ref()
.unwrap()
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
} else {
None
};
match tokio::fs::create_dir(&args.working_directory).await {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
if !is_directory_empty(&args.working_directory)
.await
.context("check if working directory is empty")?
{
bail!("working directory is not empty");
} else {
// ok
}
}
Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
}
match args.command {
Command::Pgdata {
source_connection_string,
interactive,
pg_port,
num_cpus,
memory_mb,
} => {
cmd_pgdata(
s3_client,
kms_client,
args.s3_prefix,
spec,
source_connection_string,
interactive,
pg_port,
args.working_directory,
args.pg_bin_dir,
args.pg_lib_dir,
num_cpus,
memory_mb,
)
.await?;
}
Command::DumpRestore {
source_connection_string,
destination_connection_string,
} => {
cmd_dumprestore(
kms_client,
spec,
source_connection_string,
destination_connection_string,
args.working_directory,
args.pg_bin_dir,
args.pg_lib_dir,
)
.await?;
}
}

View File

@@ -1,24 +1,102 @@
use anyhow::Context;
use camino::Utf8Path;
use camino::{Utf8Path, Utf8PathBuf};
use tokio::task::JoinSet;
use walkdir::WalkDir;
use super::s3_uri::S3Uri;
pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> {
let mut builder = tokio::process::Command::new("aws");
builder
.arg("s3")
.arg("sync")
.arg(local.as_str())
.arg(remote.to_string());
let st = builder
.spawn()
.context("spawn aws s3 sync")?
.wait()
.await
.context("wait for aws s3 sync")?;
if st.success() {
Ok(())
} else {
Err(anyhow::anyhow!("aws s3 sync failed"))
use tracing::{info, warn};
const MAX_PARALLEL_UPLOADS: usize = 10;
/// Upload all files from 'local' to 'remote'
pub(crate) async fn upload_dir_recursive(
s3_client: &aws_sdk_s3::Client,
local: &Utf8Path,
remote: &S3Uri,
) -> anyhow::Result<()> {
// Recursively scan directory
let mut dirwalker = WalkDir::new(local)
.into_iter()
.map(|entry| {
let entry = entry?;
let file_type = entry.file_type();
let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf();
Ok((file_type, path))
})
.filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| {
match e {
Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)),
Ok((file_type, _path)) if file_type.is_dir() => {
// The WalkDir iterator will recurse into directories, but we don't want
// to do anything with directories as such. There's no concept of uploading
// an empty directory to S3.
None
}
Ok((file_type, path)) if file_type.is_symlink() => {
// huh, didn't expect a symlink. Can't upload that to S3. Warn and skip.
warn!("cannot upload symlink ({})", path);
None
}
Ok((_file_type, path)) => {
// should not happen
warn!("directory entry has unexpected type ({})", path);
None
}
Err(e) => Some(Err(e)),
}
});
// Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in
// parallel.
let mut joinset = JoinSet::new();
loop {
// Could we upload more?
while joinset.len() < MAX_PARALLEL_UPLOADS {
if let Some(full_local_path) = dirwalker.next() {
let full_local_path = full_local_path?;
let relative_local_path = full_local_path
.strip_prefix(local)
.expect("all paths start from the walkdir root");
let remote_path = remote.append(relative_local_path.as_str());
info!(
"starting upload of {} to {}",
&full_local_path, &remote_path
);
let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path);
joinset.spawn(upload_task);
} else {
info!("draining upload tasks");
break;
}
}
// Wait for an upload to complete
if let Some(res) = joinset.join_next().await {
let _ = res?;
} else {
// all done!
break;
}
}
Ok(())
}
pub(crate) async fn upload_file(
s3_client: aws_sdk_s3::Client,
local_path: Utf8PathBuf,
remote: S3Uri,
) -> anyhow::Result<()> {
use aws_smithy_types::byte_stream::ByteStream;
let stream = ByteStream::from_path(&local_path).await?;
let _result = s3_client
.put_object()
.bucket(remote.bucket)
.key(&remote.key)
.body(stream)
.send()
.await?;
info!("upload of {} to {} finished", &local_path, &remote.key);
Ok(())
}

View File

@@ -11,7 +11,9 @@ use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
use compute_api::responses::{
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::spec::ComputeSpec;
// Do control plane request and return response if any. In case of error it
@@ -73,14 +75,13 @@ fn do_control_plane_request(
pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<Option<ComputeSpec>> {
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
};
let mut attempt = 1;
let mut spec: Result<Option<ComputeSpec>> = Ok(None);
info!("getting spec from control plane: {}", cp_uri);
@@ -90,7 +91,7 @@ pub fn get_spec_from_control_plane(
// - no spec for compute yet (Empty state) -> return Ok(None)
// - got spec -> return Ok(Some(spec))
while attempt < 4 {
spec = match do_control_plane_request(&cp_uri, &jwt) {
let result = match do_control_plane_request(&cp_uri, &jwt) {
Ok(spec_resp) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[
@@ -99,10 +100,10 @@ pub fn get_spec_from_control_plane(
])
.inc();
match spec_resp.status {
ControlPlaneComputeStatus::Empty => Ok(None),
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
ControlPlaneComputeStatus::Attached => {
if let Some(spec) = spec_resp.spec {
Ok(Some(spec))
Ok((Some(spec), spec_resp.compute_ctl_config))
} else {
bail!("compute is attached, but spec is empty")
}
@@ -121,10 +122,10 @@ pub fn get_spec_from_control_plane(
}
};
if let Err(e) = &spec {
if let Err(e) = &result {
error!("attempt {} to get spec failed with: {}", attempt, e);
} else {
return spec;
return result;
}
attempt += 1;
@@ -132,7 +133,9 @@ pub fn get_spec_from_control_plane(
}
// All attempts failed, return error.
spec
Err(anyhow::anyhow!(
"Exhausted all attempts to retrieve the spec from the control plane"
))
}
/// Check `pg_hba.conf` and update if needed to allow external connections.

View File

@@ -48,6 +48,8 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
@@ -880,10 +882,13 @@ impl Endpoint {
self.external_http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.body(
serde_json::to_string(&ConfigurationRequest {
spec,
compute_ctl_config: ComputeCtlConfig::default(),
})
.unwrap(),
)
.send()
.await?;

View File

@@ -838,7 +838,10 @@ impl StorageController {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest { node_id }),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
)
.await
}

View File

@@ -22,7 +22,7 @@ use pageserver_api::{
};
use pageserver_client::mgmt_api::{self};
use reqwest::{Method, StatusCode, Url};
use utils::id::{NodeId, TenantId};
use utils::id::{NodeId, TenantId, TimelineId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -239,6 +239,19 @@ enum Command {
#[arg(long)]
scheduling_policy: SkSchedulingPolicyArg,
},
/// Downloads any missing heatmap layers for all shard for a given timeline
DownloadHeatmapLayers {
/// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified,
/// the operation is performed on all shards. When a sharded tenant ID is
/// specified, the operation is only performed on the specified shard.
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
timeline_id: TimelineId,
/// Optional: Maximum download concurrency (default is 16)
#[arg(long)]
concurrency: Option<usize>,
},
}
#[derive(Parser)]
@@ -609,7 +622,10 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest { node_id: node };
let req = TenantShardMigrateRequest {
node_id: node,
migration_config: None,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -623,7 +639,10 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest { node_id: node };
let req = TenantShardMigrateRequest {
node_id: node,
migration_config: None,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -1082,7 +1101,10 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest { node_id: mv.to }),
Some(TenantShardMigrateRequest {
node_id: mv.to,
migration_config: None,
}),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
@@ -1238,6 +1260,24 @@ async fn main() -> anyhow::Result<()> {
String::from(scheduling_policy)
);
}
Command::DownloadHeatmapLayers {
tenant_shard_id,
timeline_id,
concurrency,
} => {
let mut path = format!(
"/v1/tenant/{}/timeline/{}/download_heatmap_layers",
tenant_shard_id, timeline_id,
);
if let Some(c) = concurrency {
path = format!("{path}?concurrency={c}");
}
storcon_client
.dispatch::<(), ()>(Method::POST, path, None)
.await?;
}
}
Ok(())

View File

@@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
# We are running tests now
rm -f testout.txt testout_contrib.txt
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
$TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0

View File

@@ -0,0 +1,24 @@
diff --git a/test/sql/base.sql b/test/sql/base.sql
index 53adb30..2eed91b 100644
--- a/test/sql/base.sql
+++ b/test/sql/base.sql
@@ -2,7 +2,6 @@
BEGIN;
\i test/pgtap-core.sql
-CREATE EXTENSION semver;
SELECT plan(334);
--SELECT * FROM no_plan();
diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql
index c0fe98e..39cdd2e 100644
--- a/test/sql/corpus.sql
+++ b/test/sql/corpus.sql
@@ -4,7 +4,6 @@ BEGIN;
-- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/.
\i test/pgtap-core.sql
-CREATE EXTENSION semver;
SELECT plan(76);
--SELECT * FROM no_plan();

View File

@@ -1,6 +1,7 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
patch -p1 <test-upgrade-${PG_VERSION}.patch
psql -d contrib_regression -c "DROP EXTENSION IF EXISTS pgtap"
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --dbname=contrib_regression base corpus

View File

@@ -0,0 +1,28 @@
diff --git a/Makefile b/Makefile
index f255fe6..0a0fa65 100644
--- a/Makefile
+++ b/Makefile
@@ -346,7 +346,7 @@ test: test-serial test-parallel
TB_DIR = test/build
GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests
REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe
-REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets!
IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=))
PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS)))
diff --git a/test/schedule/create.sql b/test/schedule/create.sql
index ba355ed..7e250f5 100644
--- a/test/schedule/create.sql
+++ b/test/schedule/create.sql
@@ -1,3 +1,2 @@
\unset ECHO
\i test/psql.sql
-CREATE EXTENSION pgtap;
diff --git a/test/schedule/main.sch b/test/schedule/main.sch
index a8a5fbc..0463fc4 100644
--- a/test/schedule/main.sch
+++ b/test/schedule/main.sch
@@ -1,2 +1 @@
-test: build
test: create

View File

@@ -0,0 +1,5 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
make installcheck

View File

@@ -2,4 +2,5 @@
set -ex
cd "$(dirname ${0})"
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression plv8 plv8-errors scalar_args inline json startup_pre startup varparam json_conv jsonb_conv window guc es6 arraybuffer composites currentresource startup_perms bytea find_function_perms memory_limits reset show array_spread regression dialect bigint procedure
REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension")+15);}')"
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression ${REGRESS}

View File

@@ -11,6 +11,7 @@ if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEW
exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
export PG_TEST_VERSION=${PG_VERSION}
function wait_for_ready {
TIME=0
while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
@@ -41,7 +42,8 @@ EXTENSIONS='[
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"}
{"extname": "pgjwt", "extdir": "pgjwt-src"},
{"extname": "pgtap", "extdir": "pgtap-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
@@ -57,9 +59,15 @@ wait_for_ready
docker compose cp ext-src neon-test-extensions:/
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression"
docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap"
create_extensions "${EXTNAMES}"
query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
exts="${EXTNAMES}"
else
query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
fi
if [ -z "${exts}" ]; then
echo "No extensions were upgraded"
else
@@ -87,7 +95,10 @@ else
exit 1
fi
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh
if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs
exit 1
fi
docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update"
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
done

View File

@@ -7,6 +7,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true
regex.workspace = true

View File

@@ -1,18 +1,20 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use crate::{
privilege::Privilege,
responses::ComputeCtlConfig,
spec::{ComputeSpec, ExtVersion, PgIdent},
};
use serde::Deserialize;
use serde::{Deserialize, Serialize};
/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can
/// extend it and something like `restart: bool` or something else. So put
/// `spec` into a struct initially to be more flexible in the future.
#[derive(Deserialize, Debug)]
#[derive(Debug, Deserialize, Serialize)]
pub struct ConfigurationRequest {
pub spec: ComputeSpec,
pub compute_ctl_config: ComputeCtlConfig,
}
#[derive(Deserialize, Debug)]

View File

@@ -3,6 +3,7 @@
use std::fmt::Display;
use chrono::{DateTime, Utc};
use jsonwebtoken::jwk::JwkSet;
use serde::{Deserialize, Serialize, Serializer};
use crate::{
@@ -135,13 +136,27 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
pub jwks: JwkSet,
}
impl Default for ComputeCtlConfig {
fn default() -> Self {
Self {
jwks: JwkSet {
keys: Vec::default(),
},
}
}
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving
/// to a different place.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
pub compute_ctl_config: ComputeCtlConfig,
}
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]

View File

@@ -2,7 +2,6 @@ use anyhow::bail;
use flate2::write::{GzDecoder, GzEncoder};
use flate2::Compression;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use regex::Regex;
@@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
// Resolve the line and function for each location.
backtrace::resolve(loc.address as *mut c_void, |symbol| {
let Some(symname) = symbol.name() else {
let Some(symbol_name) = symbol.name() else {
return;
};
let mut name = symname.to_string();
// Strip the Rust monomorphization suffix from the symbol name.
static SUFFIX_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex"));
if let Some(m) = SUFFIX_REGEX.find(&name) {
name.truncate(m.start());
}
let function_id = match functions.get(&name) {
Some(function) => function.id,
None => {
let id = functions.len() as u64 + 1;
let system_name = String::from_utf8_lossy(symname.as_bytes());
let function_name = format!("{symbol_name:#}");
let functions_len = functions.len();
let function_id = functions
.entry(function_name)
.or_insert_with_key(|function_name| {
let function_id = functions_len as u64 + 1;
let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
let filename = symbol
.filename()
.map(|path| path.to_string_lossy())
.unwrap_or(Cow::Borrowed(""));
let function = Function {
id,
name: string_id(&name),
Function {
id: function_id,
name: string_id(function_name),
system_name: string_id(&system_name),
filename: string_id(&filename),
..Default::default()
};
functions.insert(name, function);
id
}
};
}
})
.id;
loc.line.push(Line {
function_id,
line: symbol.lineno().unwrap_or(0) as i64,

View File

@@ -351,7 +351,7 @@ pub struct TenantConfigToml {
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
/// `index_part.json`, and it cannot be reversed.
pub rel_size_v2_enabled: Option<bool>,
pub rel_size_v2_enabled: bool,
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
@@ -544,10 +544,11 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
// This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on
// most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole
// calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
// This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
// 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
// be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
// with this config, we can get a maximum peak compaction usage of 9 GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
@@ -633,7 +634,7 @@ impl Default for TenantConfigToml {
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: true,
wal_receiver_protocol_override: None,
rel_size_v2_enabled: None,
rel_size_v2_enabled: false,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,

View File

@@ -182,6 +182,18 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
#[serde(default)]
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
#[derive(Serialize, Clone, Debug)]

View File

@@ -1,10 +1,12 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::Bytes;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::Oid;
use postgres_ffi::RepOriginId;
use serde::{Deserialize, Serialize};
use std::{fmt, ops::Range};
use utils::const_assert;
use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62;
/// The key prefix of ReplOrigin keys.
pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
/// The key prefix of db directory keys.
pub const DB_DIR_KEY_PREFIX: u8 = 0x64;
/// The key prefix of rel directory keys.
pub const REL_DIR_KEY_PREFIX: u8 = 0x65;
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RelDirExists {
Exists,
Removed,
}
#[derive(Debug)]
pub struct DecodeError;
impl fmt::Display for DecodeError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "invalid marker")
}
}
impl std::error::Error for DecodeError {}
impl RelDirExists {
/// The value of the rel directory keys that indicates the existence of a relation.
const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r");
pub fn encode(&self) -> Bytes {
match self {
Self::Exists => Self::REL_EXISTS_MARKER.clone(),
Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(),
}
}
pub fn decode_option(data: Option<impl AsRef<[u8]>>) -> Result<Self, DecodeError> {
match data {
Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists),
// Any other marker is invalid
Some(_) => Err(DecodeError),
None => Ok(Self::Removed),
}
}
pub fn decode(data: impl AsRef<[u8]>) -> Result<Self, DecodeError> {
let data = data.as_ref();
if data == Self::REL_EXISTS_MARKER {
Ok(Self::Exists)
} else if data == SPARSE_TOMBSTONE_MARKER {
Ok(Self::Removed)
} else {
Err(DecodeError)
}
}
}
/// A tombstone in the sparse keyspace, which is an empty buffer.
pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b"");
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -110,6 +170,24 @@ impl Key {
}
}
pub fn rel_dir_sparse_key_range() -> Range<Self> {
Key {
field1: REL_DIR_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REL_DIR_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// This function checks more extensively what keys we can take on the write path.
/// If a key beginning with 00 does not have a global/default tablespace OID, it
/// will be rejected on the write path.
@@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
}
}
#[inline(always)]
pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key {
Key {
field1: REL_DIR_KEY_PREFIX,
field2: spcnode,
field3: dbnode,
field4: relnode,
field5: forknum,
field6: 1,
}
}
pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: REL_DIR_KEY_PREFIX,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REL_DIR_KEY_PREFIX,
field2: spcnode,
field3: dbnode,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX,
} // it's fine to exclude the last key b/c we only use field6 == 1
}
#[inline(always)]
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
@@ -734,9 +842,9 @@ impl Key {
self.field1 == RELATION_SIZE_PREFIX
}
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
pub const fn sparse_non_inherited_keyspace() -> Range<Key> {
// The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX);
Key {
field1: AUX_KEY_PREFIX,
field2: 0,

View File

@@ -1080,8 +1080,7 @@ pub struct TenantInfo {
/// Opaque explanation if gc is being blocked.
///
/// Only looked up for the individual tenant detail, not the listing. This is purely for
/// debugging, not included in openapi.
/// Only looked up for the individual tenant detail, not the listing.
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_blocking: Option<String>,
}
@@ -1136,7 +1135,26 @@ pub struct TimelineInfo {
pub ancestor_lsn: Option<Lsn>,
pub last_record_lsn: Lsn,
pub prev_record_lsn: Option<Lsn>,
/// Legacy field for compat with control plane. Synonym of `min_readable_lsn`.
/// TODO: remove once control plane no longer reads it.
pub latest_gc_cutoff_lsn: Lsn,
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
/// as it is easier to reason about.
#[serde(default)]
pub applied_gc_cutoff_lsn: Lsn,
/// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval.
/// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest
/// LSN at which it is legal to create a branch or ephemeral endpoint.
///
/// Note that holders of valid LSN leases may be able to create branches and read pages earlier
/// than this LSN, but new leases may not be taken out earlier than this LSN.
#[serde(default)]
pub min_readable_lsn: Lsn,
pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage

View File

@@ -9,6 +9,8 @@ use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::io::ErrorKind;
use std::net::SocketAddr;
use std::os::fd::AsRawFd;
use std::os::fd::RawFd;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{ready, Poll};
@@ -268,6 +270,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
}
pub struct PostgresBackend<IO> {
pub socket_fd: RawFd,
framed: MaybeWriteOnly<IO>,
pub state: ProtoState,
@@ -293,9 +296,11 @@ impl PostgresBackend<tokio::net::TcpStream> {
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> io::Result<Self> {
let peer_addr = socket.peer_addr()?;
let socket_fd = socket.as_raw_fd();
let stream = MaybeTlsStream::Unencrypted(socket);
Ok(Self {
socket_fd,
framed: MaybeWriteOnly::Full(Framed::new(stream)),
state: ProtoState::Initialization,
auth_type,
@@ -307,6 +312,7 @@ impl PostgresBackend<tokio::net::TcpStream> {
impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
pub fn new_from_io(
socket_fd: RawFd,
socket: IO,
peer_addr: SocketAddr,
auth_type: AuthType,
@@ -315,6 +321,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let stream = MaybeTlsStream::Unencrypted(socket);
Ok(Self {
socket_fd,
framed: MaybeWriteOnly::Full(Framed::new(stream)),
state: ProtoState::Initialization,
auth_type,

View File

@@ -5,18 +5,15 @@ edition = "2021"
license = "MIT/Apache-2.0"
[dependencies]
async-trait.workspace = true
bytes.workspace = true
byteorder.workspace = true
fallible-iterator.workspace = true
futures-util = { workspace = true, features = ["sink"] }
log = "0.4"
parking_lot.workspace = true
percent-encoding = "2.0"
pin-project-lite.workspace = true
phf = "0.11"
postgres-protocol2 = { path = "../postgres-protocol2" }
postgres-types2 = { path = "../postgres-types2" }
tokio = { workspace = true, features = ["io-util", "time", "net"] }
tokio-util = { workspace = true, features = ["codec"] }
serde = { workspace = true, features = ["derive"] }
serde = { workspace = true, features = ["derive"] }

View File

@@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream;
use crate::types::{Oid, ToSql, Type};
use crate::{
prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
SimpleQueryMessage, Statement, Transaction, TransactionBuilder,
};
use bytes::BytesMut;
use fallible_iterator::FallibleIterator;
@@ -54,18 +54,18 @@ impl Responses {
}
/// A cache of type info and prepared statements for fetching type info
/// (corresponding to the queries in the [prepare] module).
/// (corresponding to the queries in the [crate::prepare] module).
#[derive(Default)]
struct CachedTypeInfo {
/// A statement for basic information for a type from its
/// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
/// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
/// fallback).
typeinfo: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY).
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
typeinfo_composite: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// its fallback).
typeinfo_enum: Option<Statement>,
@@ -190,26 +190,6 @@ impl Client {
&self.inner
}
/// Creates a new prepared statement.
///
/// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
/// which are set when executed. Prepared statements can only be used with the connection that created them.
pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
self.prepare_typed(query, &[]).await
}
/// Like `prepare`, but allows the types of query parameters to be explicitly specified.
///
/// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
/// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
pub async fn prepare_typed(
&self,
query: &str,
parameter_types: &[Type],
) -> Result<Statement, Error> {
prepare::prepare(&self.inner, query, parameter_types).await
}
/// Executes a statement, returning a vector of the resulting rows.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
@@ -222,14 +202,11 @@ impl Client {
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn query<T>(
pub async fn query(
&self,
statement: &T,
statement: Statement,
params: &[&(dyn ToSql + Sync)],
) -> Result<Vec<Row>, Error>
where
T: ?Sized + ToStatement,
{
) -> Result<Vec<Row>, Error> {
self.query_raw(statement, slice_iter(params))
.await?
.try_collect()
@@ -250,13 +227,15 @@ impl Client {
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`query`]: #method.query
pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
pub async fn query_raw<'a, I>(
&self,
statement: Statement,
params: I,
) -> Result<RowStream, Error>
where
T: ?Sized + ToStatement,
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
let statement = statement.__convert().into_statement(self).await?;
query::query(&self.inner, statement, params).await
}
@@ -271,55 +250,6 @@ impl Client {
query::query_txt(&self.inner, statement, params).await
}
/// Executes a statement, returning the number of rows modified.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn execute<T>(
&self,
statement: &T,
params: &[&(dyn ToSql + Sync)],
) -> Result<u64, Error>
where
T: ?Sized + ToStatement,
{
self.execute_raw(statement, slice_iter(params)).await
}
/// The maximally flexible version of [`execute`].
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`execute`]: #method.execute
pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
where
T: ?Sized + ToStatement,
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
let statement = statement.__convert().into_statement(self).await?;
query::execute(self.inner(), statement, params).await
}
/// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
///
/// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that

View File

@@ -1,7 +1,8 @@
#![allow(async_fn_in_trait)]
use crate::query::RowStream;
use crate::types::Type;
use crate::{Client, Error, Transaction};
use async_trait::async_trait;
use postgres_protocol2::Oid;
mod private {
@@ -11,7 +12,6 @@ mod private {
/// A trait allowing abstraction over connections and transactions.
///
/// This trait is "sealed", and cannot be implemented outside of this crate.
#[async_trait]
pub trait GenericClient: private::Sealed {
/// Like `Client::query_raw_txt`.
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed {
impl private::Sealed for Client {}
#[async_trait]
impl GenericClient for Client {
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
where
@@ -39,14 +38,12 @@ impl GenericClient for Client {
/// Query for type information
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
self.get_type(oid).await
crate::prepare::get_type(self.inner(), oid).await
}
}
impl private::Sealed for Transaction<'_> {}
#[async_trait]
#[allow(clippy::needless_lifetimes)]
impl GenericClient for Transaction<'_> {
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
where

View File

@@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow};
pub use crate::simple_query::SimpleQueryStream;
pub use crate::statement::{Column, Statement};
pub use crate::tls::NoTls;
pub use crate::to_statement::ToStatement;
pub use crate::transaction::Transaction;
pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
use crate::types::ToSql;
@@ -65,7 +64,6 @@ pub mod row;
mod simple_query;
mod statement;
pub mod tls;
mod to_statement;
mod transaction;
mod transaction_builder;
pub mod types;

View File

@@ -1,7 +1,6 @@
use crate::client::InnerClient;
use crate::codec::FrontendMessage;
use crate::connection::RequestMessages;
use crate::error::SqlState;
use crate::types::{Field, Kind, Oid, Type};
use crate::{query, slice_iter};
use crate::{Column, Error, Statement};
@@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message;
use postgres_protocol2::message::frontend;
use std::future::Future;
use std::pin::Pin;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
// Range types weren't added until Postgres 9.2, so pg_range may not exist
const TYPEINFO_FALLBACK_QUERY: &str = "\
SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid
FROM pg_catalog.pg_type t
INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
const TYPEINFO_ENUM_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
@@ -39,14 +29,6 @@ WHERE enumtypid = $1
ORDER BY enumsortorder
";
// Postgres 9.0 didn't have enumsortorder
const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
WHERE enumtypid = $1
ORDER BY oid
";
pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
SELECT attname, atttypid
FROM pg_catalog.pg_attribute
@@ -56,15 +38,13 @@ AND attnum > 0
ORDER BY attnum
";
static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
pub async fn prepare(
client: &Arc<InnerClient>,
name: &'static str,
query: &str,
types: &[Type],
) -> Result<Statement, Error> {
let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst));
let buf = encode(client, &name, query, types)?;
let buf = encode(client, name, query, types)?;
let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
match responses.next().await? {
@@ -105,10 +85,11 @@ pub async fn prepare(
fn prepare_rec<'a>(
client: &'a Arc<InnerClient>,
name: &'static str,
query: &'a str,
types: &'a [Type],
) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
Box::pin(prepare(client, query, types))
Box::pin(prepare(client, name, query, types))
}
fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
@@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Erro
return Ok(stmt);
}
let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
Ok(stmt) => stmt,
Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
}
Err(e) => return Err(e),
};
let typeinfo = "neon_proxy_typeinfo";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
client.set_typeinfo(&stmt);
Ok(stmt)
@@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement,
return Ok(stmt);
}
let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
Ok(stmt) => stmt,
Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
}
Err(e) => return Err(e),
};
let typeinfo = "neon_proxy_typeinfo_enum";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;
client.set_typeinfo_enum(&stmt);
Ok(stmt)
@@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<State
return Ok(stmt);
}
let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
let typeinfo = "neon_proxy_typeinfo_composite";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
client.set_typeinfo_composite(&stmt);
Ok(stmt)

View File

@@ -157,49 +157,6 @@ where
})
}
pub async fn execute<'a, I>(
client: &InnerClient,
statement: Statement,
params: I,
) -> Result<u64, Error>
where
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
let buf = if log_enabled!(Level::Debug) {
let params = params.into_iter().collect::<Vec<_>>();
debug!(
"executing statement {} with parameters: {:?}",
statement.name(),
BorrowToSqlParamsDebug(params.as_slice()),
);
encode(client, &statement, params)?
} else {
encode(client, &statement, params)?
};
let mut responses = start(client, buf).await?;
let mut rows = 0;
loop {
match responses.next().await? {
Message::DataRow(_) => {}
Message::CommandComplete(body) => {
rows = body
.tag()
.map_err(Error::parse)?
.rsplit(' ')
.next()
.unwrap()
.parse()
.unwrap_or(0);
}
Message::EmptyQueryResponse => rows = 0,
Message::ReadyForQuery(_) => return Ok(rows),
_ => return Err(Error::unexpected_message()),
}
}
}
async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

View File

@@ -13,7 +13,7 @@ use std::{
struct StatementInner {
client: Weak<InnerClient>,
name: String,
name: &'static str,
params: Vec<Type>,
columns: Vec<Column>,
}
@@ -22,7 +22,7 @@ impl Drop for StatementInner {
fn drop(&mut self) {
if let Some(client) = self.client.upgrade() {
let buf = client.with_buf(|buf| {
frontend::close(b'S', &self.name, buf).unwrap();
frontend::close(b'S', self.name, buf).unwrap();
frontend::sync(buf);
buf.split().freeze()
});
@@ -40,7 +40,7 @@ pub struct Statement(Arc<StatementInner>);
impl Statement {
pub(crate) fn new(
inner: &Arc<InnerClient>,
name: String,
name: &'static str,
params: Vec<Type>,
columns: Vec<Column>,
) -> Statement {
@@ -55,14 +55,14 @@ impl Statement {
pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
Statement(Arc::new(StatementInner {
client: Weak::new(),
name: String::new(),
name: "<anonymous>",
params,
columns,
}))
}
pub(crate) fn name(&self) -> &str {
&self.0.name
self.0.name
}
/// Returns the expected types of the statement's parameters.

View File

@@ -1,57 +0,0 @@
use crate::to_statement::private::{Sealed, ToStatementType};
use crate::Statement;
mod private {
use crate::{Client, Error, Statement};
pub trait Sealed {}
pub enum ToStatementType<'a> {
Statement(&'a Statement),
Query(&'a str),
}
impl ToStatementType<'_> {
pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
match self {
ToStatementType::Statement(s) => Ok(s.clone()),
ToStatementType::Query(s) => client.prepare(s).await,
}
}
}
}
/// A trait abstracting over prepared and unprepared statements.
///
/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
/// was prepared previously.
///
/// This trait is "sealed" and cannot be implemented by anything outside this crate.
pub trait ToStatement: Sealed {
#[doc(hidden)]
fn __convert(&self) -> ToStatementType<'_>;
}
impl ToStatement for Statement {
fn __convert(&self) -> ToStatementType<'_> {
ToStatementType::Statement(self)
}
}
impl Sealed for Statement {}
impl ToStatement for str {
fn __convert(&self) -> ToStatementType<'_> {
ToStatementType::Query(self)
}
}
impl Sealed for str {}
impl ToStatement for String {
fn __convert(&self) -> ToStatementType<'_> {
ToStatementType::Query(self)
}
}
impl Sealed for String {}

View File

@@ -9,13 +9,43 @@ use anyhow::bail;
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
/// Number uniquely identifying safekeeper configuration.
/// Note: it is a part of sk control file.
pub type Generation = u32;
/// 1 is the first valid generation, 0 is used as
/// a placeholder before we fully migrate to generations.
pub const INVALID_GENERATION: Generation = 0;
pub const INITIAL_GENERATION: Generation = 1;
pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0);
pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1);
/// Number uniquely identifying safekeeper configuration.
/// Note: it is a part of sk control file.
///
/// Like tenant generations, but for safekeepers.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct SafekeeperGeneration(u32);
impl SafekeeperGeneration {
pub const fn new(v: u32) -> Self {
Self(v)
}
#[track_caller]
pub fn previous(&self) -> Option<Self> {
Some(Self(self.0.checked_sub(1)?))
}
#[track_caller]
pub fn next(&self) -> Self {
Self(self.0 + 1)
}
pub fn into_inner(self) -> u32 {
self.0
}
}
impl Display for SafekeeperGeneration {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
/// Membership is defined by ids so e.g. walproposer uses them to figure out
/// quorums, but we also carry host and port to give wp idea where to connect.
@@ -89,7 +119,7 @@ impl Display for MemberSet {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Configuration {
/// Unique id.
pub generation: Generation,
pub generation: SafekeeperGeneration,
/// Current members of the configuration.
pub members: MemberSet,
/// Some means it is a joint conf.

View File

@@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse {
pub struct SafekeeperUtilization {
pub timeline_count: u64,
}
/// pull_timeline request body.
#[derive(Debug, Deserialize, Serialize)]
pub struct PullTimelineRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub http_hosts: Vec<String>,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct PullTimelineResponse {
// Donor safekeeper host
pub safekeeper_host: String,
// TODO: add more fields?
}

View File

@@ -24,11 +24,10 @@ diatomic-waker.workspace = true
git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
inferno.workspace = true
fail.workspace = true
futures = { workspace = true }
jsonwebtoken.workspace = true
nix.workspace = true
nix = {workspace = true, features = [ "ioctl" ] }
once_cell.workspace = true
pin-project-lite.workspace = true
regex.workspace = true

View File

@@ -286,6 +286,11 @@ mod tests {
const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7];
const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff];
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
struct NewTypeStruct(u32);
const NT1: NewTypeStruct = NewTypeStruct(414243);
const NT1_INNER: u32 = 414243;
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct LongMsg {
pub tag: u8,
@@ -408,4 +413,42 @@ mod tests {
let msg2 = LongMsg::des(&encoded).unwrap();
assert_eq!(msg, msg2);
}
#[test]
/// Ensure that newtype wrappers around u32 don't change the serialization format
fn be_nt() {
use super::BeSer;
assert_eq!(NT1.serialized_size().unwrap(), 4);
let msg = NT1;
let encoded = msg.ser().unwrap();
let expected = hex_literal::hex!("0006 5223");
assert_eq!(encoded, expected);
assert_eq!(encoded, NT1_INNER.ser().unwrap());
let msg2 = NewTypeStruct::des(&encoded).unwrap();
assert_eq!(msg, msg2);
}
#[test]
/// Ensure that newtype wrappers around u32 don't change the serialization format
fn le_nt() {
use super::LeSer;
assert_eq!(NT1.serialized_size().unwrap(), 4);
let msg = NT1;
let encoded = msg.ser().unwrap();
let expected = hex_literal::hex!("2352 0600");
assert_eq!(encoded, expected);
assert_eq!(encoded, NT1_INNER.ser().unwrap());
let msg2 = NewTypeStruct::des(&encoded).unwrap();
assert_eq!(msg, msg2);
}
}

View File

@@ -93,6 +93,9 @@ pub mod try_rcu;
pub mod guard_arc_swap;
#[cfg(target_os = "linux")]
pub mod linux_socket_ioctl;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;

View File

@@ -0,0 +1,35 @@
//! Linux-specific socket ioctls.
//!
//! <https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27>
use std::{
io,
mem::MaybeUninit,
os::{fd::RawFd, raw::c_int},
};
use nix::libc::{FIONREAD, TIOCOUTQ};
unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> {
let mut inq: MaybeUninit<c_int> = MaybeUninit::uninit();
let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr());
if err == 0 {
Ok(inq.assume_init())
} else {
Err(io::Error::last_os_error())
}
}
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> {
do_ioctl(socket_fd, FIONREAD)
}
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> {
do_ioctl(socket_fd, TIOCOUTQ)
}

View File

@@ -117,6 +117,10 @@ impl TenantShardId {
)
}
pub fn range(&self) -> RangeInclusive<Self> {
RangeInclusive::new(*self, *self)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}

View File

@@ -477,6 +477,26 @@ impl Client {
self.request(Method::POST, &uri, ()).await.map(|_| ())
}
pub async fn timeline_download_heatmap_layers(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
concurrency: Option<usize>,
) -> Result<()> {
let mut path = reqwest::Url::parse(&format!(
"{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
))
.expect("Cannot build URL");
if let Some(concurrency) = concurrency {
path.query_pairs_mut()
.append_pair("concurrency", &format!("{}", concurrency));
}
self.request(Method::POST, path, ()).await.map(|_| ())
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",

View File

@@ -13,7 +13,7 @@
use anyhow::{anyhow, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::Key;
use pageserver_api::key::{rel_block_to_key, Key};
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::{Instant, SystemTime};
@@ -42,8 +42,8 @@ use utils::lsn::Lsn;
pub enum BasebackupError {
#[error("basebackup pageserver error {0:#}")]
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#}")]
Client(#[source] io::Error),
#[error("basebackup client error {0:#} when {1}")]
Client(#[source] io::Error, &'static str),
}
/// Create basebackup with non-rel data in it.
@@ -234,7 +234,7 @@ where
self.ar
.append(&header, self.buf.as_slice())
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "flush"))?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -273,9 +273,9 @@ where
for dir in subdirs.iter() {
let header = new_tar_header_dir(dir)?;
self.ar
.append(&header, &mut io::empty())
.append(&header, io::empty())
.await
.context("could not add directory to basebackup tarball")?;
.map_err(|e| BasebackupError::Client(e, "send_tarball"))?;
}
// Send config files.
@@ -286,13 +286,13 @@ where
self.ar
.append(&header, data)
.await
.context("could not add config file to basebackup tarball")?;
.map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?;
} else {
let header = new_tar_header(filepath, 0)?;
self.ar
.append(&header, &mut io::empty())
.append(&header, io::empty())
.await
.context("could not add config file to basebackup tarball")?;
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?;
}
}
if !lazy_slru_download {
@@ -406,7 +406,7 @@ where
self.ar
.append(&header, &*content)
.await
.context("could not add aux file to basebackup tarball")?;
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?;
}
if min_restart_lsn != Lsn::MAX {
@@ -419,7 +419,7 @@ where
self.ar
.append(&header, &data[..])
.await
.context("could not add restart.lsn file to basebackup tarball")?;
.map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?;
}
for xid in self
.timeline
@@ -451,9 +451,9 @@ where
let crc32 = crc32c::crc32c(&content);
content.extend_from_slice(&crc32.to_le_bytes());
let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
self.ar.append(&header, &*content).await.context(
"could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
)?;
self.ar.append(&header, &*content).await.map_err(|e| {
BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint")
})?;
}
fail_point!("basebackup-before-control-file", |_| {
@@ -464,7 +464,10 @@ where
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
self.ar
.finish()
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?;
debug!("all tarred up!");
Ok(())
}
@@ -482,9 +485,9 @@ where
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar
.append(&header, &mut io::empty())
.append(&header, io::empty())
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?;
return Ok(());
}
@@ -498,13 +501,9 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(
src,
blknum,
Version::Lsn(self.lsn),
self.ctx,
self.io_concurrency.clone(),
)
// TODO: investigate using get_vectored for the entire startblk..endblk range.
// But this code path is not on the critical path for most basebackups (?).
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
segment_data.extend_from_slice(&img[..]);
@@ -515,7 +514,7 @@ where
self.ar
.append(&header, segment_data.as_slice())
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?;
seg += 1;
startblk = endblk;
@@ -566,7 +565,7 @@ where
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?;
info!("timeline.pg_version {}", self.timeline.pg_version);
@@ -576,7 +575,7 @@ where
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -612,9 +611,9 @@ where
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, &mut io::empty())
.append(&header, io::empty())
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -627,14 +626,14 @@ where
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?;
}
};
Ok(())
@@ -663,7 +662,7 @@ where
self.ar
.append(&header, &buf[..])
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?;
Ok(())
}
@@ -693,7 +692,7 @@ where
zenith_signal.as_bytes(),
)
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
let checkpoint_bytes = self
.timeline
@@ -718,7 +717,7 @@ where
self.ar
.append(&header, &pg_control_bytes[..])
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -742,7 +741,7 @@ where
self.ar
.append(&header, &wal_seg[..])
.await
.map_err(BasebackupError::Client)?;
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?;
Ok(())
}
}

View File

@@ -98,6 +98,7 @@ pub struct RequestContext {
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
page_content_kind: PageContentKind,
read_path_debug: bool,
}
/// The kind of access to the page cache.
@@ -155,6 +156,7 @@ impl RequestContextBuilder {
download_behavior: DownloadBehavior::Download,
access_stats_behavior: AccessStatsBehavior::Update,
page_content_kind: PageContentKind::Unknown,
read_path_debug: false,
},
}
}
@@ -168,6 +170,7 @@ impl RequestContextBuilder {
download_behavior: original.download_behavior,
access_stats_behavior: original.access_stats_behavior,
page_content_kind: original.page_content_kind,
read_path_debug: original.read_path_debug,
},
}
}
@@ -191,6 +194,11 @@ impl RequestContextBuilder {
self
}
pub(crate) fn read_path_debug(mut self, b: bool) -> Self {
self.inner.read_path_debug = b;
self
}
pub fn build(self) -> RequestContext {
self.inner
}
@@ -291,4 +299,8 @@ impl RequestContext {
pub(crate) fn page_content_kind(&self) -> PageContentKind {
self.page_content_kind
}
pub(crate) fn read_path_debug(&self) -> bool {
self.read_path_debug
}
}

View File

@@ -824,6 +824,38 @@ paths:
schema:
$ref: "#/components/schemas/TenantConfigResponse"
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
- name: concurrency
description: Maximum number of concurrent downloads (capped at remote storage concurrency)
in: query
required: false
schema:
type: integer
post:
description: |
Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
may be used to target all shards of a tenant when the unsharded form is used, or a specific
tenant shard with the sharded form.
responses:
"200":
description: Success
delete:
description: Stop any on-going background downloads of heatmap layers for the specified timeline.
responses:
"200":
description: Success
/v1/utilization:
get:
description: |
@@ -882,6 +914,8 @@ components:
properties:
reason:
type: string
gc_blocking:
type: string
TenantCreateRequest:
allOf:
@@ -1080,9 +1114,15 @@ components:
type: integer
state:
type: string
min_readable_lsn:
type: string
format: hex
latest_gc_cutoff_lsn:
type: string
format: hex
applied_gc_cutoff_lsn:
type: string
format: hex
SyntheticSizeResponse:
type: object

View File

@@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use crate::config::PageServerConf;
use crate::context::RequestContextBuilder;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -482,6 +483,11 @@ async fn build_timeline_info_common(
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
let min_readable_lsn = std::cmp::max(
timeline.get_gc_cutoff_lsn(),
*timeline.get_applied_gc_cutoff_lsn(),
);
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
@@ -493,7 +499,12 @@ async fn build_timeline_info_common(
initdb_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
// Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally
// we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we
// actually trimmed data to), which can pass each other when PITR is changed.
latest_gc_cutoff_lsn: min_readable_lsn,
min_readable_lsn,
applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
current_logical_size_is_accurate: match current_logical_size.accuracy() {
tenant::timeline::logical_size::Accuracy::Approximate => false,
@@ -1453,6 +1464,59 @@ async fn timeline_layer_scan_disposable_keys(
)
}
async fn timeline_download_heatmap_layers_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
// Only used in the case where remote storage is not configured.
const DEFAULT_MAX_CONCURRENCY: usize = 100;
// A conservative default.
const DEFAULT_CONCURRENCY: usize = 16;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let desired_concurrency =
parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let max_concurrency = get_config(&request)
.remote_storage_config
.as_ref()
.map(|c| c.concurrency_limit())
.unwrap_or(DEFAULT_MAX_CONCURRENCY);
let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
timeline.start_heatmap_layers_download(concurrency).await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn timeline_shutdown_download_heatmap_layers_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
timeline.stop_and_drain_heatmap_layers_download().await;
json_response(StatusCode::OK, ())
}
async fn layer_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -2331,6 +2395,7 @@ async fn timeline_checkpoint_handler(
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
@@ -2508,14 +2573,30 @@ async fn deletion_queue_flush(
}
}
/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
async fn getpage_at_lsn_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
getpage_at_lsn_handler_inner(false, request, cancel).await
}
async fn touchpage_at_lsn_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
getpage_at_lsn_handler_inner(true, request, cancel).await
}
/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
async fn getpage_at_lsn_handler_inner(
touch: bool,
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
// Require pageserver admin permission for this API instead of only tenant-level token.
check_permission(&request, None)?;
let state = get_state(&request);
struct Key(pageserver_api::key::Key);
@@ -2530,22 +2611,29 @@ async fn getpage_at_lsn_handler(
let key: Key = parse_query_param(&request, "key")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
let lsn: Option<Lsn> = parse_query_param(&request, "lsn")?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
// Enable read path debugging
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
// Use last_record_lsn if no lsn is provided
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let page = timeline.get(key.0, lsn, &ctx).await?;
Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
if touch {
json_response(StatusCode::OK, ())
} else {
Result::<_, ApiError>::Ok(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.unwrap(),
)
}
}
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
@@ -3616,6 +3704,14 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
|r| api_handler(r, layer_map_info_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers",
|r| api_handler(r, timeline_download_heatmap_layers_handler),
)
.delete(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers",
|r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, layer_download_handler),
@@ -3672,6 +3768,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
|r| api_handler(r, touchpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|r| api_handler(r, timeline_collect_keyspace),

View File

@@ -1,5 +1,6 @@
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::os::fd::RawFd;
use std::pin::Pin;
use std::sync::atomic::AtomicU64;
use std::sync::{Arc, Mutex};
@@ -129,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
"Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
&["tenant_id", "shard_id", "timeline_id"],
// Low resolution to reduce cardinality.
vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
)
.expect("failed to define a metric")
});
@@ -1439,27 +1440,66 @@ impl Drop for SmgrOpTimer {
}
impl SmgrOpFlushInProgress {
pub(crate) async fn measure<Fut, O>(self, mut started_at: Instant, mut fut: Fut) -> O
/// The caller must guarantee that `socket_fd`` outlives this function.
pub(crate) async fn measure<Fut, O>(
self,
started_at: Instant,
mut fut: Fut,
socket_fd: RawFd,
) -> O
where
Fut: std::future::Future<Output = O>,
{
let mut fut = std::pin::pin!(fut);
// Whenever observe_guard gets called, or dropped,
// it adds the time elapsed since its last call to metrics.
// Last call is tracked in `now`.
let mut logged = false;
let mut last_counter_increment_at = started_at;
let mut observe_guard = scopeguard::guard(
|| {
|is_timeout| {
let now = Instant::now();
let elapsed = now - started_at;
self.global_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.per_timeline_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
started_at = now;
// Increment counter
{
let elapsed_since_last_observe = now - last_counter_increment_at;
self.global_micros
.inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
self.per_timeline_micros
.inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
last_counter_increment_at = now;
}
// Log something on every timeout, and on completion but only if we hit a timeout.
if is_timeout || logged {
logged = true;
let elapsed_total = now - started_at;
let msg = if is_timeout {
"slow flush ongoing"
} else {
"slow flush completed or cancelled"
};
let (inq, outq) = {
// SAFETY: caller guarantees that `socket_fd` outlives this function.
#[cfg(target_os = "linux")]
unsafe {
(
utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
)
}
#[cfg(not(target_os = "linux"))]
{
_ = socket_fd; // appease unused lint on macOS
(-1, -1)
}
};
let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
tracing::info!(elapsed_total_secs, inq, outq, msg);
}
},
|mut observe| {
observe();
observe(false);
},
);
@@ -1467,7 +1507,7 @@ impl SmgrOpFlushInProgress {
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
Ok(v) => return v,
Err(_timeout) => {
(*observe_guard)();
(*observe_guard)(true);
}
}
}

View File

@@ -73,6 +73,7 @@ use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
/// is not yet in state [`TenantState::Active`].
@@ -236,7 +237,7 @@ pub async fn libpq_listener_main(
type ConnectionHandlerResult = anyhow::Result<()>;
#[instrument(skip_all, fields(peer_addr))]
#[instrument(skip_all, fields(peer_addr, application_name))]
#[allow(clippy::too_many_arguments)]
async fn page_service_conn_main(
conf: &'static PageServerConf,
@@ -257,6 +258,8 @@ async fn page_service_conn_main(
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
let socket_fd = socket.as_raw_fd();
let peer_addr = socket.peer_addr().context("get peer address")?;
tracing::Span::current().record("peer_addr", field::display(peer_addr));
@@ -305,7 +308,7 @@ async fn page_service_conn_main(
cancel.clone(),
gate_guard,
);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
match pgbackend.run(&mut conn_handler, &cancel).await {
Ok(()) => {
@@ -914,7 +917,7 @@ impl PageServerHandler {
&shard,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_latest_gc_cutoff_lsn(),
&shard.get_applied_gc_cutoff_lsn(),
ctx,
)
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
@@ -1286,12 +1289,15 @@ impl PageServerHandler {
))?;
// what we want to do
let socket_fd = pgb_writer.socket_fd;
let flush_fut = pgb_writer.flush();
// metric for how long flushing takes
let flush_fut = match flushing_timer {
Some(flushing_timer) => {
futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut))
}
Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
Instant::now(),
flush_fut,
socket_fd,
)),
None => futures::future::Either::Right(flush_fut),
};
// do it while respecting cancellation
@@ -1793,6 +1799,13 @@ impl PageServerHandler {
.as_millis()
.to_string()
});
info!(
"acquired lease for {} until {}",
lsn,
valid_until_str.as_deref().unwrap_or("<unknown>")
);
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
@@ -1810,7 +1823,7 @@ impl PageServerHandler {
req: &PagestreamExistsRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1837,7 +1850,7 @@ impl PageServerHandler {
req: &PagestreamNblocksRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1864,7 +1877,7 @@ impl PageServerHandler {
req: &PagestreamDbSizeRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1954,7 +1967,7 @@ impl PageServerHandler {
req: &PagestreamGetSlruSegmentRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -2050,7 +2063,8 @@ impl PageServerHandler {
{
fn map_basebackup_error(err: BasebackupError) -> QueryError {
match err {
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
// TODO: passthrough the error site to the final error message?
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
}
}
@@ -2071,7 +2085,7 @@ impl PageServerHandler {
//return Err(QueryError::NotFound("timeline is archived".into()))
}
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
@@ -2151,10 +2165,12 @@ impl PageServerHandler {
.await
.map_err(map_basebackup_error)?;
}
writer
.flush()
.await
.map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?;
writer.flush().await.map_err(|e| {
map_basebackup_error(BasebackupError::Client(
e,
"handle_basebackup_request,flush",
))
})?;
}
pgb.write_message_noflush(&BeMessage::CopyDone)
@@ -2454,9 +2470,16 @@ where
fn startup(
&mut self,
_pgb: &mut PostgresBackend<IO>,
_sm: &FeStartupPacket,
sm: &FeStartupPacket,
) -> Result<(), QueryError> {
fail::fail_point!("ps::connection-start::startup-packet");
if let FeStartupPacket::StartupMessage { params, .. } = sm {
if let Some(app_name) = params.get("application_name") {
Span::current().record("application_name", field::display(app_name));
}
};
Ok(())
}

View File

@@ -23,13 +23,14 @@ use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range,
slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key,
twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY,
CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::key::{rel_tag_sparse_key, Key};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -490,12 +491,33 @@ impl Timeline {
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
return Ok(false);
}
// fetch directory listing
// Read path: first read the new reldir keyspace. Early return if the relation exists.
// Otherwise, read the old reldir keyspace.
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
if self.get_rel_size_v2_enabled() {
// fetch directory listing (new)
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
.map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
let exists_v2 = buf == RelDirExists::Exists;
// Fast path: if the relation exists in the new format, return true.
// TODO: we should have a verification mode that checks both keyspaces
// to ensure the relation only exists in one of them.
if exists_v2 {
return Ok(true);
}
}
// fetch directory listing (old)
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = version.get(self, key, ctx).await?;
let dir = RelDirectory::des(&buf)?;
Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum));
Ok(exists_v1)
}
/// Get a list of all existing relations in given tablespace and database.
@@ -513,12 +535,12 @@ impl Timeline {
version: Version<'_>,
ctx: &RequestContext,
) -> Result<HashSet<RelTag>, PageReconstructError> {
// fetch directory listing
// fetch directory listing (old)
let key = rel_dir_to_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let dir = RelDirectory::des(&buf)?;
let rels: HashSet<RelTag> =
let rels_v1: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
@@ -526,6 +548,46 @@ impl Timeline {
forknum: *forknum,
}));
if !self.get_rel_size_v2_enabled() {
return Ok(rels_v1);
}
// scan directory listing (new), merge with the old results
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
);
let results = self
.scan(
KeySpace::single(key_range),
version.get_lsn(),
ctx,
io_concurrency,
)
.await?;
let mut rels = rels_v1;
for (key, val) in results {
let val = RelDirExists::decode(&val?)
.map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
assert_eq!(key.field6, 1);
assert_eq!(key.field2, spcnode);
assert_eq!(key.field3, dbnode);
let tag = RelTag {
spcnode,
dbnode,
relnode: key.field4,
forknum: key.field5,
};
if val == RelDirExists::Removed {
debug_assert!(!rels.contains(&tag), "removed reltag in v2");
continue;
}
let did_not_contain = rels.insert(tag);
debug_assert!(did_not_contain, "duplicate reltag in v2");
}
Ok(rels)
}
@@ -611,7 +673,7 @@ impl Timeline {
) -> Result<LsnForTimestamp, PageReconstructError> {
pausable_failpoint!("find-lsn-for-timestamp-pausable");
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
let gc_cutoff_planned = {
let gc_info = self.gc_info.read().unwrap();
gc_info.min_cutoff()
@@ -1144,7 +1206,11 @@ impl Timeline {
let dense_keyspace = result.to_keyspace();
let sparse_keyspace = SparseKeySpace(KeySpace {
ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
ranges: vec![
Key::metadata_aux_key_range(),
repl_origin_key_range(),
Key::rel_dir_sparse_key_range(),
],
});
if cfg!(debug_assertions) {
@@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> {
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
pending_metadata_bytes: usize,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MetricsUpdate {
/// Set the metrics to this value
Set(u64),
/// Increment the metrics by this value
Add(u64),
/// Decrement the metrics by this value
Sub(u64),
}
impl DatadirModification<'_> {
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
@@ -1359,7 +1435,8 @@ impl DatadirModification<'_> {
let buf = DbDirectory::ser(&DbDirectory {
dbdirs: HashMap::new(),
})?;
self.pending_directory_entries.push((DirectoryKind::Db, 0));
self.pending_directory_entries
.push((DirectoryKind::Db, MetricsUpdate::Set(0)));
self.put(DBDIR_KEY, Value::Image(buf.into()));
let buf = if self.tline.pg_version >= 17 {
@@ -1372,7 +1449,7 @@ impl DatadirModification<'_> {
})
}?;
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, 0));
.push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0)));
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
@@ -1382,17 +1459,23 @@ impl DatadirModification<'_> {
// harmless but they'd just be dropped on later compaction.
if self.tline.tenant_shard_id.is_shard_zero() {
self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(SlruKind::Clog),
MetricsUpdate::Set(0),
));
self.put(
slru_dir_to_key(SlruKind::MultiXactMembers),
empty_dir.clone(),
);
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(SlruKind::Clog),
MetricsUpdate::Set(0),
));
self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets),
MetricsUpdate::Set(0),
));
}
Ok(())
@@ -1658,10 +1741,16 @@ impl DatadirModification<'_> {
}
if r.is_none() {
// Create RelDirectory
// TODO: if we have fully migrated to v2, no need to create this directory
let buf = RelDirectory::ser(&RelDirectory {
rels: HashSet::new(),
})?;
self.pending_directory_entries.push((DirectoryKind::Rel, 0));
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
if self.tline.get_rel_size_v2_enabled() {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
}
self.put(
rel_dir_to_key(spcnode, dbnode),
Value::Image(Bytes::from(buf)),
@@ -1685,8 +1774,10 @@ impl DatadirModification<'_> {
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid = xid as u32;
@@ -1694,8 +1785,10 @@ impl DatadirModification<'_> {
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
Bytes::from(TwoPhaseDirectory::ser(&dir)?)
};
self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -1744,8 +1837,10 @@ impl DatadirModification<'_> {
let mut dir = DbDirectory::des(&buf)?;
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
let buf = DbDirectory::ser(&dir)?;
self.pending_directory_entries
.push((DirectoryKind::Db, dir.dbdirs.len()));
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dir.dbdirs.len() as u64),
));
self.put(DBDIR_KEY, Value::Image(buf.into()));
} else {
warn!(
@@ -1778,39 +1873,85 @@ impl DatadirModification<'_> {
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir =
let dbdir_exists =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.pending_directory_entries
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
));
self.put(DBDIR_KEY, Value::Image(buf.into()));
// and create the RelDirectory
RelDirectory::default()
false
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
true
};
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if !dbdir_exists {
// Create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
self.pending_directory_entries
.push((DirectoryKind::Rel, rel_dir.rels.len()));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
if self.tline.get_rel_size_v2_enabled() {
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
let val = self
.sparse_get(sparse_rel_dir_key, ctx)
.await
.map_err(|e| RelationError::Other(e.into()))?;
let val = RelDirExists::decode_option(val)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
if val == RelDirExists::Exists {
return Err(RelationError::AlreadyExists);
}
self.put(
sparse_rel_dir_key,
Value::Image(RelDirExists::Exists.encode()),
);
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
// We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
// TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
// will be key not found errors if we don't create an empty one for rel_size_v2.
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
)),
);
}
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
} else {
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
}
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
}
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
@@ -1896,9 +2037,34 @@ impl DatadirModification<'_> {
let mut dirty = false;
for rel_tag in rel_tags {
if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
dirty = true;
true
} else if self.tline.get_rel_size_v2_enabled() {
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
// logic).
let key =
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
if val == RelDirExists::Exists {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
// put tombstone
self.put(key, Value::Image(RelDirExists::Removed.encode()));
// no need to set dirty to true
true
} else {
false
}
} else {
false
};
if found {
// update logical size
let size_key = rel_size_to_key(rel_tag);
let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1914,8 +2080,6 @@ impl DatadirModification<'_> {
if dirty {
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
self.pending_directory_entries
.push((DirectoryKind::Rel, dir.rels.len()));
}
}
@@ -1939,8 +2103,10 @@ impl DatadirModification<'_> {
if !dir.segments.insert(segno) {
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
}
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
MetricsUpdate::Set(dir.segments.len() as u64),
));
self.put(
dir_key,
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1987,8 +2153,10 @@ impl DatadirModification<'_> {
if !dir.segments.remove(&segno) {
warn!("slru segment {:?}/{} does not exist", kind, segno);
}
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
MetricsUpdate::Set(dir.segments.len() as u64),
));
self.put(
dir_key,
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -2020,8 +2188,10 @@ impl DatadirModification<'_> {
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid: u32 = u32::try_from(xid)?;
@@ -2030,8 +2200,10 @@ impl DatadirModification<'_> {
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
Bytes::from(TwoPhaseDirectory::ser(&dir)?)
};
self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -2147,7 +2319,7 @@ impl DatadirModification<'_> {
}
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
writer.update_directory_entries_count(kind, count as u64);
writer.update_directory_entries_count(kind, count);
}
Ok(())
@@ -2233,7 +2405,7 @@ impl DatadirModification<'_> {
}
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
writer.update_directory_entries_count(kind, count as u64);
writer.update_directory_entries_count(kind, count);
}
self.pending_metadata_bytes = 0;
@@ -2297,6 +2469,22 @@ impl DatadirModification<'_> {
self.tline.get(key, lsn, ctx).await
}
/// Get a key from the sparse keyspace. Automatically converts the missing key error
/// and the empty value into None.
async fn sparse_get(
&self,
key: Key,
ctx: &RequestContext,
) -> Result<Option<Bytes>, PageReconstructError> {
let val = self.get(key, ctx).await;
match val {
Ok(val) if val.is_empty() => Ok(None),
Ok(val) => Ok(Some(val)),
Err(PageReconstructError::MissingKey(_)) => Ok(None),
Err(e) => Err(e),
}
}
fn put(&mut self, key: Key, val: Value) {
if Self::is_data_key(&key) {
self.put_data(key.to_compact(), val)
@@ -2379,6 +2567,23 @@ impl Version<'_> {
}
}
/// Get a key from the sparse keyspace. Automatically converts the missing key error
/// and the empty value into None.
async fn sparse_get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Option<Bytes>, PageReconstructError> {
let val = self.get(timeline, key, ctx).await;
match val {
Ok(val) if val.is_empty() => Ok(None),
Ok(val) => Ok(Some(val)),
Err(PageReconstructError::MissingKey(_)) => Ok(None),
Err(e) => Err(e),
}
}
fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
@@ -2438,6 +2643,7 @@ pub(crate) enum DirectoryKind {
Rel,
AuxFiles,
SlruSegment(SlruKind),
RelV2,
}
impl DirectoryKind {

View File

@@ -40,6 +40,8 @@ use remote_timeline_client::manifest::{
use remote_timeline_client::UploadQueueNotReadyError;
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
use secondary::heatmap::HeatMapTenant;
use secondary::heatmap::HeatMapTimeline;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
@@ -55,6 +57,7 @@ use timeline::offload::OffloadError;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::PreviousHeatmap;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -262,6 +265,7 @@ struct TimelinePreload {
timeline_id: TimelineId,
client: RemoteTimelineClient,
index_part: Result<MaybeDeletedIndexPart, DownloadError>,
previous_heatmap: Option<PreviousHeatmap>,
}
pub(crate) struct TenantPreload {
@@ -1128,6 +1132,7 @@ impl Tenant {
resources: TimelineResources,
mut index_part: IndexPart,
metadata: TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
cause: LoadTimelineCause,
ctx: &RequestContext,
@@ -1158,6 +1163,7 @@ impl Tenant {
let timeline = self.create_timeline_struct(
timeline_id,
&metadata,
previous_heatmap,
ancestor.clone(),
resources,
CreateTimelineCause::Load,
@@ -1557,8 +1563,18 @@ impl Tenant {
}
}
// TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
// pulled the first heatmap. Not entirely necessary since the storage controller
// will kick the secondary in any case and cause a download.
let maybe_heatmap_at = self.read_on_disk_heatmap().await;
let timelines = self
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.load_timelines_metadata(
remote_timeline_ids,
remote_storage,
maybe_heatmap_at,
cancel,
)
.await?;
Ok(TenantPreload {
@@ -1571,6 +1587,26 @@ impl Tenant {
})
}
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
Err(err) => {
error!("Failed to deserialize old heatmap: {err}");
None
}
},
Err(err) => match err.kind() {
std::io::ErrorKind::NotFound => None,
_ => {
error!("Unexpected IO error reading old heatmap: {err}");
None
}
},
}
}
///
/// Background task that downloads all data for a tenant and brings it to Active state.
///
@@ -1658,7 +1694,10 @@ impl Tenant {
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
remote_index_and_client.insert(timeline_id, (index_part, preload.client));
remote_index_and_client.insert(
timeline_id,
(index_part, preload.client, preload.previous_heatmap),
);
}
MaybeDeletedIndexPart::Deleted(index_part) => {
info!(
@@ -1677,7 +1716,7 @@ impl Tenant {
// layer file.
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
for (timeline_id, remote_metadata) in sorted_timelines {
let (index_part, remote_client) = remote_index_and_client
let (index_part, remote_client, previous_heatmap) = remote_index_and_client
.remove(&timeline_id)
.expect("just put it in above");
@@ -1697,6 +1736,7 @@ impl Tenant {
timeline_id,
index_part,
remote_metadata,
previous_heatmap,
self.get_timeline_resources_for(remote_client),
LoadTimelineCause::Attach,
ctx,
@@ -1846,11 +1886,13 @@ impl Tenant {
}
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
#[allow(clippy::too_many_arguments)]
async fn load_remote_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
resources: TimelineResources,
cause: LoadTimelineCause,
ctx: &RequestContext,
@@ -1880,6 +1922,7 @@ impl Tenant {
resources,
index_part,
remote_metadata,
previous_heatmap,
ancestor,
cause,
ctx,
@@ -1891,14 +1934,29 @@ impl Tenant {
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
heatmap: Option<(HeatMapTenant, std::time::Instant)>,
cancel: CancellationToken,
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let cancel_clone = cancel.clone();
let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
heatmap: h,
read_at: hs.1,
})
});
part_downloads.spawn(
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
self.load_timeline_metadata(
timeline_id,
remote_storage.clone(),
previous_timeline_heatmap,
cancel_clone,
)
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
@@ -1946,6 +2004,7 @@ impl Tenant {
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
previous_heatmap: Option<PreviousHeatmap>,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = self.build_timeline_client(timeline_id, remote_storage);
@@ -1961,6 +2020,7 @@ impl Tenant {
client,
timeline_id,
index_part,
previous_heatmap,
}
}
}
@@ -2072,7 +2132,12 @@ impl Tenant {
})?;
let timeline_preload = self
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
.load_timeline_metadata(
timeline_id,
self.remote_storage.clone(),
None,
cancel.clone(),
)
.await;
let index_part = match timeline_preload.index_part {
@@ -2106,6 +2171,7 @@ impl Tenant {
timeline_id,
index_part,
remote_metadata,
None,
timeline_resources,
LoadTimelineCause::Unoffload,
&ctx,
@@ -2821,7 +2887,7 @@ impl Tenant {
};
let metadata = index_part.metadata.clone();
self
.load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
.load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
create_guard: timeline_create_guard, activate, }, &ctx)
.await?
.ready_to_activate()
@@ -3035,6 +3101,9 @@ impl Tenant {
if let Some(queue) = queue {
outcome = queue
.iteration(cancel, ctx, &self.gc_block, &timeline)
.instrument(
info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id),
)
.await?;
}
}
@@ -3081,6 +3150,12 @@ impl Tenant {
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
// shouldn't block compaction.
CompactionError::Offload(_) => {}
CompactionError::CollectKeySpaceError(err) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
CompactionError::Other(err) => {
self.compaction_circuit_breaker
.lock()
@@ -3858,6 +3933,13 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.rel_size_v2_enabled
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
pub fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
@@ -4030,6 +4112,7 @@ impl Tenant {
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
cause: CreateTimelineCause,
@@ -4053,6 +4136,7 @@ impl Tenant {
self.conf,
Arc::clone(&self.tenant_conf),
new_metadata,
previous_heatmap,
ancestor,
new_timeline_id,
self.tenant_shard_id,
@@ -4695,24 +4779,24 @@ impl Tenant {
// We check it against both the planned GC cutoff stored in 'gc_info',
// and the 'latest_gc_cutoff' of the last GC that was performed. The
// planned GC cutoff in 'gc_info' is normally larger than
// 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
// 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
// changed the GC settings for the tenant to make the PITR window
// larger, but some of the data was already removed by an earlier GC
// iteration.
// check against last actual 'latest_gc_cutoff' first
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
{
let gc_info = src_timeline.gc_info.read().unwrap();
let planned_cutoff = gc_info.min_cutoff();
if gc_info.lsn_covered_by_lease(start_lsn) {
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn);
} else {
src_timeline
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn,
*applied_gc_cutoff_lsn,
))
.map_err(CreateTimelineError::AncestorLsn)?;
@@ -4751,7 +4835,7 @@ impl Tenant {
dst_prev,
Some(src_id),
start_lsn,
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
*src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
src_timeline.pg_version,
);
@@ -5124,6 +5208,7 @@ impl Tenant {
.create_timeline_struct(
new_timeline_id,
new_metadata,
None,
ancestor,
resources,
CreateTimelineCause::Load,
@@ -5571,7 +5656,7 @@ pub(crate) mod harness {
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),
wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled),
gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
gc_compaction_initial_threshold_kb: Some(
tenant_conf.gc_compaction_initial_threshold_kb,
@@ -6130,8 +6215,8 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
match tline.get(*TEST_KEY, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err.to_string().contains("not found at")),
@@ -7770,18 +7855,6 @@ mod tests {
}
tline.freeze_and_flush().await?;
// Force layers to L1
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceL0Compaction);
flags
},
&ctx,
)
.await?;
if iter % 5 == 0 {
let (_, before_delta_file_accessed) =
@@ -7794,7 +7867,6 @@ mod tests {
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags.insert(CompactFlags::ForceL0Compaction);
flags
},
&ctx,
@@ -8241,8 +8313,6 @@ mod tests {
let cancel = CancellationToken::new();
// Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
tline.force_set_disk_consistent_lsn(Lsn(0x40));
tline
.compact(
&cancel,
@@ -8256,7 +8326,8 @@ mod tests {
)
.await
.unwrap();
// Image layers are created at repartition LSN
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
.await
@@ -8427,7 +8498,7 @@ mod tests {
.await?;
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -8535,7 +8606,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
@@ -8703,8 +8774,8 @@ mod tests {
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
info!(
"latest_gc_cutoff_lsn: {}",
*timeline.get_latest_gc_cutoff_lsn()
"applied_gc_cutoff_lsn: {}",
*timeline.get_applied_gc_cutoff_lsn()
);
timeline.force_set_disk_consistent_lsn(end_lsn);
@@ -8730,7 +8801,7 @@ mod tests {
// Make lease on a already GC-ed LSN.
// 0/80 does not have a valid lease + is below latest_gc_cutoff
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn());
timeline
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
.expect_err("lease request on GC-ed LSN should fail");
@@ -8921,7 +8992,7 @@ mod tests {
};
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9008,7 +9079,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
@@ -9461,7 +9532,7 @@ mod tests {
.await?;
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9608,7 +9679,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x38))
.wait()
@@ -9709,7 +9780,7 @@ mod tests {
.await?;
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9960,7 +10031,7 @@ mod tests {
{
parent_tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x10))
.wait()
@@ -9980,7 +10051,7 @@ mod tests {
{
branch_tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x50))
.wait()
@@ -10336,7 +10407,7 @@ mod tests {
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -10721,7 +10792,7 @@ mod tests {
.await?;
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -10972,7 +11043,7 @@ mod tests {
.await?;
{
tline
.latest_gc_cutoff_lsn
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()

View File

@@ -485,7 +485,9 @@ impl TenantConfOpt {
wal_receiver_protocol_override: self
.wal_receiver_protocol_override
.or(global_conf.wal_receiver_protocol_override),
rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
rel_size_v2_enabled: self
.rel_size_v2_enabled
.unwrap_or(global_conf.rel_size_v2_enabled),
gc_compaction_enabled: self
.gc_compaction_enabled
.unwrap_or(global_conf.gc_compaction_enabled),

View File

@@ -130,7 +130,10 @@ struct TimelineMetadataBodyV2 {
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<TimelineId>,
ancestor_lsn: Lsn,
// The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`].
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
}

View File

@@ -1,4 +1,4 @@
use std::time::SystemTime;
use std::{collections::HashMap, time::SystemTime};
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
@@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
use utils::{generation::Generation, id::TimelineId};
#[derive(Serialize, Deserialize)]
pub(super) struct HeatMapTenant {
pub(crate) struct HeatMapTenant {
/// Generation of the attached location that uploaded the heatmap: this is not required
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
@@ -25,8 +25,17 @@ pub(super) struct HeatMapTenant {
pub(super) upload_period_ms: Option<u128>,
}
impl HeatMapTenant {
pub(crate) fn into_timelines_index(self) -> HashMap<TimelineId, HeatMapTimeline> {
self.timelines
.into_iter()
.map(|htl| (htl.timeline_id, htl))
.collect()
}
}
#[serde_as]
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(crate) timeline_id: TimelineId,
@@ -35,13 +44,13 @@ pub(crate) struct HeatMapTimeline {
}
#[serde_as]
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct HeatMapLayer {
pub(crate) name: LayerName,
pub(crate) metadata: LayerFileMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,
pub(crate) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}

View File

@@ -394,7 +394,7 @@ pub(super) async fn gather_inputs(
ancestor_lsn,
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(),
next_pitr_cutoff,
retention_param_cutoff,
lease_points,

View File

@@ -136,6 +136,22 @@ pub(crate) fn local_layer_path(
}
}
pub(crate) enum LastEviction {
Never,
At(std::time::Instant),
Evicting,
}
impl LastEviction {
pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
match self {
LastEviction::Never => false,
LastEviction::At(evicted_at) => evicted_at > &timepoint,
LastEviction::Evicting => true,
}
}
}
impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
@@ -405,6 +421,17 @@ impl Layer {
self.0.metadata()
}
pub(crate) fn last_evicted_at(&self) -> LastEviction {
match self.0.last_evicted_at.try_lock() {
Ok(lock) => match *lock {
None => LastEviction::Never,
Some(at) => LastEviction::At(at),
},
Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
}
}
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
self.0
.timeline
@@ -656,7 +683,9 @@ struct LayerInner {
/// When the Layer was last evicted but has not been downloaded since.
///
/// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
/// This is used for skipping evicted layers from the previous heatmap (see
/// `[Timeline::generate_heatmap]`) and for updating metrics
/// (see [`LayerImplMetrics::redownload_after`]).
last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
#[cfg(test)]

View File

@@ -287,6 +287,7 @@ fn log_compaction_error(
sleep_duration: Duration,
task_cancelled: bool,
) {
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;
@@ -294,6 +295,8 @@ fn log_compaction_error(
let level = match err {
ShuttingDown => return,
Offload(_) => Level::ERROR,
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
CollectKeySpaceError(_) => Level::ERROR,
_ if task_cancelled => Level::INFO,
Other(err) => {
let root_cause = err.root_cause();

View File

@@ -4,6 +4,7 @@ pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
pub(crate) mod handle;
mod heatmap_layers_downloader;
pub(crate) mod import_pgdata;
mod init;
pub mod layer_manager;
@@ -21,6 +22,7 @@ use chrono::{DateTime, Utc};
use compaction::CompactionOutcome;
use enumset::EnumSet;
use fail::fail_point;
use futures::FutureExt;
use futures::{stream::FuturesUnordered, StreamExt};
use handle::ShardTimelineId;
use layer_manager::Shutdown;
@@ -117,7 +119,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate};
use crate::tenant::config::TenantConfOpt;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
@@ -150,16 +152,15 @@ use super::{
config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
MaybeOffloaded,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{
debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline,
};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
storage_layer::ReadableLayer,
};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};
use super::{secondary::heatmap::HeatMapLayer, GcError};
#[cfg(test)]
use pageserver_api::value::Value;
@@ -328,6 +329,7 @@ pub struct Timeline {
// in `crate::page_service` writes these metrics.
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM],
directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
/// Ensures layers aren't frozen by checkpointer between
@@ -352,8 +354,11 @@ pub struct Timeline {
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
// The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which
// we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it.
// Because PITR interval is mutable, it's possible for this LSN to be earlier or later than
// the planned GC cutoff.
pub applied_gc_cutoff_lsn: Rcu<Lsn>,
pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
@@ -462,6 +467,20 @@ pub struct Timeline {
/// If Some, collects GetPage metadata for an ongoing PageTrace.
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
previous_heatmap: ArcSwapOption<PreviousHeatmap>,
/// May host a background Tokio task which downloads all the layers from the current
/// heatmap on demand.
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
}
pub(crate) enum PreviousHeatmap {
Active {
heatmap: HeatMapTimeline,
read_at: std::time::Instant,
},
Obsolete,
}
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -1077,9 +1096,15 @@ impl Timeline {
(history, gc_info.within_ancestor_pitr)
}
/// Lock and get timeline's GC cutoff
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
/// Read timeline's GC cutoff: this is the LSN at which GC has started to happen
pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.applied_gc_cutoff_lsn.read()
}
/// Read timeline's planned GC cutoff: this is the logical end of history that users
/// are allowed to read (based on configured PITR), even if physically we have more history.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
self.gc_info.read().unwrap().cutoffs.time
}
/// Look up given page version.
@@ -1274,7 +1299,7 @@ impl Timeline {
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let read_path = if self.conf.enable_read_path_debugging {
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
Some(ReadPath::new(keyspace.clone(), lsn))
} else {
None
@@ -1587,7 +1612,7 @@ impl Timeline {
};
if init || validate {
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
if lsn < *latest_gc_cutoff_lsn {
bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
@@ -1857,7 +1882,7 @@ impl Timeline {
// Signal compaction failure to avoid L0 flush stalls when it's broken.
match result {
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
Err(CompactionError::Other(_)) => {
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
}
// Don't change the current value on offload failure or shutdown. We don't want to
@@ -2020,6 +2045,11 @@ impl Timeline {
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// If we have a background task downloading heatmap layers stop it.
// The background downloads are sensitive to timeline cancellation (done above),
// so the drain will be immediate.
self.stop_and_drain_heatmap_layers_download().await;
// Ensure Prevent new page service requests from starting.
self.handles.shutdown();
@@ -2337,6 +2367,14 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.rel_size_v2_enabled
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2559,6 +2597,7 @@ impl Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
metadata: &TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -2645,6 +2684,7 @@ impl Timeline {
),
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
@@ -2659,7 +2699,7 @@ impl Timeline {
LastImageLayerCreationStatus::default(),
)),
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
current_logical_size: if disk_consistent_lsn.is_valid() {
@@ -2721,6 +2761,10 @@ impl Timeline {
create_idempotency,
page_trace: Default::default(),
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
heatmap_layers_downloader: Mutex::new(None),
};
result.repartition_threshold =
@@ -3409,8 +3453,42 @@ impl Timeline {
}
}
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
// TODO: this directory metrics is not correct -- we could have multiple reldirs in the system
// for each of the database, but we only store one value, and therefore each pgdirmodification
// would overwrite the previous value if they modify different databases.
match count {
MetricsUpdate::Set(count) => {
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed);
}
MetricsUpdate::Add(count) => {
// TODO: these operations are not atomic; but we only have one writer to the metrics, so
// it's fine.
if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
// The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub
// the value reliably.
self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed);
}
// Otherwise, ignore this update
}
MetricsUpdate::Sub(count) => {
// TODO: these operations are not atomic; but we only have one writer to the metrics, so
// it's fine.
if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
// The metrics has been initialized with `MetricsUpdate::Set` before.
// The operation could overflow so we need to normalize the value.
let prev_val =
self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed);
let res = prev_val.saturating_sub(count);
self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed);
}
// Otherwise, ignore this update
}
};
// TODO: remove this, there's no place in the code that updates this aux metrics.
let aux_metric =
self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
@@ -3459,12 +3537,52 @@ impl Timeline {
let guard = self.layers.read().await;
// Firstly, if there's any heatmap left over from when this location
// was a secondary, take that into account. Keep layers that are:
// * present in the layer map
// * visible
// * non-resident
// * not evicted since we read the heatmap
//
// Without this, a new cold, attached location would clobber the previous
// heatamp.
let previous_heatmap = self.previous_heatmap.load();
let visible_non_resident = match previous_heatmap.as_deref() {
Some(PreviousHeatmap::Active { heatmap, read_at }) => {
Some(heatmap.layers.iter().filter_map(|hl| {
let desc: PersistentLayerDesc = hl.name.clone().into();
let layer = guard.try_get_from_key(&desc.key())?;
if layer.visibility() == LayerVisibilityHint::Covered {
return None;
}
if layer.is_likely_resident() {
return None;
}
if layer.last_evicted_at().happened_after(*read_at) {
return None;
}
Some((desc, hl.metadata.clone(), hl.access_time))
}))
}
Some(PreviousHeatmap::Obsolete) => None,
None => None,
};
// Secondly, all currently visible, resident layers are included.
let resident = guard.likely_resident_layers().filter_map(|layer| {
match layer.visibility() {
LayerVisibilityHint::Visible => {
// Layer is visible to one or more read LSNs: elegible for inclusion in layer map
let last_activity_ts = layer.latest_activity();
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
Some((
layer.layer_desc().clone(),
layer.metadata(),
last_activity_ts,
))
}
LayerVisibilityHint::Covered => {
// Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -3473,7 +3591,18 @@ impl Timeline {
}
});
let mut layers = resident.collect::<Vec<_>>();
let mut layers = match visible_non_resident {
Some(non_resident) => {
let mut non_resident = non_resident.peekable();
if non_resident.peek().is_none() {
self.previous_heatmap
.store(Some(PreviousHeatmap::Obsolete.into()));
}
non_resident.chain(resident).collect::<Vec<_>>()
}
None => resident.collect::<Vec<_>>(),
};
// Sort layers in order of which to download first. For a large set of layers to download, we
// want to prioritize those layers which are most likely to still be in the resident many minutes
@@ -3577,7 +3706,9 @@ impl Timeline {
// space. If that's not the case, we had at least one key encounter a gap in the image layer
// and stop the search as a result of that.
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
// Do not fire missing key error for sparse keys.
// Do not fire missing key error and end early for sparse keys. Note that we hava already removed
// non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
// figuring out what is the inherited key range and do a fine-grained pruning.
removed.remove_overlapping_with(&KeySpace {
ranges: vec![SPARSE_RANGE],
});
@@ -3662,7 +3793,7 @@ impl Timeline {
// the timeline, then it will remove layers that are required for fulfilling
// the current get request (read-path cannot "look back" and notice the new
// image layer).
let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();
let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();
// See `compaction::compact_with_gc` for why we need this.
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
@@ -4349,7 +4480,7 @@ impl Timeline {
let update = crate::tenant::metadata::MetadataUpdate::new(
disk_consistent_lsn,
ondisk_prev_record_lsn,
*self.latest_gc_cutoff_lsn.read(),
*self.applied_gc_cutoff_lsn.read(),
);
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -4474,7 +4605,10 @@ impl Timeline {
));
}
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let (dense_ks, sparse_ks) = self
.collect_keyspace(lsn, ctx)
.await
.map_err(CompactionError::CollectKeySpaceError)?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
@@ -4995,20 +5129,26 @@ impl Timeline {
// image layer generation taking too long time and blocking L0 compaction. So in this
// mode, we also inspect the current number of L0 layers and skip image layer generation
// if there are too many of them.
let num_of_l0_layers = {
let layers = self.layers.read().await;
layers.layer_map()?.level0_deltas().len()
};
let image_preempt_threshold = self.get_image_creation_preempt_threshold()
* self.get_compaction_threshold();
if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
tracing::info!(
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
);
last_partition_processed = Some(partition.clone());
all_generated = false;
break;
// TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield
// when there is a single timeline with more than L0 threshold L0 layers. As long as the
// `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction.
if image_preempt_threshold != 0 {
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!(
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers",
partition.start().unwrap(), partition.end().unwrap()
);
last_partition_processed = Some(partition.clone());
all_generated = false;
break;
}
}
}
}
@@ -5037,14 +5177,16 @@ impl Timeline {
.map(|l| l.metadata().file_size)
.sum::<u64>();
info!(
"created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
image_layers.len(),
total_layer_size,
duration.as_secs_f64(),
partition_processed,
total_partitions
);
if !image_layers.is_empty() {
info!(
"created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
image_layers.len(),
total_layer_size,
duration.as_secs_f64(),
partition_processed,
total_partitions
);
}
Ok((
image_layers,
@@ -5187,6 +5329,8 @@ pub(crate) enum CompactionError {
#[error("Failed to offload timeline: {0}")]
Offload(OffloadError),
/// Compaction cannot be done right now; page reconstruction and so on.
#[error("Failed to collect keyspace: {0}")]
CollectKeySpaceError(CollectKeySpaceError),
#[error(transparent)]
Other(anyhow::Error),
}
@@ -5577,7 +5721,7 @@ impl Timeline {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_latest_gc_cutoff_lsn(),
time: *self.get_applied_gc_cutoff_lsn(),
space: space_cutoff,
}
}
@@ -5698,7 +5842,7 @@ impl Timeline {
let mut result: GcResult = GcResult::default();
// Nothing to GC. Return early.
let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn();
if latest_gc_cutoff >= new_gc_cutoff {
info!(
"Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
@@ -5712,7 +5856,7 @@ impl Timeline {
//
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
let write_guard = self.applied_gc_cutoff_lsn.lock_for_write();
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
@@ -6652,18 +6796,32 @@ fn is_send() {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use tracing::Instrument;
use utils::{id::TimelineId, lsn::Lsn};
use crate::tenant::{
harness::{test_img, TenantHarness},
layer_map::LayerMap,
storage_layer::{Layer, LayerName},
storage_layer::{Layer, LayerName, LayerVisibilityHint},
timeline::{DeltaLayerTestDesc, EvictionError},
Timeline,
PreviousHeatmap, Timeline,
};
use super::HeatMapTimeline;
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
assert_eq!(lhs.layers.len(), rhs.layers.len());
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
for (l, r) in lhs_rhs {
assert_eq!(l.name, r.name);
assert_eq!(l.metadata, r.metadata);
}
}
#[tokio::test]
async fn test_heatmap_generation() {
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
@@ -6737,7 +6895,7 @@ mod tests {
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
let mut last_lsn = Lsn::MAX;
for layer in heatmap.layers {
for layer in &heatmap.layers {
// Covered layer should be omitted
assert!(layer.name != covered_delta.layer_name());
@@ -6752,6 +6910,144 @@ mod tests {
last_lsn = layer_lsn;
}
}
// Evict all the layers and stash the old heatmap in the timeline.
// This simulates a migration to a cold secondary location.
let guard = timeline.layers.read().await;
let mut all_layers = Vec::new();
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
all_layers.push(layer.clone());
layer.evict_and_wait(forever).await.unwrap();
}
drop(guard);
timeline
.previous_heatmap
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
})));
// Generate a new heatmap and assert that it contains the same layers as the old one.
let post_migration_heatmap = timeline.generate_heatmap().await.unwrap();
assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap);
// Download each layer one by one. Generate the heatmap at each step and check
// that it's stable.
for layer in all_layers {
if layer.visibility() == LayerVisibilityHint::Covered {
continue;
}
eprintln!("Downloading {layer} and re-generating heatmap");
let _resident = layer
.download_and_keep_resident()
.instrument(tracing::info_span!(
parent: None,
"download_layer",
tenant_id = %timeline.tenant_shard_id.tenant_id,
shard_id = %timeline.tenant_shard_id.shard_slug(),
timeline_id = %timeline.timeline_id
))
.await
.unwrap();
let post_download_heatmap = timeline.generate_heatmap().await.unwrap();
assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap);
}
// Everything from the post-migration heatmap is now resident.
// Check that we drop it from memory.
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)
));
}
#[tokio::test]
async fn test_previous_heatmap_obsoletion() {
let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion")
.await
.unwrap();
let l0_delta = DeltaLayerTestDesc::new(
Lsn(0x20)..Lsn(0x30),
Key::from_hex("000000000000000000000000000000000000").unwrap()
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
vec![(
Key::from_hex("720000000033333333444444445500000000").unwrap(),
Lsn(0x25),
Value::Image(test_img("foo")),
)],
);
let image_layer = (
Lsn(0x40),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
test_img("bar"),
)],
);
let delta_layers = vec![l0_delta];
let image_layers = vec![image_layer];
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
&ctx,
delta_layers,
image_layers,
Lsn(0x100),
)
.await
.unwrap();
// Layer visibility is an input to heatmap generation, so refresh it first
timeline.update_layer_visibility().await.unwrap();
let heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
// Both layers should be in the heatmap
assert!(!heatmap.layers.is_empty());
// Now simulate a migration.
timeline
.previous_heatmap
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
})));
// Evict all the layers in the previous heatmap
let guard = timeline.layers.read().await;
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
layer.evict_and_wait(forever).await.unwrap();
}
drop(guard);
// Generate a new heatmap and check that the previous heatmap
// has been marked obsolete.
let post_eviction_heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
assert!(post_eviction_heatmap.layers.is_empty());
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)
));
}
#[tokio::test]

View File

@@ -11,7 +11,8 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration,
Timeline,
};
use anyhow::{anyhow, bail, Context};
@@ -31,6 +32,7 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
@@ -301,18 +303,12 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.gc_guards.insert(id, gc_guard);
}
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
self.notify_and_unblock(id);
}
}
GcCompactionQueueItem::SubCompactionJob(options) => {
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
}
GcCompactionQueueItem::Notify(id) => {
self.notify_and_unblock(id);
@@ -692,21 +688,6 @@ impl Timeline {
// Define partitioning schema if needed
let l0_l1_boundary_lsn = {
// We do the repartition on the L0-L1 boundary. All data below the boundary
// are compacted by L0 with low read amplification, thus making the `repartition`
// function run fast.
let guard = self.layers.read().await;
let l0_min_lsn = guard
.layer_map()?
.level0_deltas()
.iter()
.map(|l| l.get_lsn_range().start)
.min()
.unwrap_or(self.get_disk_consistent_lsn());
l0_min_lsn.max(self.get_ancestor_lsn())
};
// 1. L0 Compact
let l0_outcome = {
let timer = self.metrics.compact_time_histo.start_timer();
@@ -733,86 +714,89 @@ impl Timeline {
return Ok(CompactionOutcome::YieldForL0);
}
if l0_l1_boundary_lsn < self.partitioning.read().1 {
// We never go backwards when repartition and create image layers.
info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN.");
} else {
// 2. Repartition and create image layers if necessary
match self
.repartition(
l0_l1_boundary_lsn,
self.get_compaction_target_size(),
options.flags,
ctx,
)
.await
{
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
.build();
// 2. Repartition and create image layers if necessary
match self
.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
options.flags,
ctx,
)
.await
{
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
.build();
let mut partitioning = dense_partitioning;
partitioning
.parts
.extend(sparse_partitioning.into_dense().parts);
let mut partitioning = dense_partitioning;
partitioning
.parts
.extend(sparse_partitioning.into_dense().parts);
// 3. Create new image layers for partitions that have been modified "enough".
let (image_layers, outcome) = self
.create_image_layers(
&partitioning,
lsn,
if options
.flags
.contains(CompactFlags::ForceImageLayerCreation)
{
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
self.last_image_layer_creation_status
.load()
.as_ref()
.clone(),
!options.flags.contains(CompactFlags::NoYield),
)
.await
.inspect_err(|err| {
if let CreateImageLayersError::GetVectoredError(
GetVectoredError::MissingKey(_),
) = err
{
critical!("missing key during compaction: {err:?}");
}
})?;
// 3. Create new image layers for partitions that have been modified "enough".
let (image_layers, outcome) = self
.create_image_layers(
&partitioning,
lsn,
if options
.flags
.contains(CompactFlags::ForceImageLayerCreation)
{
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
self.last_image_layer_creation_status
.load()
.as_ref()
.clone(),
!options.flags.contains(CompactFlags::NoYield),
)
.await
.inspect_err(|err| {
if let CreateImageLayersError::GetVectoredError(
GetVectoredError::MissingKey(_),
) = err
{
critical!("missing key during compaction: {err:?}");
}
})?;
self.last_image_layer_creation_status
.store(Arc::new(outcome.clone()));
self.last_image_layer_creation_status
.store(Arc::new(outcome.clone()));
self.upload_new_image_layers(image_layers)?;
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
// Yield and do not do any other kind of compaction.
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
return Ok(CompactionOutcome::YieldForL0);
}
self.upload_new_image_layers(image_layers)?;
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
// Yield and do not do any other kind of compaction.
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
return Ok(CompactionOutcome::YieldForL0);
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline
// as a simple key-value store, ignoring the datadir layout. Log the
// error but continue.
//
// Suppress error when it's due to cancellation
if !self.cancel.is_cancelled() && !err.is_cancelled() {
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline
// as a simple key-value store, ignoring the datadir layout. Log the
// error but continue.
//
// Suppress error when it's due to cancellation
if !self.cancel.is_cancelled() && !err.is_cancelled() {
if let CompactionError::CollectKeySpaceError(
CollectKeySpaceError::Decode(_)
| CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)),
) = err
{
critical!("could not compact, repartitioning keyspace failed: {err:?}");
} else {
tracing::error!(
"could not compact, repartitioning keyspace failed: {err:?}"
);
}
}
};
}
}
};
let partition_count = self.partitioning.read().0 .0.parts.len();
@@ -852,7 +836,7 @@ impl Timeline {
//
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
// are rewriting layers.
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
@@ -2202,7 +2186,7 @@ impl Timeline {
// TODO: ensure the child branches will not use anything below the watermark, or consider
// them when computing the watermark.
gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn())
gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn())
}
/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.

View File

@@ -294,6 +294,7 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
None, // Previous heatmap is not needed for deletion
tenant.get_timeline_resources_for(remote_client),
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -0,0 +1,162 @@
//! Timeline utility module to hydrate everything from the current heatmap.
//!
//! Provides utilities to spawn and abort a background task where the downloads happen.
//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers.
use futures::StreamExt;
use http_utils::error::ApiError;
use std::sync::{Arc, Mutex};
use tokio_util::sync::CancellationToken;
use utils::sync::gate::Gate;
use super::Timeline;
// This status is not strictly necessary now, but gives us a nice place
// to store progress information if we ever wish to expose it.
pub(super) enum HeatmapLayersDownloadStatus {
InProgress,
Complete,
}
pub(super) struct HeatmapLayersDownloader {
handle: tokio::task::JoinHandle<()>,
status: Arc<Mutex<HeatmapLayersDownloadStatus>>,
cancel: CancellationToken,
downloads_guard: Arc<Gate>,
}
impl HeatmapLayersDownloader {
fn new(
timeline: Arc<Timeline>,
concurrency: usize,
) -> Result<HeatmapLayersDownloader, ApiError> {
let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;
let cancel = timeline.cancel.child_token();
let downloads_guard = Arc::new(Gate::default());
let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress));
let handle = tokio::task::spawn({
let status = status.clone();
let downloads_guard = downloads_guard.clone();
let cancel = cancel.clone();
async move {
let _guard = tl_guard;
scopeguard::defer! {
*status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete;
}
let Some(heatmap) = timeline.generate_heatmap().await else {
tracing::info!("Heatmap layers download failed to generate heatmap");
return;
};
tracing::info!(
resident_size=%timeline.resident_physical_size(),
heatmap_layers=%heatmap.layers.len(),
"Starting heatmap layers download"
);
let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
|layer| {
let tl = timeline.clone();
let dl_guard = match downloads_guard.enter() {
Ok(g) => g,
Err(_) => {
// [`Self::shutdown`] was called. Don't spawn any more downloads.
return None;
}
};
Some(async move {
let _dl_guard = dl_guard;
let res = tl.download_layer(&layer.name).await;
if let Err(err) = res {
if !err.is_cancelled() {
tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}")
}
}
})
}
)).buffered(concurrency);
tokio::select! {
_ = stream.collect::<()>() => {
tracing::info!(
resident_size=%timeline.resident_physical_size(),
"Heatmap layers download completed"
);
},
_ = cancel.cancelled() => {
tracing::info!("Heatmap layers download cancelled");
}
}
}
});
Ok(Self {
status,
handle,
cancel,
downloads_guard,
})
}
fn is_complete(&self) -> bool {
matches!(
*self.status.lock().unwrap(),
HeatmapLayersDownloadStatus::Complete
)
}
/// Drive any in-progress downloads to completion and stop spawning any new ones.
///
/// This has two callers and they behave differently
/// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves
/// are sensitive to timeline cancellation.
///
/// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress
/// downloads to complete.
async fn stop_and_drain(self) {
// Counterintuitive: close the guard before cancelling.
// Something needs to poll the already created download futures to completion.
// If we cancel first, then the underlying task exits and we lost
// the poller.
self.downloads_guard.close().await;
self.cancel.cancel();
if let Err(err) = self.handle.await {
tracing::warn!("Failed to join heatmap layer downloader task: {err}");
}
}
}
impl Timeline {
pub(crate) async fn start_heatmap_layers_download(
self: &Arc<Self>,
concurrency: usize,
) -> Result<(), ApiError> {
let mut locked = self.heatmap_layers_downloader.lock().unwrap();
if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?;
*locked = Some(dl);
Ok(())
} else {
Err(ApiError::Conflict("Already running".to_string()))
}
}
pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) {
// This can race with the start of a new downloader and lead to a situation
// where one donloader is shutting down and another one is in-flight.
// The only impact is that we'd end up using more remote storage semaphore
// units than expected.
let downloader = self.heatmap_layers_downloader.lock().unwrap().take();
if let Some(dl) = downloader {
dl.stop_and_drain().await;
}
}
}

View File

@@ -7,7 +7,9 @@ use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
use crate::tenant::{
DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
};
#[derive(thiserror::Error, Debug)]
pub(crate) enum OffloadError {
@@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline(
debug_assert_current_span_has_tenant_and_timeline_id();
tracing::info!("offloading archived timeline");
let (timeline, guard) = make_timeline_delete_guard(
let delete_guard_res = make_timeline_delete_guard(
tenant,
timeline.timeline_id,
TimelineDeleteGuardKind::Offload,
)
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
);
if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res {
let is_archived = timeline.is_archived();
if is_archived == Some(true) {
tracing::error!("timeline is archived but has non-archived children: {children:?}");
return Err(OffloadError::NotArchived);
}
tracing::info!(
?is_archived,
"timeline is not archived and has unarchived children"
);
return Err(OffloadError::NotArchived);
};
let (timeline, guard) =
delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
tracing::error!("timeline already offloaded, but given timeline object");

View File

@@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
/// bad storage or bad configuration, and we can't fix that from inside
/// a running process.
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
tracing::error!("Fatal I/O error: {e}: {context})");
let backtrace = std::backtrace::Backtrace::force_capture();
tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}");
std::process::abort();
}
@@ -947,13 +948,18 @@ impl VirtualFileInner {
where
Buf: tokio_epoll_uring::IoBufMut + Send,
{
let file_guard = match self.lock_file().await {
let file_guard = match self
.lock_file()
.await
.maybe_fatal_err("lock_file inside VirtualFileInner::read_at")
{
Ok(file_guard) => file_guard,
Err(e) => return (buf, Err(e)),
};
observe_duration!(StorageIoOperation::Read, {
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
if let Ok(size) = res {
STORAGE_IO_SIZE
.with_label_values(&[

View File

@@ -14,10 +14,12 @@
#include "utils/guc.h"
#include "extension_server.h"
#include "extension_server.h"
#include "neon_utils.h"
static int extension_server_port = 0;
static int extension_server_request_timeout = 60;
static int extension_server_connect_timeout = 60;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
@@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
static CURL *handle = NULL;
CURLcode res;
char *compute_ctl_url;
bool ret = false;
CURL *handle = NULL;
char *compute_ctl_url;
if (handle == NULL)
{
handle = alloc_curl_handle();
handle = alloc_curl_handle();
curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
}
curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
if (extension_server_request_timeout > 0)
curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ );
if (extension_server_connect_timeout > 0)
curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ );
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
@@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library)
/* Perform the request, res will get the return code */
res = curl_easy_perform(handle);
curl_easy_cleanup(handle);
/* Check for errors */
if (res == CURLE_OK)
{
@@ -88,6 +91,24 @@ pg_init_extension_server()
0, /* no flags required */
NULL, NULL, NULL);
DefineCustomIntVariable("neon.extension_server_request_timeout",
"timeout for fetching extensions in seconds",
NULL,
&extension_server_request_timeout,
60, 0, INT_MAX,
PGC_SUSET,
GUC_UNIT_S,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.extension_server_connect_timeout",
"timeout for connecting to the extension server in seconds",
NULL,
&extension_server_connect_timeout,
60, 0, INT_MAX,
PGC_SUSET,
GUC_UNIT_S,
NULL, NULL, NULL);
/* set download_extension_file_hook */
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;

View File

@@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
index = hash >> HLL_C_BITS;
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1;
Assert(count <= HLL_C_BITS);
cState->regs[index][count] = now;
}
@@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since)
{
if (reg[i] >= since)
{
max = i;
max = i + 1;
}
}

View File

@@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
{
case PS_Disconnected:
{
const char *keywords[3];
const char *values[3];
const char *keywords[4];
const char *values[4];
char pid_str[16];
int n_pgsql_params;
TimestampTz now;
int64 us_since_last_attempt;
@@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel)
* can override the password from the env variable. Seems useful, although
* we don't currently use that capability anywhere.
*/
keywords[0] = "dbname";
values[0] = connstr;
n_pgsql_params = 1;
n_pgsql_params = 0;
/*
* Pageserver logs include this in the connection's tracing span.
* This allows for reasier log correlation between compute and pageserver.
*/
keywords[n_pgsql_params] = "application_name";
{
int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid);
if (ret < 0 || ret >= (int)(sizeof(pid_str)))
elog(FATAL, "stack-allocated buffer too small to hold pid");
}
/* lifetime: PQconnectStartParams strdups internally */
values[n_pgsql_params] = (const char*) pid_str;
n_pgsql_params++;
keywords[n_pgsql_params] = "dbname";
values[n_pgsql_params] = connstr;
n_pgsql_params++;
if (neon_auth_token)
{
keywords[1] = "password";
values[1] = neon_auth_token;
keywords[n_pgsql_params] = "password";
values[n_pgsql_params] = neon_auth_token;
n_pgsql_params++;
}

View File

@@ -3765,7 +3765,7 @@ neon_dbsize(Oid dbNode)
* neon_truncate() -- Truncate relation to specified number of blocks.
*/
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
XLogRecPtr lsn;
@@ -3780,7 +3780,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
mdtruncate(reln, forknum, nblocks);
mdtruncate(reln, forknum, old_blocks, nblocks);
return;
default:
@@ -3818,7 +3818,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdtruncate(reln, forknum, nblocks);
mdtruncate(reln, forknum, old_blocks, nblocks);
#endif
}

View File

@@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
BlockNumber old_blocks, BlockNumber nblocks);
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
#if PG_MAJORVERSION_NUM >= 17
static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
@@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
* inmem_truncate() -- Truncate relation to specified number of blocks.
*/
static void
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
}

25
poetry.lock generated
View File

@@ -412,6 +412,7 @@ files = [
[package.dependencies]
botocore-stubs = "*"
mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""}
mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""}
types-s3transfer = "*"
typing-extensions = ">=4.1.0"
@@ -2022,6 +2023,18 @@ install-types = ["pip"]
mypyc = ["setuptools (>=50)"]
reports = ["lxml"]
[[package]]
name = "mypy-boto3-kms"
version = "1.26.147"
description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"},
{file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"},
]
[[package]]
name = "mypy-boto3-s3"
version = "1.26.0.post1"
@@ -2758,18 +2771,18 @@ pytest = ">=5,<8"
[[package]]
name = "pytest-timeout"
version = "2.1.0"
version = "2.3.1"
description = "pytest plugin to abort hanging tests"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"},
{file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"},
{file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"},
{file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"},
]
[package.dependencies]
pytest = ">=5.0.0"
pytest = ">=7.0.0"
[[package]]
name = "pytest-xdist"
@@ -3807,4 +3820,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "^3.11"
content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7"
content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a"

View File

@@ -19,7 +19,6 @@ aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
base64.workspace = true
boxcar = "0.2.8"
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
@@ -63,7 +62,6 @@ postgres_backend.workspace = true
postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" }
postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" }
pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
@@ -81,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] }
smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
strum.workspace = true
strum_macros.workspace = true
subtle.workspace = true
thiserror.workspace = true
@@ -95,7 +92,6 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
tracing-log.workspace = true
tracing-serde.workspace = true
tracing-opentelemetry.workspace = true
try-lock.workspace = true
typed-json.workspace = true

View File

@@ -140,9 +140,8 @@ async fn authenticate(
let (psql_session_id, waiter) = loop {
let psql_session_id = new_psql_session_id();
match control_plane::mgmt::get_waiter(&psql_session_id) {
Ok(waiter) => break (psql_session_id, waiter),
Err(_e) => continue,
if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) {
break (psql_session_id, waiter);
}
};

View File

@@ -220,11 +220,11 @@ async fn fetch_jwks(
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
async fn acquire_permit(self: &Arc<Self>) -> JwkRenewalPermit<'_> {
JwkRenewalPermit::acquire_permit(self).await
}
fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
fn try_acquire_permit(self: &Arc<Self>) -> Option<JwkRenewalPermit<'_>> {
JwkRenewalPermit::try_acquire_permit(self)
}
@@ -393,7 +393,7 @@ impl JwkCacheEntryLock {
verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?;
}
key => return Err(JwtError::UnsupportedKeyType(key.into())),
};
}
tracing::debug!(?payload, "JWT signature valid with claims");
@@ -510,7 +510,7 @@ fn verify_rsa_signature(
key.verify(data, &sig)?;
}
_ => return Err(JwtError::InvalidRsaSigningAlgorithm),
};
}
Ok(())
}

View File

@@ -4,6 +4,20 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{bail, ensure, Context};
use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use compute_api::spec::LocalProxySpec;
use futures::future::Either;
use thiserror::Error;
use tokio::net::TcpListener;
use tokio::sync::Notify;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use utils::sentry_init::init_sentry;
use utils::{pid_file, project_build_tag, project_git_version};
use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP};
use crate::auth::{self};
@@ -25,24 +39,10 @@ use crate::serverless::{self, GlobalConnPoolOptions};
use crate::tls::client_config::compute_client_config_with_root_certs;
use crate::types::RoleName;
use crate::url::ApiUrl;
use anyhow::{bail, ensure, Context};
use camino::{Utf8Path, Utf8PathBuf};
use compute_api::spec::LocalProxySpec;
use futures::future::Either;
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
use clap::Parser;
use thiserror::Error;
use tokio::net::TcpListener;
use tokio::sync::Notify;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use utils::sentry_init::init_sentry;
use utils::{pid_file, project_build_tag, project_git_version};
/// Neon proxy/router
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]

View File

@@ -5,12 +5,6 @@
/// the outside. Similar to an ingress controller for HTTPS.
use std::{net::SocketAddr, sync::Arc};
use crate::context::RequestContext;
use crate::metrics::{Metrics, ThreadPoolMetrics};
use crate::protocol2::ConnectionInfo;
use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
use crate::stream::{PqStream, Stream};
use crate::tls::TlsServerEndPoint;
use anyhow::{anyhow, bail, ensure, Context};
use clap::Arg;
use futures::future::Either;
@@ -25,6 +19,13 @@ use tracing::{error, info, Instrument};
use utils::project_git_version;
use utils::sentry_init::init_sentry;
use crate::context::RequestContext;
use crate::metrics::{Metrics, ThreadPoolMetrics};
use crate::protocol2::ConnectionInfo;
use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
use crate::stream::{PqStream, Stream};
use crate::tls::TlsServerEndPoint;
project_git_version!(GIT_VERSION);
fn cli() -> clap::Command {

View File

@@ -3,6 +3,16 @@ use std::pin::pin;
use std::sync::Arc;
use std::time::Duration;
use anyhow::bail;
use futures::future::Either;
use remote_storage::RemoteStorageConfig;
use tokio::net::TcpListener;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{info, warn, Instrument};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version};
use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
use crate::cancellation::{handle_cancel_messages, CancellationHandler};
@@ -24,15 +34,6 @@ use crate::serverless::cancel_set::CancelSet;
use crate::serverless::GlobalConnPoolOptions;
use crate::tls::client_config::compute_client_config_with_root_certs;
use crate::{auth, control_plane, http, serverless, usage_metrics};
use anyhow::bail;
use futures::future::Either;
use remote_storage::RemoteStorageConfig;
use tokio::net::TcpListener;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{info, warn, Instrument};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
@@ -303,7 +304,7 @@ pub async fn run() -> anyhow::Result<()> {
match auth_backend {
Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
};
}
info!("Using region: {}", args.aws_region);
// TODO: untangle the config args
@@ -803,9 +804,10 @@ fn build_auth_backend(
mod tests {
use std::time::Duration;
use crate::rate_limiter::RateBucketInfo;
use clap::Parser;
use crate::rate_limiter::RateBucketInfo;
#[test]
fn parse_endpoint_rps_limit() {
let config = super::ProxyCliArgs::parse_from([

View File

@@ -242,7 +242,7 @@ impl EndpointsCache {
});
tracing::error!("error parsing value {value:?}: {err:?}");
}
};
}
}
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);

View File

@@ -501,7 +501,7 @@ impl Session {
_guard: Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::HSet),
.guard(RedisMsgKind::HDel),
};
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {

View File

@@ -137,8 +137,8 @@ impl ConnCfg {
match k {
// Only set `user` if it's not present in the config.
// Console redirect auth flow takes username from the console's response.
"user" if self.user_is_set() => continue,
"database" if self.db_is_set() => continue,
"user" if self.user_is_set() => {}
"database" if self.db_is_set() => {}
"options" => {
if let Some(options) = filtered_options(v) {
self.set_param(k, &options);

View File

@@ -82,7 +82,7 @@ pub async fn task_main(
error!("per-client task finished with an error: failed to set socket option: {e:#}");
return;
}
};
}
let ctx = RequestContext::new(
session_id,

View File

@@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru};
use crate::config::ComputeConfig;
use crate::context::RequestContext;
use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
use crate::intern::AccountIdInt;
use crate::intern::ProjectIdInt;
use crate::intern::{AccountIdInt, ProjectIdInt};
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, scram};

View File

@@ -7,9 +7,8 @@ use chrono::{DateTime, Utc};
use opentelemetry::trace::TraceContextExt;
use scopeguard::defer;
use serde::ser::{SerializeMap, Serializer};
use tracing::span;
use tracing::subscriber::Interest;
use tracing::{callsite, Event, Metadata, Span, Subscriber};
use tracing::{callsite, span, Event, Metadata, Span, Subscriber};
use tracing_opentelemetry::OpenTelemetrySpanExt;
use tracing_subscriber::filter::{EnvFilter, LevelFilter};
use tracing_subscriber::fmt::format::{Format, Full};

View File

@@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
// if no more bytes available then exit
if bytes_read == 0 {
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
};
}
// check if we have enough bytes to continue
if let Some(header) = buf.try_get::<ProxyProtocolV2Header>() {
@@ -169,7 +169,7 @@ fn process_proxy_payload(
header.version_and_command
),
)),
};
}
let size_err =
"invalid proxy protocol length. payload not large enough to fit requested IP addresses";

View File

@@ -198,7 +198,7 @@ where
warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT);
}
};
}
let wait_duration = retry_after(num_retries, compute.retry);
num_retries += 1;

View File

@@ -118,7 +118,7 @@ pub async fn task_main(
error!("per-client task finished with an error: failed to set socket option: {e:#}");
return;
}
};
}
let ctx = RequestContext::new(
session_id,

View File

@@ -169,7 +169,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
});
tracing::error!("broken message: {e}");
}
};
}
return Ok(());
}
Ok(msg) => msg,
@@ -180,7 +180,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
match serde_json::from_str::<NotificationHeader>(&payload) {
Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"),
Err(_) => tracing::error!("broken message: {e}"),
};
}
return Ok(());
}
};

Some files were not shown because too many files have changed in this diff Show More