Compare commits

..

1 Commits

202 changed files with 3624 additions and 7202 deletions

View File

@@ -8,10 +8,8 @@
!scripts/ninstall.sh
!docker-compose/run-tests.sh
!.cargo/config.toml
# Directories
#!.cargo
!.cargo/
!.config/
!compute/
!compute_tools/

View File

@@ -348,10 +348,6 @@ jobs:
rerun_failed: true
pg_version: ${{ matrix.pg_version }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
# `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
# Attempt to stop tests gracefully to generate test reports
# until they are forcibly stopped by the stricter `timeout-minutes` limit.
extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

56
.github/workflows/_push-to-acr.yml vendored Normal file
View File

@@ -0,0 +1,56 @@
name: Push images to ACR
on:
workflow_call:
inputs:
client_id:
description: Client ID of Azure managed identity or Entra app
required: true
type: string
image_tag:
description: Tag for the container image
required: true
type: string
images:
description: Images to push
required: true
type: string
registry_name:
description: Name of the container registry
required: true
type: string
subscription_id:
description: Azure subscription ID
required: true
type: string
tenant_id:
description: Azure tenant ID
required: true
type: string
jobs:
push-to-acr:
runs-on: ubuntu-22.04
permissions:
contents: read # This is required for actions/checkout
id-token: write # This is required for Azure Login to work.
steps:
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ inputs.client_id }}
subscription-id: ${{ inputs.subscription_id }}
tenant-id: ${{ inputs.tenant_id }}
- name: Login to ACR
run: |
az acr login --name=${{ inputs.registry_name }}
- name: Copy docker images to ACR ${{ inputs.registry_name }}
run: |
images='${{ inputs.images }}'
for image in ${images}; do
docker buildx imagetools create \
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
neondatabase/${image}:${{ inputs.image_tag }}
done

View File

@@ -1,101 +0,0 @@
name: Push images to Container Registry
on:
workflow_call:
inputs:
# Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
image-map:
description: JSON map of images, mapping from a source image to an array of target images that should be pushed.
required: true
type: string
aws-region:
description: AWS region to log in to. Required when pushing to ECR.
required: false
type: string
aws-account-ids:
description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR.
required: false
type: string
azure-client-id:
description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR.
required: false
type: string
azure-subscription-id:
description: Azure subscription ID. Required when pushing to ACR.
required: false
type: string
azure-tenant-id:
description: Azure tenant ID. Required when pushing to ACR.
required: false
type: string
acr-registry-name:
description: ACR registry name. Required when pushing to ACR.
required: false
type: string
secrets:
docker-hub-username:
description: Docker Hub username. Required when pushing to Docker Hub.
required: false
docker-hub-password:
description: Docker Hub password. Required when pushing to Docker Hub.
required: false
aws-role-to-assume:
description: AWS role to assume. Required when pushing to ECR.
required: false
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
push-to-container-registry:
runs-on: ubuntu-22.04
permissions:
id-token: write # Required for aws/azure login
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: scripts/push_with_image_map.py
sparse-checkout-cone-mode: false
- name: Print image-map
run: echo '${{ inputs.image-map }}' | jq
- name: Configure AWS credentials
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: "${{ inputs.aws-region }}"
role-to-assume: "${{ secrets.aws-role-to-assume }}"
role-duration-seconds: 3600
- name: Login to ECR
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/amazon-ecr-login@v2
with:
registries: "${{ inputs.aws-account-ids }}"
- name: Configure Azure credentials
if: contains(inputs.image-map, 'azurecr.io/')
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ inputs.azure-client-id }}
subscription-id: ${{ inputs.azure-subscription-id }}
tenant-id: ${{ inputs.azure-tenant-id }}
- name: Login to ACR
if: contains(inputs.image-map, 'azurecr.io/')
run: |
az acr login --name=${{ inputs.acr-registry-name }}
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.docker-hub-username }}
password: ${{ secrets.docker-hub-password }}
- name: Copy docker images to target registries
run: python scripts/push_with_image_map.py
env:
IMAGE_MAP: ${{ inputs.image-map }}

View File

@@ -263,9 +263,8 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs
if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled())
needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
@@ -498,7 +497,7 @@ jobs:
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ]
needs: [ check-permissions, promote-images-dev, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
@@ -572,6 +571,21 @@ jobs:
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Push multi-arch image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, tag ]
permissions:
@@ -618,6 +632,16 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- uses: docker/login-action@v3
with:
registry: cache.neon.build
@@ -705,6 +729,21 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
@@ -837,109 +876,133 @@ jobs:
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
generate-image-maps:
needs: [ tag ]
promote-images-dev:
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
runs-on: ubuntu-22.04
outputs:
neon-dev: ${{ steps.generate.outputs.neon-dev }}
neon-prod: ${{ steps.generate.outputs.neon-prod }}
compute-dev: ${{ steps.generate.outputs.compute-dev }}
compute-prod: ${{ steps.generate.outputs.compute-prod }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
env:
VERSIONS: v14 v15 v16 v17
steps:
- uses: actions/checkout@v4
- uses: docker/login-action@v3
with:
sparse-checkout: scripts/generate_image_maps.py
sparse-checkout-cone-mode: false
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Generate Image Maps
id: generate
run: python scripts/generate_image_maps.py
env:
BUILD_TAG: "${{ needs.tag.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
push-neon-image-dev:
needs: [ generate-image-maps, neon-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
push-compute-image-dev:
needs: [ generate-image-maps, vm-compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Copy vm-compute-node images to ECR
run: |
for version in ${VERSIONS}; do
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
push-neon-image-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, neon-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
promote-images-prod:
needs: [ check-permissions, tag, test-images, promote-images-dev ]
runs-on: ubuntu-22.04
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
push-compute-image-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, vm-compute-node-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
env:
VERSIONS: v14 v15 v16 v17
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Add latest tag to images
if: github.ref_name == 'main'
run: |
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
- name: Configure AWS-prod credentials
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
mask-aws-account-id: true
role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}
- name: Login to prod ECR
uses: docker/login-action@v3
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
with:
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
- name: Copy all images to prod ECR
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
push-to-acr-dev:
if: github.ref_name == 'main'
needs: [ tag, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
needs: [ tag, promote-images-dev ]
uses: ./.github/workflows/_push-to-acr.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets:
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
push-to-acr-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ tag, promote-images-prod ]
uses: ./.github/workflows/_push-to-acr.yml
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
@@ -1021,7 +1084,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
permissions:
@@ -1274,7 +1337,7 @@ jobs:
done
pin-build-tools-image:
needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ]
needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
@@ -1299,8 +1362,7 @@ jobs:
- check-codestyle-rust
- check-dependencies-rust
- files-changed
- push-compute-image-dev
- push-neon-image-dev
- promote-images-dev
- test-images
- trigger-custom-extensions-build-and-wait
runs-on: ubuntu-22.04
@@ -1317,7 +1379,6 @@ jobs:
|| needs.check-codestyle-python.result == 'skipped'
|| needs.check-codestyle-rust.result == 'skipped'
|| needs.files-changed.result == 'skipped'
|| needs.push-compute-image-dev.result == 'skipped'
|| needs.push-neon-image-dev.result == 'skipped'
|| needs.promote-images-dev.result == 'skipped'
|| needs.test-images.result == 'skipped'
|| needs.trigger-custom-extensions-build-and-wait.result == 'skipped'

View File

@@ -1,76 +0,0 @@
name: Force Test Upgrading of Extension
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '45 2 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow
group: ${{ github.workflow }}
cancel-in-progress: true
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
jobs:
regress:
strategy:
fail-fast: false
matrix:
pg-version: [16, 17]
runs-on: small
steps:
- uses: actions/checkout@v4
with:
submodules: false
- name: Get the last compute release tag
id: get-last-compute-release-tag
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${GITHUB_REPOSITORY}/releases")
echo tag=${tag} >> ${GITHUB_OUTPUT}
- name: Test extension upgrade
timeout-minutes: 20
env:
NEWTAG: latest
OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
PG_VERSION: ${{ matrix.pg-version }}
FORCE_ALL_UPGRADE_TESTS: true
run: ./docker-compose/test_extensions_upgrade.sh
- name: Print logs and clean up
if: always()
run: |
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
- name: Post to the Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
slack-message: |
Test upgrading of extensions: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -1,41 +0,0 @@
name: Regenerate Postgres Settings
on:
pull_request:
types:
- opened
- synchronize
- reopened
paths:
- pgxn/neon/**.c
- vendor/postgres-v*
- vendor/revisions.json
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
pull-requests: write
jobs:
regenerate-pg-settings:
runs-on: ubuntu-22.04
steps:
- name: Add comment
uses: thollander/actions-comment-pull-request@v3
with:
comment-tag: ${{ github.job }}
pr-number: ${{ github.event.number }}
message: |
If this PR added a GUC in the Postgres fork or `neon` extension,
please regenerate the Postgres settings in the `cloud` repo:
```
make NEON_WORKDIR=path/to/neon/checkout \
-C goapp/internal/shareddomain/postgres generate
```
If you're an external contributor, a Neon employee will assist in
making sure this step is done.

View File

@@ -15,14 +15,7 @@ env:
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
@@ -36,7 +29,6 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
@@ -76,7 +68,7 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Wait for `push-{neon,compute}-image-dev` job to finish
- name: Wait for `promote-images-dev` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
timeout-minutes: 60
run: |
@@ -87,20 +79,20 @@ jobs:
# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json
if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then
break
fi
jq -c '.[]' jobs.json | while read -r job; do
case $(echo $job | jq .conclusion) in
failure | cancelled | skipped)
echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..."
exit 1
;;
esac
done
echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..."
sleep 60
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..."
exit 1
;;
*)
echo "The 'promote-images-dev' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
done
- name: Set e2e-platforms

145
Cargo.lock generated
View File

@@ -300,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "aws-config"
version = "1.5.10"
version = "1.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924"
checksum = "c03a50b30228d3af8865ce83376b4e99e1ffa34728220fe2860e4df0bb5278d6"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -311,7 +311,7 @@ dependencies = [
"aws-sdk-sts",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.60.7",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -342,9 +342,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
version = "1.4.4"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea"
checksum = "b16d1aa50accc11a4b4d5c50f7fb81cc0cf60328259c587d0e6b0f11385bde46"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -368,15 +368,15 @@ dependencies = [
[[package]]
name = "aws-sdk-iam"
version = "1.53.0"
version = "1.56.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a"
checksum = "f2a089a65ceba0f649be19c7b2213d2e009bd7700159e280f03ad5ed2828d30c"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
@@ -391,15 +391,15 @@ dependencies = [
[[package]]
name = "aws-sdk-kms"
version = "1.51.0"
version = "1.54.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96"
checksum = "a6cf16c0e5853312995505557b876dd3f9fb9941e96d031383528ccef14ace57"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -413,9 +413,9 @@ dependencies = [
[[package]]
name = "aws-sdk-s3"
version = "1.65.0"
version = "1.68.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e"
checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -424,7 +424,7 @@ dependencies = [
"aws-smithy-checksums",
"aws-smithy-eventstream",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -447,15 +447,15 @@ dependencies = [
[[package]]
name = "aws-sdk-sso"
version = "1.50.0"
version = "1.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab"
checksum = "1605dc0bf9f0a4b05b451441a17fcb0bda229db384f23bf5cead3adbab0664ac"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -469,15 +469,15 @@ dependencies = [
[[package]]
name = "aws-sdk-ssooidc"
version = "1.51.0"
version = "1.54.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0"
checksum = "59f3f73466ff24f6ad109095e0f3f2c830bfb4cd6c8b12f744c8e61ebf4d3ba1"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -491,15 +491,15 @@ dependencies = [
[[package]]
name = "aws-sdk-sts"
version = "1.51.0"
version = "1.54.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf"
checksum = "249b2acaa8e02fd4718705a9494e3eb633637139aa4bb09d70965b0448e865db"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json 0.61.1",
"aws-smithy-json",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
@@ -543,9 +543,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
version = "1.2.1"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
checksum = "427cb637d15d63d6f9aae26358e1c9a9c09d5aa490d64b09354c8217cfef0f28"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -605,15 +605,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "aws-smithy-json"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
dependencies = [
"aws-smithy-types",
]
[[package]]
name = "aws-smithy-json"
version = "0.61.1"
@@ -635,9 +626,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.7.4"
version = "1.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45"
checksum = "a05dd41a70fc74051758ee75b5c4db2c0ca070ed9229c3df50e9475cda1cb985"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -679,9 +670,9 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
version = "1.2.9"
version = "1.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510"
checksum = "38ddc9bd6c28aeb303477170ddd183760a956a03e083b3902a990238a7e3792d"
dependencies = [
"base64-simd",
"bytes",
@@ -786,7 +777,7 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -815,7 +806,7 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"async-lock",
"async-trait",
@@ -834,7 +825,7 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"RustyXML",
"async-lock",
@@ -852,7 +843,7 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"RustyXML",
"azure_core",
@@ -872,7 +863,7 @@ dependencies = [
[[package]]
name = "azure_svc_blobstorage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"azure_core",
"bytes",
@@ -1029,6 +1020,12 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxcar"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
[[package]]
name = "bstr"
version = "1.5.0"
@@ -1287,7 +1284,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"jsonwebtoken",
"regex",
"remote_storage",
"serde",
@@ -1315,10 +1311,6 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"http-body-util",
"hyper 1.4.1",
"hyper-util",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"notify",
@@ -1432,7 +1424,6 @@ dependencies = [
"comfy-table",
"compute_api",
"futures",
"http-utils",
"humantime",
"humantime-serde",
"hyper 0.14.30",
@@ -2757,38 +2748,6 @@ dependencies = [
"url",
]
[[package]]
name = "http-utils"
version = "0.1.0"
dependencies = [
"anyhow",
"backtrace",
"bytes",
"fail",
"flate2",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"metrics",
"once_cell",
"pprof",
"regex",
"routerify",
"serde",
"serde_json",
"serde_path_to_error",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"url",
"utils",
"uuid",
"workspace_hack",
]
[[package]]
name = "httparse"
version = "1.8.0"
@@ -4143,7 +4102,6 @@ dependencies = [
"futures",
"hex",
"hex-literal",
"http-utils",
"humantime",
"humantime-serde",
"hyper 0.14.30",
@@ -4244,7 +4202,6 @@ dependencies = [
"anyhow",
"bytes",
"futures",
"http-utils",
"pageserver_api",
"postgres",
"reqwest",
@@ -4926,6 +4883,7 @@ dependencies = [
"aws-sdk-iam",
"aws-sigv4",
"base64 0.13.1",
"boxcar",
"bstr",
"bytes",
"camino",
@@ -4950,7 +4908,6 @@ dependencies = [
"hostname",
"http 1.1.0",
"http-body-util",
"http-utils",
"humantime",
"humantime-serde",
"hyper 0.14.30",
@@ -4977,6 +4934,7 @@ dependencies = [
"postgres-protocol2",
"postgres_backend",
"pq_proto",
"prometheus",
"rand 0.8.5",
"rand_distr",
"rcgen",
@@ -5001,6 +4959,7 @@ dependencies = [
"smallvec",
"smol_str",
"socket2",
"strum",
"strum_macros",
"subtle",
"thiserror 1.0.69",
@@ -5015,6 +4974,7 @@ dependencies = [
"tracing",
"tracing-log",
"tracing-opentelemetry",
"tracing-serde",
"tracing-subscriber",
"tracing-utils",
"try-lock",
@@ -5795,7 +5755,6 @@ dependencies = [
"futures",
"hex",
"http 1.1.0",
"http-utils",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
@@ -5860,7 +5819,6 @@ dependencies = [
name = "safekeeper_client"
version = "0.1.0"
dependencies = [
"http-utils",
"reqwest",
"safekeeper_api",
"serde",
@@ -6443,7 +6401,6 @@ dependencies = [
"fail",
"futures",
"hex",
"http-utils",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
@@ -6455,13 +6412,10 @@ dependencies = [
"pageserver_client",
"postgres_connection",
"rand 0.8.5",
"regex",
"reqwest",
"routerify",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"safekeeper_api",
"safekeeper_client",
"scoped-futures",
"scopeguard",
"serde",
@@ -7611,38 +7565,48 @@ dependencies = [
"criterion",
"diatomic-waker",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"once_cell",
"pin-project-lite",
"postgres_connection",
"pprof",
"pq_proto",
"rand 0.8.5",
"regex",
"routerify",
"scopeguard",
"sentry",
"serde",
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"uuid",
"walkdir",
]
@@ -8237,7 +8201,6 @@ dependencies = [
"tracing-core",
"tracing-log",
"url",
"uuid",
"zerocopy",
"zeroize",
"zstd",

View File

@@ -18,7 +18,6 @@ members = [
"storage_scrubber",
"workspace_hack",
"libs/compute_api",
"libs/http-utils",
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",
@@ -230,7 +229,6 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }

View File

@@ -10,28 +10,6 @@ ARG STABLE_PG_VERSION=16
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
# Build Postgres
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
WORKDIR /home/nonroot
@@ -50,14 +28,6 @@ RUN set -e \
&& rm -rf pg_install/build \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
# Prepare cargo-chef recipe
FROM $REPOSITORY/$IMAGE:$TAG AS plan
WORKDIR /home/nonroot
COPY --chown=nonroot . .
RUN cargo chef prepare --recipe-path recipe.json
# Build neon binaries
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
@@ -71,15 +41,9 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
COPY --from=plan /home/nonroot/recipe.json recipe.json
ARG ADDITIONAL_RUSTFLAGS=""
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json
COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \
@@ -95,7 +59,7 @@ RUN set -e \
# Build final image
#
FROM $BASE_IMAGE_SHA
FROM debian:${DEBIAN_FLAVOR}
ARG DEFAULT_PG_VERSION
WORKDIR /data
@@ -148,3 +112,4 @@ EXPOSE 6400
EXPOSE 9898
CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]

View File

@@ -1,29 +1,6 @@
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
FROM $BASE_IMAGE_SHA AS pgcopydb_builder
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
@@ -32,7 +9,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
@@ -81,7 +58,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
fi
FROM $BASE_IMAGE_SHA AS build_tools
FROM debian:${DEBIAN_VERSION}-slim AS build_tools
ARG DEBIAN_VERSION
# Add nonroot user
@@ -98,7 +75,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
# System deps
@@ -161,8 +138,7 @@ RUN curl -fsSL \
--output sql_exporter.tar.gz \
&& mkdir /tmp/sql_exporter \
&& tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \
&& rm sql_exporter.tar.gz
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter
# protobuf-compiler (protoc)
ENV PROTOC_VERSION=25.1
@@ -300,7 +276,6 @@ ARG CARGO_HAKARI_VERSION=0.9.33
ARG CARGO_DENY_VERSION=0.16.2
ARG CARGO_HACK_VERSION=0.6.33
ARG CARGO_NEXTEST_VERSION=0.9.85
ARG CARGO_CHEF_VERSION=0.1.71
ARG CARGO_DIESEL_CLI_VERSION=2.2.6
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
@@ -315,7 +290,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \
--features postgres-bundled --no-default-features && \
rm -rf /home/nonroot/.cargo/registry && \

View File

@@ -5,39 +5,6 @@
# We use Debian as the base for all the steps. The production images use Debian bookworm
# for v17, and Debian bullseye for older PostgreSQL versions.
#
# This same Dockerfile can be used to build several kinds of target images:
#
# Target: compute-node
# --------------------
#
# Contains compute_ctl, Postgres, extensions, pgbouncer, and metrics exporters.
# Everything that's needed to provide the user-visible services of a compute
# endpoint. The target produces a docker image that's suitable for running
# compute_ctl in a docker container (compute_ctl is set as the entrypoint). The
# other services like pgbouncer are not launched when you execute this
# container, although the binaries are included in the image.
#
# When building old-style VM images with vm-builder, this is the input to
# vm-builder. See the vm-compute-node-image job in the build_and_test.yml github
# workflow for how that's done. For backwards-compatibility with the github
# action and any other scripts lying around, this is the default target.
#
# Target: compute-node-bootable
# -----------------------------
#
# Produces an image with systemd, and systemd configuration to run all the
# services. This is suitable for running in a VM. For testing, it can also be
# launched in a docker container with:
#
# docker run --name=compute-node --privileged neondatabase/compute-node-bootable:local /sbin/init
#
# Target: compute-node-neonvm-payload
# -----------------------------------
#
# Processes 'compute-node-bootable' into a QCOW2 image, suitable for loading with
# neonvm-guest
#
#
# ## Intermediary layers
#
# build-tools: This contains Rust compiler toolchain and other tools needed at compile
@@ -95,6 +62,19 @@
# The configuration files for the metrics exporters are under etc/ directory. We use
# a templating system to handle variations between different PostgreSQL versions,
# building slightly different config files for each PostgreSQL version.
#
#
# ## Final image
#
# The final image puts together the PostgreSQL binaries (pg-build), the compute tools
# (compute-tools), all the extensions (all-extensions) and the extra components into
# one image.
#
# VM image: The final image built by this dockerfile isn't actually the final image that
# we use in computes VMs. There's an extra step that adds some files and makes other
# small adjustments, and builds the QCOV2 filesystem image suitable for using in a VM.
# That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the
# build_and_test.yml github workflow for how that's done.
ARG PG_VERSION
ARG REPOSITORY=neondatabase
@@ -103,28 +83,7 @@ ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
ARG ALPINE_CURL_VERSION=8.11.1
# By default, build all PostgreSQL extensions. For quick local testing when you don't
# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal
@@ -135,7 +94,7 @@ ARG EXTENSIONS=all
# Layer "build-deps"
#
#########################################################################################
FROM $BASE_IMAGE_SHA AS build-deps
FROM debian:$DEBIAN_FLAVOR AS build-deps
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
@@ -144,7 +103,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
RUN case $DEBIAN_VERSION in \
@@ -168,7 +127,7 @@ RUN case $DEBIAN_VERSION in \
apt install --no-install-recommends --no-install-suggests -y \
ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
$VERSION_INSTALLS \
&& apt clean && rm -rf /var/lib/apt/lists/*
@@ -180,11 +139,11 @@ RUN case $DEBIAN_VERSION in \
#########################################################################################
FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION:?} postgres
COPY vendor/postgres-${PG_VERSION} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION:?}" != "v14" ]; then \
if [ "${PG_VERSION}" != "v14" ]; then \
# zstd is available only from PG15
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
fi && \
@@ -278,7 +237,7 @@ RUN case "${DEBIAN_VERSION}" in \
# Postgis 3.5.0 supports v17
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export POSTGIS_VERSION=3.5.0 \
export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \
@@ -353,7 +312,7 @@ FROM build-deps AS pgrouting-src
ARG DEBIAN_VERSION
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export PGROUTING_VERSION=3.6.2 \
export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \
@@ -399,7 +358,7 @@ COPY compute/patches/plv8-3.1.10.patch .
#
# Use new version only for v17
# because since v3.2, plv8 doesn't include plcoffee and plls extensions
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export PLV8_TAG=v3.2.3 \
;; \
@@ -413,7 +372,7 @@ RUN case "${PG_VERSION:?}" in \
git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
tar -czf plv8.tar.gz --exclude .git plv8-src && \
cd plv8-src && \
if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
FROM pg-build AS plv8-build
ARG PG_VERSION
@@ -433,7 +392,7 @@ RUN \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
# don't break computes with installed old version of plv8
cd /usr/local/pgsql/lib/ && \
case "${PG_VERSION:?}" in \
case "${PG_VERSION}" in \
"v17") \
ln -s plv8-3.2.3.so plv8-3.1.8.so && \
ln -s plv8-3.2.3.so plv8-3.1.5.so && \
@@ -770,7 +729,7 @@ FROM build-deps AS timescaledb-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export TIMESCALEDB_VERSION=2.10.1 \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
@@ -808,7 +767,7 @@ ARG PG_VERSION
# version-specific, has separate releases for each version
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14") \
export PG_HINT_PLAN_VERSION=14_1_4_1 \
export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
@@ -884,7 +843,7 @@ ARG PG_VERSION
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export RDKIT_VERSION=Release_2024_09_1 \
export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \
@@ -1011,7 +970,7 @@ ARG PG_VERSION
#
# last release v0.40.0 - Jul 22, 2024
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export SEMVER_VERSION=0.40.0 \
export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \
@@ -1047,7 +1006,7 @@ ARG PG_VERSION
# This is our extension, support stopped in favor of pgvector
# TODO: deprecate it
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
@@ -1080,7 +1039,7 @@ ARG PG_VERSION
# This is an experimental extension, never got to real production.
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in "v17") \
RUN case "${PG_VERSION}" in "v17") \
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
@@ -1132,7 +1091,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
FROM pg-build-nonroot-with-cargo AS rust-extensions-build
ARG PG_VERSION
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
'v17') \
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
esac && \
@@ -1311,7 +1270,7 @@ FROM build-deps AS pgx_ulid-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16") \
;; \
*) \
@@ -1343,7 +1302,7 @@ FROM build-deps AS pgx_ulid-pgrx12-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
;; \
*) \
@@ -1471,8 +1430,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
FROM build-deps AS pg_mooncake-src
ARG PG_VERSION
WORKDIR /ext-src
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \
echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \
echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \
chmod a+x neon-test.sh
@@ -1484,31 +1443,6 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
#########################################################################################
#
# Layer "pg-duckdb-pg-build"
# compile pg_duckdb extension
#
#########################################################################################
FROM build-deps AS pg_duckdb-src
WORKDIR /ext-src
COPY compute/patches/pg_duckdb_v031.patch .
# pg_duckdb build requires source dir to be a git repo to get submodules
# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
# - extension management function duckdb.install_extension()
# - access to duckdb.extensions table and its sequence
RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
cd pg_duckdb-src && \
git submodule update --init --recursive && \
patch -p1 < /ext-src/pg_duckdb_v031.patch
FROM pg-build AS pg_duckdb-build
ARG PG_VERSION
COPY --from=pg_duckdb-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_duckdb-src
RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
#########################################################################################
#
# Layer "pg_repack"
@@ -1622,7 +1556,6 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
#########################################################################################
@@ -1644,19 +1577,7 @@ ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot Cargo.lock Cargo.toml rust-toolchain.toml .
COPY .cargo .cargo
COPY .config .config
COPY compute_tools compute_tools
COPY control_plane control_plane
COPY libs libs
COPY pageserver pageserver
COPY proxy proxy
COPY storage_scrubber storage_scrubber
COPY safekeeper safekeeper
COPY storage_broker storage_broker
COPY storage_controller storage_controller
COPY workspace_hack workspace_hack
COPY --chown=nonroot . .
RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
--mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
--mount=type=cache,uid=1000,target=/home/nonroot/target \
@@ -1673,7 +1594,7 @@ RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
#
#########################################################################################
FROM $BASE_IMAGE_SHA AS pgbouncer
FROM debian:$DEBIAN_FLAVOR AS pgbouncer
RUN set -e \
&& echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
&& apt update \
@@ -1684,7 +1605,6 @@ RUN set -e \
autoconf \
automake \
libevent-dev \
libsystemd-dev \
libtool \
pkg-config \
&& apt clean && rm -rf /var/lib/apt/lists/*
@@ -1695,7 +1615,7 @@ RUN set -e \
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
&& cd pgbouncer \
&& ./autogen.sh \
&& ./configure --prefix=/usr/local/pgbouncer --with-systemd --without-openssl \
&& ./configure --prefix=/usr/local/pgbouncer --without-openssl \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
@@ -1704,12 +1624,13 @@ RUN set -e \
# Layer "exporters"
#
#########################################################################################
FROM build-deps AS exporters
FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters
ARG TARGETARCH
# Keep sql_exporter version same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regading `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\
RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \
if [ "$TARGETARCH" = "amd64" ]; then\
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
@@ -1733,7 +1654,7 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\
# Layer "awscli"
#
#########################################################################################
FROM build-deps AS awscli
FROM alpine/curl:${ALPINE_CURL_VERSION} AS awscli
ARG TARGETARCH
RUN set -ex; \
if [ "${TARGETARCH}" = "amd64" ]; then \
@@ -1751,50 +1672,6 @@ RUN set -ex; \
/tmp/awscliv2/aws/install; \
rm -rf /tmp/awscliv2.zip /tmp/awscliv2
#########################################################################################
#
# Layer "cgroup-tools"
#
#########################################################################################
# Build cgroup-tools
#
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
# requires cgroup v2, so we'll build cgroup-tools ourselves.
#
# At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2,
# and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset
# for debian version migration.
#
FROM debian:bookworm-slim as cgroup-tools
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \
&& apt update \
&& apt install --no-install-recommends -y \
git \
ca-certificates \
automake \
cmake \
make \
gcc \
byacc \
flex \
libtool \
libpam0g-dev \
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
&& INSTALL_DIR="/libcgroup-install" \
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
&& cd libcgroup \
# extracted from bootstrap.sh, with modified flags:
&& (test -d m4 || mkdir m4) \
&& autoreconf -fi \
&& rm -rf autom4te.cache \
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
# actually build the thing...
&& make install
#########################################################################################
#
# Clean up postgres folder before inclusion
@@ -1827,7 +1704,7 @@ USER nonroot
COPY --chown=nonroot compute compute
RUN make PG_VERSION="${PG_VERSION:?}" -C compute
RUN make PG_VERSION="${PG_VERSION}" -C compute
#########################################################################################
#
@@ -1853,15 +1730,15 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/
COPY --from=hypopg-src /ext-src/ /ext-src/
COPY --from=pg_hashids-src /ext-src/ /ext-src/
COPY --from=rum-src /ext-src/ /ext-src/
COPY --from=pgtap-src /ext-src/ /ext-src/
#COPY --from=pgtap-src /ext-src/ /ext-src/
COPY --from=ip4r-src /ext-src/ /ext-src/
COPY --from=prefix-src /ext-src/ /ext-src/
COPY --from=hll-src /ext-src/ /ext-src/
COPY --from=plpgsql_check-src /ext-src/ /ext-src/
#COPY --from=timescaledb-src /ext-src/ /ext-src/
COPY --from=pg_hint_plan-src /ext-src/ /ext-src/
COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch
COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
COPY --from=pg_cron-src /ext-src/ /ext-src/
#COPY --from=pgx_ulid-src /ext-src/ /ext-src/
#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/
@@ -1886,19 +1763,58 @@ ENV PGDATABASE=postgres
#########################################################################################
#
# Target: compute-node
#
# Put it all together into the final 'compute-node' image. It can be executed directly
# with docker, to run the 'compute_ctl'. The other services will not be launched in
# that case.
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM $BASE_IMAGE_SHA as compute-node-build
FROM debian:$DEBIAN_FLAVOR
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
# create folder for file cache
mkdir -p -m 777 /neon/cache
# aws cli is used by fast_import
COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import
# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
# local_proxy and its config
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy
RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
# Metrics exporter binaries and configuration files
COPY --from=exporters ./postgres_exporter /bin/postgres_exporter
COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter
COPY --from=exporters ./sql_exporter /bin/sql_exporter
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install:
# libreadline8 for psql
# liblz4-1 for lz4
@@ -1909,8 +1825,10 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
# libevent for pgbouncer
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc
RUN apt update && \
case $DEBIAN_VERSION in \
# Version-specific installs for Bullseye (PG14-PG16):
@@ -1950,145 +1868,9 @@ RUN apt update && \
procps \
ca-certificates \
$VERSION_INSTALLS && \
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
# create folder for file cache
mkdir -p -m 777 /neon/cache && \
# Create remote extension download directory
mkdir /usr/local/download_extensions && \
chown -R postgres:postgres /usr/local/download_extensions
# aws cli is used by fast_import
COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli
# locally built cgroup-tools
COPY --from=cgroup-tools /libcgroup-install/bin/* /usr/local/bin/
COPY --from=cgroup-tools /libcgroup-install/lib/* /usr/local/lib/
COPY --from=cgroup-tools /libcgroup-install/sbin/* /usr/local/sbin/
COPY --chmod=0644 compute/etc/cgconfig.conf /etc/cgconfig.conf
# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import
# local_proxy and its config
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy
RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
# Metrics exporter binaries and configuration files
COPY --from=exporters ./postgres_exporter /bin/postgres_exporter
COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter
COPY --from=exporters ./sql_exporter /bin/sql_exporter
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
# Make the libraries we built available
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
FROM compute-node-build as compute-node
ARG DEBIAN_VERSION
RUN apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# If this image is executed as a stand-alone docker container, these are used.
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
#########################################################################################
#
# Target: compute-node-bootable
#
# A "bootable" image which includes systemd, configured to launch all the services.
#
# For testing purposes, this can be run directly with docker:
#
# docker run --name=compute-node --privileged neondatabase/compute-node-bootable:local /sbin/init
#
#########################################################################################
FROM compute-node-build as compute-node-bootable
# dbus is required so that you can "machinectl shell" into this when run in an systemd-nspawn
# container
RUN apt install --no-install-recommends -y \
systemd \
systemd-sysv \
dbus && \
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
## copy systemd unit files for the services and enable them
COPY compute/etc/systemd/ /etc/systemd
RUN systemctl enable \
systemd-networkd.service \
pgbouncer \
postgres_exporter sql_exporter sql_exporter-autoscaling \
local_proxy \
compute_ctl \
chown-pgdata \
make-cgroup-procs-writable \
load-cgconfig.service
ENTRYPOINT ["/sbin/init"]
#########################################################################################
#
# Target: compute-node-neonvm-payload
#
# Contains 'compute-node-bootable', as a QCOW2 disk image, suitable for booting with
# neonvm-guest
#
#########################################################################################
# Wrap the same in a QCOW2 image
FROM debian:bookworm-slim AS compute-node-neonvm-payload-build
ARG DISK_SIZE=5G
# tools for qemu disk creation. procps is for sysctl, needed because neonvm-controller
# launches this in an init container that runs sysctl.
RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
qemu-utils \
e2fsprogs \
procps
COPY --from=compute-node-bootable / /rootdisk/
RUN set -e \
&& mkfs.ext4 -L neonvm-payload -d /rootdisk /disk.raw ${DISK_SIZE} \
&& qemu-img convert -f raw -O qcow2 -o cluster_size=2M,lazy_refcounts=on /disk.raw /neonvm-payload.qcow2
FROM debian:bookworm-slim AS compute-node-neonvm-payload
ARG DISK_SIZE=5G
ARG DISK_SIZE=5G
# procps is for sysctl, needed because neonvm-controller launches this in an init
# container that runs sysctl.
RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
procps
COPY --from=compute-node-neonvm-payload-build /neonvm-payload.qcow2 /
RUN apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
#########################################################################################
#
# make 'compute-node' the default target
#
#########################################################################################
FROM compute-node

View File

@@ -1,12 +0,0 @@
# Configuration for cgroups in VM compute nodes
group neon-postgres {
perm {
admin {
uid = postgres;
}
task {
gid = users;
}
}
memory {}
}

View File

@@ -1,12 +0,0 @@
# When running under neonvm-guest a separate disk is mounted to
# /var/db/postgres/compute. Make it owned by the postgres user.
[Unit]
Description=Change owner of /var/db/postgres/compute to postgres
[Service]
Type=oneshot
ExecStart=chown postgres:postgres /var/db/postgres/compute
RemainAfterExit=yes
[Install]
WantedBy=compute_ctl.service

View File

@@ -1,16 +0,0 @@
[Unit]
Description=Neon PostgreSQL launcher tool
After=network-online.target
Wants=network-online.target
[Service]
Type=exec
User=postgres
# neonvm-runner mounts and populates this directory based on the k8s VM spec
EnvironmentFile=/neonvm/runtime/command.env
ExecStart=/usr/local/bin/compute_ctl $COMPUTE_CTL_ARGS
Restart=on-failure
Delegate=yes
[Install]
WantedBy=multi-user.target

View File

@@ -1,10 +0,0 @@
[Unit]
Description=Create neonvm-postgres cgroup
[Service]
Type=oneshot
ExecStart=cgconfigparser -l /etc/cgconfig.conf
RemainAfterExit=yes
[Install]
WantedBy=compute_ctl.service

View File

@@ -1,13 +0,0 @@
[Unit]
Description=Neon local proxy
After=network-online.target
Wants=network-online.target
[Service]
Type=notify
User=postgres
ExecStart=/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432
Restart=on-failure
[Install]
WantedBy=multi-user.target

View File

@@ -1,20 +0,0 @@
# Allow all users to move processes to/from the root cgroup.
#
# This is required in order to be able to 'cgexec' anything, if the entrypoint is not being run as
# root, because moving tasks between one cgroup and another *requires write access to the
# cgroup.procs file of the common ancestor*, and because the entrypoint isn't already in a cgroup,
# any new tasks are automatically placed in the top-level cgroup.
#
# This *would* be bad for security, if we relied on cgroups for security; but instead because they
# are just used for cooperative signaling, this should be mostly ok.
[Unit]
Description=Allow all users to move processes to/from the root cgroup.
[Service]
Type=oneshot
ExecStart=chmod go+w /sys/fs/cgroup/cgroup.procs
RemainAfterExit=yes
[Install]
WantedBy=compute_ctl.service

View File

@@ -1,45 +0,0 @@
# Example systemd service unit for PgBouncer
#
# - Adjust the paths in ExecStart for your installation.
#
# - For systemd 253 and later, PgBouncer supports Type=notify-reload
# (instead of Type=notify with ExecReload= command).
#
# - The User setting requires careful consideration. PgBouncer needs
# to be able to place a Unix-domain socket file where PostgreSQL
# clients will look for it. In the olden days, this was in /tmp,
# but systems using systemd now prefer something like
# /var/run/postgresql/. But then some systems also lock down that
# directory so that only the postgres user can write to it. That
# means you need to either
#
# - run PgBouncer as the postgres user, or
#
# - create a separate user and add it to the postgres group and
# make /var/run/postgresql/ group-writable, or
#
# - use systemd to create the sockets; see pgbouncer.socket nearby.
#
# For packagers and deployment systems, this requires some
# coordination between the PgBouncer and the PostgreSQL
# packages/components.
#
[Unit]
Description=connection pooler for PostgreSQL
Documentation=man:pgbouncer(1)
Documentation=https://www.pgbouncer.org/
After=network-online.target
Wants=network-online.target
#Requires=pgbouncer.socket
[Service]
Type=notify
User=postgres
ExecStart=/usr/local/bin/pgbouncer /etc/pgbouncer.ini
ExecReload=/bin/kill -HUP $MAINPID
KillSignal=SIGINT
Restart=on-failure
#LimitNOFILE=1024
[Install]
WantedBy=multi-user.target

View File

@@ -1,14 +0,0 @@
[Unit]
Description=Postgres metrics exporter
After=network-online.target
Wants=network-online.target
[Service]
Type=notify
User=nobody
Environment=DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter"
ExecStart=/bin/postgres_exporter --config.file=/etc/postgres_exporter.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target

View File

@@ -1,13 +0,0 @@
[Unit]
Description=SQL metrics exporter (autoscaling)
After=network-online.target
Wants=network-online.target
[Service]
Type=notify
User=nobody
ExecStart=/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499
Restart=on-failure
[Install]
WantedBy=multi-user.target

View File

@@ -1,13 +0,0 @@
[Unit]
Description=SQL metrics exporter
After=network-online.target
Wants=network-online.target
[Service]
Type=notify
User=nobody
ExecStart=/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399
Restart=on-failure
[Install]
WantedBy=multi-user.target

View File

@@ -1,11 +0,0 @@
diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
index d777d76..af60106 100644
--- a/sql/pg_duckdb--0.2.0--0.3.0.sql
+++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;

View File

@@ -6,16 +6,16 @@ index da723b8..5328114 100644
----
-- No.A-1-1-3
CREATE EXTENSION pg_hint_plan;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
-- No.A-1-2-3
DROP EXTENSION pg_hint_plan;
-- No.A-1-1-4
CREATE SCHEMA other_schema;
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
CREATE EXTENSION pg_hint_plan;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
DROP SCHEMA other_schema;
----
---- No. A-5-1 comment pattern
@@ -35,7 +35,7 @@ index d372459..6282afe 100644
SET client_min_messages TO LOG;
SET pg_hint_plan.enable_hint TO on;
CREATE EXTENSION file_fdw;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');

View File

@@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644
----
-- No.A-1-1-3
CREATE EXTENSION pg_hint_plan;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
-- No.A-1-2-3
DROP EXTENSION pg_hint_plan;
-- No.A-1-1-4
CREATE SCHEMA other_schema;
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
CREATE EXTENSION pg_hint_plan;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
DROP SCHEMA other_schema;
----
---- No. A-5-1 comment pattern
@@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644
SET client_min_messages TO LOG;
SET pg_hint_plan.enable_hint TO on;
CREATE EXTENSION file_fdw;
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');

View File

@@ -74,8 +74,8 @@ build: |
# At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2,
# and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset
# for debian version migration.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder
#
FROM debian:bookworm-slim as libcgroup-builder
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \

View File

@@ -47,9 +47,7 @@ files:
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
#
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -70,8 +68,7 @@ build: |
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
# requires cgroup v2, so we'll build cgroup-tools ourselves.
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder
FROM debian:bullseye-slim as libcgroup-builder
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \

View File

@@ -24,10 +24,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
http-body-util.workspace = true
hyper-util.workspace = true
hyper.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true

View File

@@ -47,15 +47,14 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Parser;
use compute_tools::http::server::Server;
use compute_tools::disk_quota::set_disk_quota;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use compute_tools::neonvmd_client::{resize_swap, set_disk_quota};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
use url::Url;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
@@ -63,10 +62,12 @@ use compute_tools::compute::{
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{setrlimit, Resource};
use utils::failpoint_support;
@@ -107,20 +108,8 @@ struct Cli {
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
pub remote_ext_config: Option<String>,
/// The port to bind the external listening HTTP server to. Clients running
/// outside the compute will talk to the compute through this port. Keep
/// the previous name for this argument around for a smoother release
/// with the control plane.
///
/// TODO: Remove the alias after the control plane release which teaches the
/// control plane about the renamed argument.
#[arg(long, alias = "http-port", default_value_t = 3080)]
pub external_http_port: u16,
/// The port to bind the internal listening HTTP server to. Clients like
/// the neon extension (for installing remote extensions) and local_proxy.
#[arg(long)]
pub internal_http_port: Option<u16>,
#[arg(long, default_value_t = 3080)]
pub http_port: u16,
#[arg(short = 'D', long, value_name = "DATADIR")]
pub pgdata: String,
@@ -146,7 +135,6 @@ struct Cli {
#[arg(long, action = clap::ArgAction::SetTrue)]
pub resize_swap_on_bind: bool,
/// This is no longer used for anything. It's kept for now just for backwards-compatibility.
#[arg(long)]
pub set_disk_quota_for_fs: Option<String>,
@@ -281,7 +269,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: false,
});
}
@@ -291,7 +278,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: true,
});
}
@@ -301,9 +287,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
};
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(resp) => Ok(CliSpecParams {
spec: resp.0,
compute_ctl_config: resp.1,
Ok(spec) => Ok(CliSpecParams {
spec,
live_config_allowed: true,
}),
Err(e) => {
@@ -320,8 +305,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
#[allow(dead_code)]
compute_ctl_config: ComputeCtlConfig,
live_config_allowed: bool,
}
@@ -331,7 +314,6 @@ fn wait_spec(
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();
@@ -358,8 +340,7 @@ fn wait_spec(
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1),
http_port: cli.http_port,
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
@@ -377,13 +358,9 @@ fn wait_spec(
compute.prewarm_postgres()?;
}
// Launch the external HTTP server first, so that we can serve control plane
// requests while configuration is still in progress.
Server::External(cli.external_http_port).launch(&compute);
// The internal HTTP server could be launched later, but there isn't much
// sense in waiting.
Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute);
// Launch http service first, so that we can serve control-plane requests
// while configuration is still in progress.
let _http_handle = launch_http_server(cli.http_port, &compute);
if !spec_set {
// No spec provided, hang waiting for it.
@@ -474,8 +451,10 @@ fn start_postgres(
}
// Set disk quota if the compute spec says so
if let Some(disk_quota_bytes) = disk_quota_bytes {
match set_disk_quota(disk_quota_bytes) {
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
(disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
{
match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");

View File

@@ -25,10 +25,10 @@
//! docker push localhost:3030/localregistry/compute-node-v14:latest
//! ```
use anyhow::{bail, Context};
use anyhow::Context;
use aws_config::BehaviorVersion;
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use clap::Parser;
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{error, info, info_span, warn, Instrument};
@@ -44,59 +44,22 @@ mod s3_uri;
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
#[derive(Subcommand, Debug)]
enum Command {
/// Runs local postgres (neon binary), restores into it,
/// uploads pgdata to s3 to be consumed by pageservers
Pgdata {
/// Raw connection string to the source database. Used only in tests,
/// real scenario uses encrypted connection string in spec.json from s3.
#[clap(long)]
source_connection_string: Option<String>,
/// If specified, will not shut down the local postgres after the import. Used in local testing
#[clap(short, long)]
interactive: bool,
/// Port to run postgres on. Default is 5432.
#[clap(long, default_value_t = 5432)]
pg_port: u16, // port to run postgres on, 5432 is default
/// Number of CPUs in the system. This is used to configure # of
/// parallel worker processes, for index creation.
#[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
num_cpus: Option<usize>,
/// Amount of RAM in the system. This is used to configure shared_buffers
/// and maintenance_work_mem.
#[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
memory_mb: Option<usize>,
},
/// Runs pg_dump-pg_restore from source to destination without running local postgres.
DumpRestore {
/// Raw connection string to the source database. Used only in tests,
/// real scenario uses encrypted connection string in spec.json from s3.
#[clap(long)]
source_connection_string: Option<String>,
/// Raw connection string to the destination database. Used only in tests,
/// real scenario uses encrypted connection string in spec.json from s3.
#[clap(long)]
destination_connection_string: Option<String>,
},
}
#[derive(clap::Parser)]
struct Args {
#[clap(long, env = "NEON_IMPORTER_WORKDIR")]
#[clap(long)]
working_directory: Utf8PathBuf,
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
s3_prefix: Option<s3_uri::S3Uri>,
#[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")]
#[clap(long)]
source_connection_string: Option<String>,
#[clap(short, long)]
interactive: bool,
#[clap(long)]
pg_bin_dir: Utf8PathBuf,
#[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")]
#[clap(long)]
pg_lib_dir: Utf8PathBuf,
#[clap(subcommand)]
command: Command,
#[clap(long)]
pg_port: Option<u16>, // port to run postgres on, 5432 is default
}
#[serde_with::serde_as]
@@ -105,8 +68,6 @@ struct Spec {
encryption_secret: EncryptionSecret,
#[serde_as(as = "serde_with::base64::Base64")]
source_connstring_ciphertext_base64: Vec<u8>,
#[serde_as(as = "Option<serde_with::base64::Base64>")]
destination_connstring_ciphertext_base64: Option<Vec<u8>>,
}
#[derive(serde::Deserialize)]
@@ -122,150 +83,180 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
"C.UTF-8"
};
async fn decode_connstring(
kms_client: &aws_sdk_kms::Client,
key_id: &String,
connstring_ciphertext_base64: Vec<u8>,
) -> Result<String, anyhow::Error> {
let mut output = kms_client
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt connection string")?;
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
utils::logging::LogFormat::Plain,
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::Output::Stdout,
)?;
let plaintext = output
.plaintext
.take()
.context("get plaintext connection string")?;
info!("starting");
String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8")
}
let args = Args::parse();
struct PostgresProcess {
pgdata_dir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pgbin: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
postgres_proc: Option<tokio::process::Child>,
}
impl PostgresProcess {
fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self {
Self {
pgdata_dir,
pgbin: pg_bin_dir.join("postgres"),
pg_bin_dir,
pg_lib_dir,
postgres_proc: None,
}
// Validate arguments
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
}
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
}
async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> {
tokio::fs::create_dir(&self.pgdata_dir)
.await
.context("create pgdata directory")?;
let working_directory = args.working_directory;
let pg_bin_dir = args.pg_bin_dir;
let pg_lib_dir = args.pg_lib_dir;
let pg_port = args.pg_port.unwrap_or_else(|| {
info!("pg_port not specified, using default 5432");
5432
});
let pg_version = match get_pg_version(self.pgbin.as_ref()) {
PostgresMajorVersion::V14 => 14,
PostgresMajorVersion::V15 => 15,
PostgresMajorVersion::V16 => 16,
PostgresMajorVersion::V17 => 17,
// Initialize AWS clients only if s3_prefix is specified
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms = aws_sdk_kms::Client::new(&config);
(Some(config), Some(kms))
} else {
(None, None)
};
// Get source connection string either from S3 spec or direct argument
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
};
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser: initdb_user,
locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
pg_version,
initdb_bin: self.pg_bin_dir.join("initdb").as_ref(),
library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
pgdata: &self.pgdata_dir,
})
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.unwrap()
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
} else {
args.source_connection_string.unwrap()
};
match tokio::fs::create_dir(&working_directory).await {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
if !is_directory_empty(&working_directory)
.await
.context("check if working directory is empty")?
{
anyhow::bail!("working directory is not empty");
} else {
// ok
}
}
Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
}
let pgdata_dir = working_directory.join("pgdata");
tokio::fs::create_dir(&pgdata_dir)
.await
.context("initdb")
}
.context("create pgdata directory")?;
async fn start(
&mut self,
initdb_user: &str,
port: u16,
nproc: usize,
memory_mb: usize,
) -> Result<&tokio::process::Child, anyhow::Error> {
self.prepare(initdb_user).await?;
let pgbin = pg_bin_dir.join("postgres");
let pg_version = match get_pg_version(pgbin.as_ref()) {
PostgresMajorVersion::V14 => 14,
PostgresMajorVersion::V15 => 15,
PostgresMajorVersion::V16 => 16,
PostgresMajorVersion::V17 => 17,
};
let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser,
locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
pg_version,
initdb_bin: pg_bin_dir.join("initdb").as_ref(),
library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
pgdata: &pgdata_dir,
})
.await
.context("initdb")?;
// Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
// maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
// available for misc other stuff that PostgreSQL uses memory for.
let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
let nproc = num_cpus::get();
//
// Launch postgres process
//
let mut proc = tokio::process::Command::new(&self.pgbin)
.arg("-D")
.arg(&self.pgdata_dir)
.args(["-p", &format!("{port}")])
.args(["-c", "wal_level=minimal"])
.args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
.args(["-c", "max_wal_senders=0"])
.args(["-c", "fsync=off"])
.args(["-c", "full_page_writes=off"])
.args(["-c", "synchronous_commit=off"])
.args([
"-c",
&format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
])
.args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args(["-c", "effective_io_concurrency=100"])
.env_clear()
.env("LD_LIBRARY_PATH", &self.pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn postgres")?;
info!("spawned postgres, waiting for it to become ready");
tokio::spawn(
child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take())
.instrument(info_span!("postgres")),
);
self.postgres_proc = Some(proc);
Ok(self.postgres_proc.as_ref().unwrap())
}
async fn shutdown(&mut self) -> Result<(), anyhow::Error> {
let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap();
info!("shutdown postgres");
nix::sys::signal::kill(
Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")),
nix::sys::signal::SIGTERM,
//
// Launch postgres process
//
let mut postgres_proc = tokio::process::Command::new(pgbin)
.arg("-D")
.arg(&pgdata_dir)
.args(["-p", &format!("{pg_port}")])
.args(["-c", "wal_level=minimal"])
.args(["-c", "shared_buffers=10GB"])
.args(["-c", "max_wal_senders=0"])
.args(["-c", "fsync=off"])
.args(["-c", "full_page_writes=off"])
.args(["-c", "synchronous_commit=off"])
.args(["-c", "maintenance_work_mem=8388608"])
.args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args([
"-c",
&format!(
"effective_io_concurrency={}",
if cfg!(target_os = "macos") { 0 } else { 100 }
),
])
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.env(
"ASAN_OPTIONS",
std::env::var("ASAN_OPTIONS").unwrap_or_default(),
)
.context("signal postgres to shut down")?;
proc.wait()
.await
.context("wait for postgres to shut down")
.map(|_| ())
}
}
.env(
"UBSAN_OPTIONS",
std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
.context("spawn postgres")?;
info!("spawned postgres, waiting for it to become ready");
tokio::spawn(
child_stdio_to_log::relay_process_output(
postgres_proc.stdout.take(),
postgres_proc.stderr.take(),
)
.instrument(info_span!("postgres")),
);
async fn wait_until_ready(connstring: String, create_dbname: String) {
// Create neondb database in the running postgres
let restore_pg_connstring =
format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
let start_time = std::time::Instant::now();
loop {
@@ -276,12 +267,7 @@ async fn wait_until_ready(connstring: String, create_dbname: String) {
std::process::exit(1);
}
match tokio_postgres::connect(
&connstring.replace("dbname=neondb", "dbname=postgres"),
tokio_postgres::NoTls,
)
.await
{
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
Ok((client, connection)) => {
// Spawn the connection handling task to maintain the connection
tokio::spawn(async move {
@@ -290,12 +276,9 @@ async fn wait_until_ready(connstring: String, create_dbname: String) {
}
});
match client
.simple_query(format!("CREATE DATABASE {create_dbname};").as_str())
.await
{
match client.simple_query("CREATE DATABASE neondb;").await {
Ok(_) => {
info!("created {} database", create_dbname);
info!("created neondb database");
break;
}
Err(e) => {
@@ -319,16 +302,10 @@ async fn wait_until_ready(connstring: String, create_dbname: String) {
}
}
}
}
async fn run_dump_restore(
workdir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
source_connstring: String,
destination_connstring: String,
) -> Result<(), anyhow::Error> {
let dumpdir = workdir.join("dumpdir");
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
let dumpdir = working_directory.join("dumpdir");
let common_args = [
// schema mapping (prob suffices to specify them on one side)
@@ -357,7 +334,7 @@ async fn run_dump_restore(
.arg("--no-sync")
// POSITIONAL args
// source db (db name included in connection string)
.arg(&source_connstring)
.arg(&source_connection_string)
// how we run it
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
@@ -377,18 +354,19 @@ async fn run_dump_restore(
let st = pg_dump.wait().await.context("wait for pg_dump")?;
info!(status=?st, "pg_dump exited");
if !st.success() {
error!(status=%st, "pg_dump failed, restore will likely fail as well");
bail!("pg_dump failed");
warn!(status=%st, "pg_dump failed, restore will likely fail as well");
}
}
// TODO: maybe do it in a streaming way, plenty of internal research done on this already
// TODO: do it in a streaming way, plenty of internal research done on this already
// TODO: do the unlogged table trick
info!("restore from working directory into vanilla postgres");
{
let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore"))
.args(&common_args)
.arg("-d")
.arg(&destination_connstring)
.arg(&restore_pg_connstring)
// POSITIONAL args
.arg(&dumpdir)
// how we run it
@@ -411,82 +389,33 @@ async fn run_dump_restore(
let st = pg_restore.wait().await.context("wait for pg_restore")?;
info!(status=?st, "pg_restore exited");
if !st.success() {
error!(status=%st, "pg_restore failed, restore will likely fail as well");
bail!("pg_restore failed");
warn!(status=%st, "pg_restore failed, restore will likely fail as well");
}
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
async fn cmd_pgdata(
kms_client: Option<aws_sdk_kms::Client>,
maybe_s3_prefix: Option<s3_uri::S3Uri>,
maybe_spec: Option<Spec>,
source_connection_string: Option<String>,
interactive: bool,
pg_port: u16,
workdir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
num_cpus: Option<usize>,
memory_mb: Option<usize>,
) -> Result<(), anyhow::Error> {
if maybe_spec.is_none() && source_connection_string.is_none() {
bail!("spec must be provided for pgdata command");
}
if maybe_spec.is_some() && source_connection_string.is_some() {
bail!("only one of spec or source_connection_string can be provided");
}
let source_connection_string = if let Some(spec) = maybe_spec {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
decode_connstring(
kms_client.as_ref().unwrap(),
&key_id,
spec.source_connstring_ciphertext_base64,
)
.await?
}
}
} else {
source_connection_string.unwrap()
};
let superuser = "cloud_admin";
let destination_connstring = format!(
"host=localhost port={} user={} dbname=neondb",
pg_port, superuser
);
let pgdata_dir = workdir.join("pgdata");
let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone());
let nproc = num_cpus.unwrap_or_else(num_cpus::get);
let memory_mb = memory_mb.unwrap_or(256);
proc.start(superuser, pg_port, nproc, memory_mb).await?;
wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await;
run_dump_restore(
workdir.clone(),
pg_bin_dir,
pg_lib_dir,
source_connection_string,
destination_connstring,
)
.await?;
// If interactive mode, wait for Ctrl+C
if interactive {
if args.interactive {
info!("Running in interactive mode. Press Ctrl+C to shut down.");
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
}
proc.shutdown().await?;
info!("shutdown postgres");
{
nix::sys::signal::kill(
Pid::from_raw(
i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"),
),
nix::sys::signal::SIGTERM,
)
.context("signal postgres to shut down")?;
postgres_proc
.wait()
.await
.context("wait for postgres to shut down")?;
}
// Only sync if s3_prefix was specified
if let Some(s3_prefix) = maybe_s3_prefix {
if let Some(s3_prefix) = args.s3_prefix {
info!("upload pgdata");
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
.await
@@ -494,7 +423,7 @@ async fn cmd_pgdata(
info!("write status");
{
let status_dir = workdir.join("status");
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
@@ -507,153 +436,3 @@ async fn cmd_pgdata(
Ok(())
}
async fn cmd_dumprestore(
kms_client: Option<aws_sdk_kms::Client>,
maybe_spec: Option<Spec>,
source_connection_string: Option<String>,
destination_connection_string: Option<String>,
workdir: Utf8PathBuf,
pg_bin_dir: Utf8PathBuf,
pg_lib_dir: Utf8PathBuf,
) -> Result<(), anyhow::Error> {
let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let source = decode_connstring(
kms_client.as_ref().unwrap(),
&key_id,
spec.source_connstring_ciphertext_base64,
)
.await?;
let dest = if let Some(dest_ciphertext) =
spec.destination_connstring_ciphertext_base64
{
decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext)
.await?
} else {
bail!("destination connection string must be provided in spec for dump_restore command");
};
(source, dest)
}
}
} else {
(
source_connection_string.unwrap(),
if let Some(val) = destination_connection_string {
val
} else {
bail!("destination connection string must be provided for dump_restore command");
},
)
};
run_dump_restore(
workdir,
pg_bin_dir,
pg_lib_dir,
source_connstring,
destination_connstring,
)
.await
}
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
utils::logging::LogFormat::Json,
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
utils::logging::Output::Stdout,
)?;
info!("starting");
let args = Args::parse();
// Initialize AWS clients only if s3_prefix is specified
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms = aws_sdk_kms::Client::new(&config);
(Some(config), Some(kms))
} else {
(None, None)
};
let spec: Option<Spec> = if let Some(s3_prefix) = &args.s3_prefix {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
} else {
None
};
match tokio::fs::create_dir(&args.working_directory).await {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
if !is_directory_empty(&args.working_directory)
.await
.context("check if working directory is empty")?
{
bail!("working directory is not empty");
} else {
// ok
}
}
Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
}
match args.command {
Command::Pgdata {
source_connection_string,
interactive,
pg_port,
num_cpus,
memory_mb,
} => {
cmd_pgdata(
kms_client,
args.s3_prefix,
spec,
source_connection_string,
interactive,
pg_port,
args.working_directory,
args.pg_bin_dir,
args.pg_lib_dir,
num_cpus,
memory_mb,
)
.await?;
}
Command::DumpRestore {
source_connection_string,
destination_connection_string,
} => {
cmd_dumprestore(
kms_client,
spec,
source_connection_string,
destination_connection_string,
args.working_directory,
args.pg_bin_dir,
args.pg_lib_dir,
)
.await?;
}
}
Ok(())
}

View File

@@ -82,10 +82,8 @@ pub struct ComputeNode {
/// - we push spec and it does configuration
/// - but then it is restarted without any spec again
pub live_config_allowed: bool,
/// The port that the compute's external HTTP server listens on
pub external_http_port: u16,
/// The port that the compute's internal HTTP server listens on
pub internal_http_port: u16,
/// The port that the compute's HTTP server listens on
pub http_port: u16,
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
/// To allow HTTP API server to serving status requests, while configuration
/// is in progress, lock should be held only for short periods of time to do
@@ -633,7 +631,7 @@ impl ComputeNode {
config::write_postgres_conf(
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
self.internal_http_port,
self.http_port,
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1398,29 +1396,28 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
if !spec.skip_pg_catalog_updates {
let max_concurrent_connections = spec.reconfigure_concurrency;
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:
// creating new extensions, roles, etc.
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
self.pg_reload_conf()?;
let max_concurrent_connections = spec.reconfigure_concurrency;
if spec.mode == ComputeMode::Primary {
let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
conf.application_name("apply_config");
let conf = Arc::new(conf);
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:
// creating new extensions, roles, etc.
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
self.pg_reload_conf()?;
let spec = Arc::new(spec.clone());
if spec.mode == ComputeMode::Primary {
let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
conf.application_name("apply_config");
let conf = Arc::new(conf);
self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
}
let spec = Arc::new(spec.clone());
Ok(())
})?;
}
self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
}
Ok(())
})?;
self.pg_reload_conf()?;

View File

@@ -0,0 +1,25 @@
use anyhow::Context;
pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
let size_kb = size_bytes / 1024;
// run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(DISK_QUOTA_BIN)
.arg(size_kb.to_string())
.arg(fs_mountpoint)
.spawn();
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => Err(anyhow::anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
}

View File

@@ -4,9 +4,11 @@ use http::{header::CONTENT_TYPE, StatusCode};
use serde::Serialize;
use tracing::error;
pub use server::launch_http_server;
mod extract;
mod routes;
pub mod server;
mod server;
/// Convenience response builder for JSON responses
struct JsonResponse;

View File

@@ -1,21 +1,7 @@
use axum::response::{IntoResponse, Response};
use http::StatusCode;
use serde::{Deserialize, Serialize};
use tracing::info;
use utils::failpoint_support::apply_failpoint;
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
use crate::http::{extract::Json, JsonResponse};

View File

@@ -1,11 +1,9 @@
use std::{
fmt::Display,
net::{IpAddr, Ipv6Addr, SocketAddr},
sync::Arc,
time::Duration,
};
use anyhow::Result;
use axum::{
extract::Request,
middleware::{self, Next},
@@ -26,65 +24,45 @@ use super::routes::{
};
use crate::compute::ComputeNode;
async fn handle_404() -> Response {
StatusCode::NOT_FOUND.into_response()
}
const X_REQUEST_ID: &str = "x-request-id";
/// `compute_ctl` has two servers: internal and external. The internal server
/// binds to the loopback interface and handles communication from clients on
/// the compute. The external server is what receives communication from the
/// control plane, the metrics scraper, etc. We make the distinction because
/// certain routes in `compute_ctl` only need to be exposed to local processes
/// like Postgres via the neon extension and local_proxy.
#[derive(Clone, Copy, Debug)]
pub enum Server {
Internal(u16),
External(u16),
}
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
impl Display for Server {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Server::Internal(_) => f.write_str("internal"),
Server::External(_) => f.write_str("external"),
}
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}
impl From<Server> for Router<Arc<ComputeNode>> {
fn from(server: Server) -> Self {
let mut router = Router::<Arc<ComputeNode>>::new();
router = match server {
Server::Internal(_) => {
router = router
.route(
"/extension_server/{*filename}",
post(extension_server::download_extension),
)
.route("/extensions", post(extensions::install_extension))
.route("/grants", post(grants::add_grant));
// Add in any testing support
if cfg!(feature = "testing") {
use super::routes::failpoints;
router = router.route("/failpoints", post(failpoints::configure_failpoints));
}
router
}
Server::External(_) => router
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route("/insights", get(insights::get_insights))
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate)),
};
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
/// Run the HTTP server and wait on it forever.
async fn serve(port: u16, compute: Arc<ComputeNode>) {
let mut app = Router::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route(
"/extension_server/{*filename}",
post(extension_server::download_extension),
)
.route("/extensions", post(extensions::install_extension))
.route("/grants", post(grants::add_grant))
.route("/insights", get(insights::get_insights))
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.fallback(handle_404)
.layer(
ServiceBuilder::new()
// Add this middleware since we assume the request ID exists
.layer(middleware::from_fn(maybe_add_request_id_header))
@@ -124,88 +102,43 @@ impl From<Server> for Router<Arc<ComputeNode>> {
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
}
}
.with_state(compute);
impl Server {
async fn handle_404() -> impl IntoResponse {
StatusCode::NOT_FOUND
// Add in any testing support
if cfg!(feature = "testing") {
use super::routes::failpoints;
app = app.route("/failpoints", post(failpoints::configure_failpoints))
}
async fn handle_405() -> impl IntoResponse {
StatusCode::METHOD_NOT_ALLOWED
}
async fn listener(&self) -> Result<TcpListener> {
let addr = SocketAddr::new(self.ip(), self.port());
let listener = TcpListener::bind(&addr).await?;
Ok(listener)
}
fn ip(&self) -> IpAddr {
match self {
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
// allow binding to localhost
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
}
}
fn port(self) -> u16 {
match self {
Server::Internal(port) => port,
Server::External(port) => port,
}
}
async fn serve(self, compute: Arc<ComputeNode>) {
let listener = self.listener().await.unwrap_or_else(|e| {
// If we can't bind, the compute cannot operate correctly
panic!(
"failed to bind the compute_ctl {} HTTP server to {}: {}",
self,
SocketAddr::new(self.ip(), self.port()),
e
);
});
if tracing::enabled!(tracing::Level::INFO) {
let local_addr = match listener.local_addr() {
Ok(local_addr) => local_addr,
Err(_) => SocketAddr::new(self.ip(), self.port()),
};
info!(
"compute_ctl {} HTTP server listening at {}",
self, local_addr
// This usually binds to both IPv4 and IPv6 on Linux, see
// https://github.com/rust-lang/rust/pull/34440 for more information
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let listener = match TcpListener::bind(&addr).await {
Ok(listener) => listener,
Err(e) => {
error!(
"failed to bind the compute_ctl HTTP server to port {}: {}",
port, e
);
return;
}
};
let router = Router::from(self).with_state(compute);
if let Err(e) = axum::serve(listener, router).await {
error!("compute_ctl {} HTTP server error: {}", self, e);
}
if let Ok(local_addr) = listener.local_addr() {
info!("compute_ctl HTTP server listening on {}", local_addr);
} else {
info!("compute_ctl HTTP server listening on port {}", port);
}
pub fn launch(self, compute: &Arc<ComputeNode>) {
let state = Arc::clone(compute);
info!("Launching the {} server", self);
tokio::spawn(self.serve(state));
if let Err(e) = axum::serve(listener, app).await {
error!("compute_ctl HTTP server error: {}", e);
}
}
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
/// Launch HTTP server in a new task and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> tokio::task::JoinHandle<()> {
let state = Arc::clone(state);
next.run(request).await
tokio::spawn(serve(port, state))
}

View File

@@ -11,6 +11,7 @@ pub mod http;
pub mod logger;
pub mod catalog;
pub mod compute;
pub mod disk_quota;
pub mod extension_server;
pub mod installed_extensions;
pub mod local_proxy;
@@ -18,9 +19,9 @@ pub mod lsn_lease;
pub mod metrics;
mod migration;
pub mod monitor;
pub mod neonvmd_client;
pub mod params;
pub mod pg_helpers;
pub mod spec;
mod spec_apply;
pub mod swap;
pub mod sync_sk;

View File

@@ -1,102 +0,0 @@
use anyhow::Context;
use hyper::client::conn;
use hyper::client::conn::http1::SendRequest;
use hyper::{Request, StatusCode};
use hyper_util::rt::TokioIo;
use tracing::warn;
const NEONVM_DAEMON_CONTROL_SOCKET_PATH: &str = "/neonvm/run/neonvm-daemon-socket";
/// Open a connection to neonvm-daemon's control socket, prepare to send
/// requests to it with hyper.
async fn connect_neonvm_daemon<B>() -> anyhow::Result<SendRequest<B>>
where
B: hyper::body::Body + 'static + Send,
B::Data: Send,
B::Error: Into<Box<dyn std::error::Error + Send + Sync>>,
{
let mut attempts = 0;
let stream = loop {
match tokio::net::UnixStream::connect(NEONVM_DAEMON_CONTROL_SOCKET_PATH).await {
Ok(stream) => break stream,
Err(err) if err.kind() == std::io::ErrorKind::NotFound && attempts < 50 => {
// Retry
warn!("neonvm-daemon control socket not found, retrying...");
attempts += 1;
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
Err(err) => Err(err).context("opening neonvm-daemon control socket")?,
}
};
let io = TokioIo::new(stream);
let (request_sender, connection) = conn::http1::handshake(io).await.unwrap();
// spawn a task to poll the connection and drive the HTTP state
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("Error in connection: {}", e);
}
});
Ok(request_sender)
}
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
let rt = tokio::runtime::Handle::current();
rt.block_on(resize_swap_async(size_bytes))
}
pub async fn resize_swap_async(size_bytes: u64) -> anyhow::Result<()> {
let mut neonvmd = connect_neonvm_daemon().await?;
// Passing 'once' causes neonvm-daemon to reject any future resize requests
let request = Request::builder()
.method("POST")
.uri("/resize-swap-once")
.header("Host", "localhost") // hyper requires Host, even though the server won't care
.body(format!("{}", size_bytes))
.unwrap();
let resp = neonvmd.send_request(request).await?;
let status = resp.status();
match status {
StatusCode::OK => Ok(()),
StatusCode::CONFLICT => {
// 409 Conflict means that the swap was already resized. That happens if the
// compute_ctl restarts within the VM. That's considered OK.
warn!("Swap was already resized");
Ok(())
}
_ => Err(anyhow::anyhow!(
"error resizing swap: {}",
status.to_string()
)),
}
}
pub fn set_disk_quota(size_bytes: u64) -> anyhow::Result<()> {
let rt = tokio::runtime::Handle::current();
rt.block_on(set_disk_quota_async(size_bytes))
}
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
pub async fn set_disk_quota_async(size_bytes: u64) -> anyhow::Result<()> {
let mut neonvmd = connect_neonvm_daemon().await?;
let request = Request::builder()
.method("POST")
.uri("/set-disk-quota")
.header("Host", "localhost") // hyper requires Host, even though the server won't care
.body(format!("{}", size_bytes))
.unwrap();
let resp = neonvmd.send_request(request).await?;
let status = resp.status();
match status {
StatusCode::OK => Ok(()),
_ => Err(anyhow::anyhow!(
"error setting disk quota: {}",
status.to_string()
)),
}
}

View File

@@ -11,9 +11,7 @@ use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
use compute_api::responses::{
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
use compute_api::spec::ComputeSpec;
// Do control plane request and return response if any. In case of error it
@@ -75,13 +73,14 @@ fn do_control_plane_request(
pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
};
let mut attempt = 1;
let mut spec: Result<Option<ComputeSpec>> = Ok(None);
info!("getting spec from control plane: {}", cp_uri);
@@ -91,7 +90,7 @@ pub fn get_spec_from_control_plane(
// - no spec for compute yet (Empty state) -> return Ok(None)
// - got spec -> return Ok(Some(spec))
while attempt < 4 {
let result = match do_control_plane_request(&cp_uri, &jwt) {
spec = match do_control_plane_request(&cp_uri, &jwt) {
Ok(spec_resp) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[
@@ -100,10 +99,10 @@ pub fn get_spec_from_control_plane(
])
.inc();
match spec_resp.status {
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
ControlPlaneComputeStatus::Empty => Ok(None),
ControlPlaneComputeStatus::Attached => {
if let Some(spec) = spec_resp.spec {
Ok((Some(spec), spec_resp.compute_ctl_config))
Ok(Some(spec))
} else {
bail!("compute is attached, but spec is empty")
}
@@ -122,10 +121,10 @@ pub fn get_spec_from_control_plane(
}
};
if let Err(e) = &result {
if let Err(e) = &spec {
error!("attempt {} to get spec failed with: {}", attempt, e);
} else {
return result;
return spec;
}
attempt += 1;
@@ -133,9 +132,7 @@ pub fn get_spec_from_control_plane(
}
// All attempts failed, return error.
Err(anyhow::anyhow!(
"Exhausted all attempts to retrieve the spec from the control plane"
))
spec
}
/// Check `pg_hba.conf` and update if needed to allow external connections.

45
compute_tools/src/swap.rs Normal file
View File

@@ -0,0 +1,45 @@
use std::path::Path;
use anyhow::{anyhow, Context};
use tracing::warn;
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
// run `/neonvm/bin/resize-swap --once {size_bytes}`
//
// Passing '--once' causes resize-swap to delete itself after successful completion, which
// means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
// postgres is running.
//
// NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(RESIZE_SWAP_BIN)
.arg("--once")
.arg(size_bytes.to_string())
.spawn();
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => {
// The command failed. Maybe it was because the resize-swap file doesn't exist?
// The --once flag causes it to delete itself on success so we don't disable swap
// while postgres is running; maybe this is fine.
match Path::new(RESIZE_SWAP_BIN).try_exists() {
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
// The path doesn't exist; we're actually ok
Ok(false) => {
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
Ok(())
},
}
}
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| {
format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
})
}

View File

@@ -33,7 +33,6 @@ postgres_backend.workspace = true
safekeeper_api.workspace = true
postgres_connection.workspace = true
storage_broker.workspace = true
http-utils.workspace = true
utils.workspace = true
whoami.workspace = true

View File

@@ -552,10 +552,8 @@ struct EndpointCreateCmdArgs {
lsn: Option<Lsn>,
#[clap(long)]
pg_port: Option<u16>,
#[clap(long, alias = "http-port")]
external_http_port: Option<u16>,
#[clap(long)]
internal_http_port: Option<u16>,
http_port: Option<u16>,
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
@@ -1355,8 +1353,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
tenant_id,
timeline_id,
args.pg_port,
args.external_http_port,
args.internal_http_port,
args.http_port,
args.pg_version,
mode,
!args.update_catalog,

View File

@@ -37,8 +37,6 @@
//! ```
//!
use std::collections::BTreeMap;
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::path::PathBuf;
@@ -48,8 +46,6 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
@@ -77,8 +73,7 @@ pub struct EndpointConf {
timeline_id: TimelineId,
mode: ComputeMode,
pg_port: u16,
external_http_port: u16,
internal_http_port: u16,
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool,
@@ -133,7 +128,7 @@ impl ComputeControlPlane {
1 + self
.endpoints
.values()
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port()))
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
.max()
.unwrap_or(self.base_port)
}
@@ -145,27 +140,18 @@ impl ComputeControlPlane {
tenant_id: TenantId,
timeline_id: TimelineId,
pg_port: Option<u16>,
external_http_port: Option<u16>,
internal_http_port: Option<u16>,
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
external_http_address: SocketAddr::new(
IpAddr::from(Ipv4Addr::UNSPECIFIED),
external_http_port,
),
internal_http_address: SocketAddr::new(
IpAddr::from(Ipv4Addr::LOCALHOST),
internal_http_port,
),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
timeline_id,
mode,
@@ -190,8 +176,7 @@ impl ComputeControlPlane {
tenant_id,
timeline_id,
mode,
external_http_port,
internal_http_port,
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates,
@@ -245,10 +230,9 @@ pub struct Endpoint {
pub timeline_id: TimelineId,
pub mode: ComputeMode,
// port and address of the Postgres server and `compute_ctl`'s HTTP APIs
// port and address of the Postgres server and `compute_ctl`'s HTTP API
pub pg_address: SocketAddr,
pub external_http_address: SocketAddr,
pub internal_http_address: SocketAddr,
pub http_address: SocketAddr,
// postgres major version in the format: 14, 15, etc.
pg_version: u32,
@@ -303,15 +287,8 @@ impl Endpoint {
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
Ok(Endpoint {
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port),
external_http_address: SocketAddr::new(
IpAddr::from(Ipv4Addr::UNSPECIFIED),
conf.external_http_port,
),
internal_http_address: SocketAddr::new(
IpAddr::from(Ipv4Addr::LOCALHOST),
conf.internal_http_port,
),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
timeline_id: conf.timeline_id,
@@ -673,51 +650,40 @@ impl Endpoint {
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
//cmd.args([
// "--external-http-port",
// &self.external_http_address.port().to_string(),
//])
//.args([
// "--internal-http-port",
// &self.internal_http_address.port().to_string(),
//])
cmd.args([
"--http-port",
&self.external_http_address.port().to_string(),
])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
self.env
.pg_bin_dir(self.pg_version)?
.join("postgres")
.to_str()
.unwrap(),
])
// TODO: It would be nice if we generated compute IDs with the same
// algorithm as the real control plane.
//
// TODO: Add this back when
// https://github.com/neondatabase/neon/pull/10747 is merged.
//
//.args([
// "--compute-id",
// &format!(
// "compute-{}",
// SystemTime::now()
// .duration_since(UNIX_EPOCH)
// .unwrap()
// .as_secs()
// ),
//])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
self.env
.pg_bin_dir(self.pg_version)?
.join("postgres")
.to_str()
.unwrap(),
])
// TODO: It would be nice if we generated compute IDs with the same
// algorithm as the real control plane.
//
// TODO: Add this back when
// https://github.com/neondatabase/neon/pull/10747 is merged.
//
//.args([
// "--compute-id",
// &format!(
// "compute-{}",
// SystemTime::now()
// .duration_since(UNIX_EPOCH)
// .unwrap()
// .as_secs()
// ),
//])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
if let Some(remote_ext_config) = remote_ext_config {
cmd.args(["--remote-ext-config", remote_ext_config]);
@@ -804,8 +770,8 @@ impl Endpoint {
reqwest::Method::GET,
format!(
"http://{}:{}/status",
self.external_http_address.ip(),
self.external_http_address.port()
self.http_address.ip(),
self.http_address.port()
),
)
.send()
@@ -878,17 +844,14 @@ impl Endpoint {
let response = client
.post(format!(
"http://{}:{}/configure",
self.external_http_address.ip(),
self.external_http_address.port()
self.http_address.ip(),
self.http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.body(
serde_json::to_string(&ConfigurationRequest {
spec,
compute_ctl_config: ComputeCtlConfig::default(),
})
.unwrap(),
)
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()
.await?;

View File

@@ -357,16 +357,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
compaction_l0_first: settings
.remove("compaction_l0_first")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'compaction_l0_first' as a bool")?,
compaction_l0_semaphore: settings
.remove("compaction_l0_semaphore")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'compaction_l0_semaphore' as a bool")?,
l0_flush_delay_threshold: settings
.remove("l0_flush_delay_threshold")
.map(|x| x.parse::<usize>())

View File

@@ -17,10 +17,8 @@ use camino::Utf8PathBuf;
use postgres_connection::PgConnectionConfig;
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use http_utils::error::HttpErrorBody;
use utils::auth::{Claims, Scope};
use utils::id::NodeId;
use utils::{http::error::HttpErrorBody, id::NodeId};
use crate::{
background_process,

View File

@@ -838,10 +838,7 @@ impl StorageController {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
Some(TenantShardMigrateRequest { node_id }),
)
.await
}

View File

@@ -609,10 +609,7 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
node_id: node,
migration_config: None,
};
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -626,10 +623,7 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
node_id: node,
migration_config: None,
};
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -1088,10 +1082,7 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
node_id: mv.to,
migration_config: None,
}),
Some(TenantShardMigrateRequest { node_id: mv.to }),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))

View File

@@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
# We are running tests now
rm -f testout.txt testout_contrib.txt
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
$TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0

View File

@@ -1,15 +0,0 @@
diff --git a/test/schedule/create.sql b/test/schedule/create.sql
index ba355ed..7e250f5 100644
--- a/test/schedule/create.sql
+++ b/test/schedule/create.sql
@@ -1,3 +1,2 @@
\unset ECHO
\i test/psql.sql
-CREATE EXTENSION pgtap;
diff --git a/test/schedule/main.sch b/test/schedule/main.sch
index a8a5fbc..0463fc4 100644
--- a/test/schedule/main.sch
+++ b/test/schedule/main.sch
@@ -1,2 +1 @@
-test: build
test: create

View File

@@ -1,6 +0,0 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --max-connections=86 --schedule test/schedule/main.sch --schedule test/build/run.sch --dbname contrib_regression --use-existing

View File

@@ -11,7 +11,6 @@ if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEW
exit 1
fi
export PG_VERSION=${PG_VERSION:-16}
export PG_TEST_VERSION=${PG_VERSION}
function wait_for_ready {
TIME=0
while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
@@ -42,8 +41,7 @@ EXTENSIONS='[
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"},
{"extname": "pgtap", "extdir": "pgtap-src"}
{"extname": "pgjwt", "extdir": "pgjwt-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
@@ -60,12 +58,8 @@ docker compose cp ext-src neon-test-extensions:/
docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
create_extensions "${EXTNAMES}"
if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
exts="${EXTNAMES}"
else
query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
fi
query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
if [ -z "${exts}" ]; then
echo "No extensions were upgraded"
else
@@ -93,10 +87,7 @@ else
exit 1
fi
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs
exit 1
fi
docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh
docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update"
docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
done

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true
regex.workspace = true

View File

@@ -1,20 +1,18 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use crate::{
privilege::Privilege,
responses::ComputeCtlConfig,
spec::{ComputeSpec, ExtVersion, PgIdent},
};
use serde::{Deserialize, Serialize};
use serde::Deserialize;
/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can
/// extend it and something like `restart: bool` or something else. So put
/// `spec` into a struct initially to be more flexible in the future.
#[derive(Debug, Deserialize, Serialize)]
#[derive(Deserialize, Debug)]
pub struct ConfigurationRequest {
pub spec: ComputeSpec,
pub compute_ctl_config: ComputeCtlConfig,
}
#[derive(Deserialize, Debug)]

View File

@@ -3,7 +3,6 @@
use std::fmt::Display;
use chrono::{DateTime, Utc};
use jsonwebtoken::jwk::JwkSet;
use serde::{Deserialize, Serialize, Serializer};
use crate::{
@@ -136,27 +135,13 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
pub jwks: JwkSet,
}
impl Default for ComputeCtlConfig {
fn default() -> Self {
Self {
jwks: JwkSet {
keys: Vec::default(),
},
}
}
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving
/// to a different place.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
pub compute_ctl_config: ComputeCtlConfig,
}
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]

View File

@@ -1,37 +0,0 @@
[package]
name = "http-utils"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
backtrace.workspace = true
bytes.workspace = true
inferno.workspace = true
fail.workspace = true
flate2.workspace = true
hyper0.workspace = true
itertools.workspace = true
jemalloc_pprof.workspace = true
once_cell.workspace = true
pprof.workspace = true
regex.workspace = true
routerify.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_path_to_error.workspace = true
thiserror.workspace = true
tracing.workspace = true
tokio.workspace = true
tokio-util.workspace = true
url.workspace = true
uuid.workspace = true
# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.
tokio-stream = { version = "0.1.14" }
metrics.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,50 +0,0 @@
use crate::error::ApiError;
use crate::json::{json_request, json_response};
use hyper::{Body, Request, Response, StatusCode};
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use utils::failpoint_support::apply_failpoint;
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
/// Configure failpoints through http.
pub async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot manage failpoints because neon was compiled without failpoints support"
)));
}
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
for fp in failpoints {
tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Failed to configure failpoints: {err_msg}"
)));
}
}
json_response(StatusCode::OK, ())
}

View File

@@ -94,6 +94,7 @@ pub struct ConfigToml {
pub ondemand_download_behavior_treat_error_as_warn: bool,
#[serde(with = "humantime_serde")]
pub background_task_maximum_delay: Duration,
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<reqwest::Url>,
pub control_plane_api_token: Option<String>,
pub control_plane_emergency_mode: bool,
@@ -263,11 +264,6 @@ pub struct TenantConfigToml {
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, compact down L0 across all tenant timelines before doing regular compaction.
pub compaction_l0_first: bool,
/// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
/// has an effect if `compaction_l0_first` is `true`.
pub compaction_l0_semaphore: bool,
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
@@ -351,7 +347,7 @@ pub struct TenantConfigToml {
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
/// `index_part.json`, and it cannot be reversed.
pub rel_size_v2_enabled: bool,
pub rel_size_v2_enabled: Option<bool>,
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
@@ -476,6 +472,7 @@ impl Default for ConfigToml {
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
use_compaction_semaphore: false,
control_plane_api: (None),
control_plane_api_token: (None),
@@ -496,7 +493,7 @@ impl Default for ConfigToml {
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: (DEFAULT_IMAGE_COMPRESSION),
timeline_offloading: true,
timeline_offloading: false,
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: None,
virtual_file_io_mode: None,
@@ -548,8 +545,6 @@ pub mod tenant_conf_defaults {
// most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole
// calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
@@ -599,8 +594,6 @@ impl Default for TenantConfigToml {
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
l0_flush_delay_threshold: None,
l0_flush_stall_threshold: None,
l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
@@ -631,9 +624,9 @@ impl Default for TenantConfigToml {
image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: true,
timeline_offloading: false,
wal_receiver_protocol_override: None,
rel_size_v2_enabled: false,
rel_size_v2_enabled: None,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,

View File

@@ -182,18 +182,6 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
#[serde(default)]
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
#[derive(Serialize, Clone, Debug)]

View File

@@ -1,12 +1,10 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::Bytes;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::Oid;
use postgres_ffi::RepOriginId;
use serde::{Deserialize, Serialize};
use std::{fmt, ops::Range};
use utils::const_assert;
use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -51,64 +49,6 @@ pub const AUX_KEY_PREFIX: u8 = 0x62;
/// The key prefix of ReplOrigin keys.
pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
/// The key prefix of db directory keys.
pub const DB_DIR_KEY_PREFIX: u8 = 0x64;
/// The key prefix of rel directory keys.
pub const REL_DIR_KEY_PREFIX: u8 = 0x65;
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RelDirExists {
Exists,
Removed,
}
#[derive(Debug)]
pub struct DecodeError;
impl fmt::Display for DecodeError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "invalid marker")
}
}
impl std::error::Error for DecodeError {}
impl RelDirExists {
/// The value of the rel directory keys that indicates the existence of a relation.
const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r");
pub fn encode(&self) -> Bytes {
match self {
Self::Exists => Self::REL_EXISTS_MARKER.clone(),
Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(),
}
}
pub fn decode_option(data: Option<impl AsRef<[u8]>>) -> Result<Self, DecodeError> {
match data {
Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists),
// Any other marker is invalid
Some(_) => Err(DecodeError),
None => Ok(Self::Removed),
}
}
pub fn decode(data: impl AsRef<[u8]>) -> Result<Self, DecodeError> {
let data = data.as_ref();
if data == Self::REL_EXISTS_MARKER {
Ok(Self::Exists)
} else if data == SPARSE_TOMBSTONE_MARKER {
Ok(Self::Removed)
} else {
Err(DecodeError)
}
}
}
/// A tombstone in the sparse keyspace, which is an empty buffer.
pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b"");
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -170,24 +110,6 @@ impl Key {
}
}
pub fn rel_dir_sparse_key_range() -> Range<Self> {
Key {
field1: REL_DIR_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REL_DIR_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// This function checks more extensively what keys we can take on the write path.
/// If a key beginning with 00 does not have a global/default tablespace OID, it
/// will be rejected on the write path.
@@ -518,36 +440,6 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
}
}
#[inline(always)]
pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key {
Key {
field1: REL_DIR_KEY_PREFIX,
field2: spcnode,
field3: dbnode,
field4: relnode,
field5: forknum,
field6: 1,
}
}
pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: REL_DIR_KEY_PREFIX,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REL_DIR_KEY_PREFIX,
field2: spcnode,
field3: dbnode,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX,
} // it's fine to exclude the last key b/c we only use field6 == 1
}
#[inline(always)]
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
@@ -842,9 +734,9 @@ impl Key {
self.field1 == RELATION_SIZE_PREFIX
}
pub const fn sparse_non_inherited_keyspace() -> Range<Key> {
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
// The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX);
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
Key {
field1: AUX_KEY_PREFIX,
field2: 0,

View File

@@ -464,10 +464,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_first: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_semaphore: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_delay_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_stall_threshold: FieldPatch<usize>,
@@ -533,8 +529,6 @@ pub struct TenantConfig {
pub compaction_upper_limit: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_l0_first: Option<bool>,
pub compaction_l0_semaphore: Option<bool>,
pub l0_flush_delay_threshold: Option<usize>,
pub l0_flush_stall_threshold: Option<usize>,
pub l0_flush_wait_upload: Option<bool>,
@@ -573,8 +567,6 @@ impl TenantConfig {
mut compaction_threshold,
mut compaction_upper_limit,
mut compaction_algorithm,
mut compaction_l0_first,
mut compaction_l0_semaphore,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
mut l0_flush_wait_upload,
@@ -614,10 +606,6 @@ impl TenantConfig {
.compaction_upper_limit
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.compaction_l0_first.apply(&mut compaction_l0_first);
patch
.compaction_l0_semaphore
.apply(&mut compaction_l0_semaphore);
patch
.l0_flush_delay_threshold
.apply(&mut l0_flush_delay_threshold);
@@ -681,8 +669,6 @@ impl TenantConfig {
compaction_threshold,
compaction_upper_limit,
compaction_algorithm,
compaction_l0_first,
compaction_l0_semaphore,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
l0_flush_wait_upload,
@@ -1136,26 +1122,7 @@ pub struct TimelineInfo {
pub ancestor_lsn: Option<Lsn>,
pub last_record_lsn: Lsn,
pub prev_record_lsn: Option<Lsn>,
/// Legacy field for compat with control plane. Synonym of `min_readable_lsn`.
/// TODO: remove once control plane no longer reads it.
pub latest_gc_cutoff_lsn: Lsn,
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
/// as it is easier to reason about.
#[serde(default)]
pub applied_gc_cutoff_lsn: Lsn,
/// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval.
/// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest
/// LSN at which it is legal to create a branch or ephemeral endpoint.
///
/// Note that holders of valid LSN leases may be able to create branches and read pages earlier
/// than this LSN, but new leases may not be taken out earlier than this LSN.
#[serde(default)]
pub min_readable_lsn: Lsn,
pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage

View File

@@ -9,8 +9,6 @@ use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::io::ErrorKind;
use std::net::SocketAddr;
use std::os::fd::AsRawFd;
use std::os::fd::RawFd;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{ready, Poll};
@@ -270,7 +268,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
}
pub struct PostgresBackend<IO> {
pub socket_fd: RawFd,
framed: MaybeWriteOnly<IO>,
pub state: ProtoState,
@@ -296,11 +293,9 @@ impl PostgresBackend<tokio::net::TcpStream> {
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> io::Result<Self> {
let peer_addr = socket.peer_addr()?;
let socket_fd = socket.as_raw_fd();
let stream = MaybeTlsStream::Unencrypted(socket);
Ok(Self {
socket_fd,
framed: MaybeWriteOnly::Full(Framed::new(stream)),
state: ProtoState::Initialization,
auth_type,
@@ -312,7 +307,6 @@ impl PostgresBackend<tokio::net::TcpStream> {
impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
pub fn new_from_io(
socket_fd: RawFd,
socket: IO,
peer_addr: SocketAddr,
auth_type: AuthType,
@@ -321,7 +315,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let stream = MaybeTlsStream::Unencrypted(socket);
Ok(Self {
socket_fd,
framed: MaybeWriteOnly::Full(Framed::new(stream)),
state: ProtoState::Initialization,
auth_type,

View File

@@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream;
use crate::types::{Oid, ToSql, Type};
use crate::{
query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
SimpleQueryMessage, Statement, Transaction, TransactionBuilder,
prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
};
use bytes::BytesMut;
use fallible_iterator::FallibleIterator;
@@ -54,18 +54,18 @@ impl Responses {
}
/// A cache of type info and prepared statements for fetching type info
/// (corresponding to the queries in the [crate::prepare] module).
/// (corresponding to the queries in the [prepare] module).
#[derive(Default)]
struct CachedTypeInfo {
/// A statement for basic information for a type from its
/// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
/// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
/// fallback).
typeinfo: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
/// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY).
typeinfo_composite: Option<Statement>,
/// A statement for getting information for a composite type from its OID.
/// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or
/// its fallback).
typeinfo_enum: Option<Statement>,
@@ -190,6 +190,26 @@ impl Client {
&self.inner
}
/// Creates a new prepared statement.
///
/// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
/// which are set when executed. Prepared statements can only be used with the connection that created them.
pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
self.prepare_typed(query, &[]).await
}
/// Like `prepare`, but allows the types of query parameters to be explicitly specified.
///
/// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
/// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
pub async fn prepare_typed(
&self,
query: &str,
parameter_types: &[Type],
) -> Result<Statement, Error> {
prepare::prepare(&self.inner, query, parameter_types).await
}
/// Executes a statement, returning a vector of the resulting rows.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
@@ -202,11 +222,14 @@ impl Client {
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn query(
pub async fn query<T>(
&self,
statement: Statement,
statement: &T,
params: &[&(dyn ToSql + Sync)],
) -> Result<Vec<Row>, Error> {
) -> Result<Vec<Row>, Error>
where
T: ?Sized + ToStatement,
{
self.query_raw(statement, slice_iter(params))
.await?
.try_collect()
@@ -227,15 +250,13 @@ impl Client {
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`query`]: #method.query
pub async fn query_raw<'a, I>(
&self,
statement: Statement,
params: I,
) -> Result<RowStream, Error>
pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
where
T: ?Sized + ToStatement,
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
let statement = statement.__convert().into_statement(self).await?;
query::query(&self.inner, statement, params).await
}
@@ -250,6 +271,55 @@ impl Client {
query::query_txt(&self.inner, statement, params).await
}
/// Executes a statement, returning the number of rows modified.
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
pub async fn execute<T>(
&self,
statement: &T,
params: &[&(dyn ToSql + Sync)],
) -> Result<u64, Error>
where
T: ?Sized + ToStatement,
{
self.execute_raw(statement, slice_iter(params)).await
}
/// The maximally flexible version of [`execute`].
///
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
/// provided, 1-indexed.
///
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
/// with the `prepare` method.
///
/// # Panics
///
/// Panics if the number of parameters provided does not match the number expected.
///
/// [`execute`]: #method.execute
pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
where
T: ?Sized + ToStatement,
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
let statement = statement.__convert().into_statement(self).await?;
query::execute(self.inner(), statement, params).await
}
/// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
///
/// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that

View File

@@ -1,8 +1,7 @@
#![allow(async_fn_in_trait)]
use crate::query::RowStream;
use crate::types::Type;
use crate::{Client, Error, Transaction};
use async_trait::async_trait;
use postgres_protocol2::Oid;
mod private {
@@ -12,6 +11,7 @@ mod private {
/// A trait allowing abstraction over connections and transactions.
///
/// This trait is "sealed", and cannot be implemented outside of this crate.
#[async_trait]
pub trait GenericClient: private::Sealed {
/// Like `Client::query_raw_txt`.
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -26,6 +26,7 @@ pub trait GenericClient: private::Sealed {
impl private::Sealed for Client {}
#[async_trait]
impl GenericClient for Client {
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
where
@@ -38,12 +39,14 @@ impl GenericClient for Client {
/// Query for type information
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
crate::prepare::get_type(self.inner(), oid).await
self.get_type(oid).await
}
}
impl private::Sealed for Transaction<'_> {}
#[async_trait]
#[allow(clippy::needless_lifetimes)]
impl GenericClient for Transaction<'_> {
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
where

View File

@@ -14,6 +14,7 @@ pub use crate::row::{Row, SimpleQueryRow};
pub use crate::simple_query::SimpleQueryStream;
pub use crate::statement::{Column, Statement};
pub use crate::tls::NoTls;
pub use crate::to_statement::ToStatement;
pub use crate::transaction::Transaction;
pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
use crate::types::ToSql;
@@ -64,6 +65,7 @@ pub mod row;
mod simple_query;
mod statement;
pub mod tls;
mod to_statement;
mod transaction;
mod transaction_builder;
pub mod types;

View File

@@ -1,6 +1,7 @@
use crate::client::InnerClient;
use crate::codec::FrontendMessage;
use crate::connection::RequestMessages;
use crate::error::SqlState;
use crate::types::{Field, Kind, Oid, Type};
use crate::{query, slice_iter};
use crate::{Column, Error, Statement};
@@ -12,6 +13,7 @@ use postgres_protocol2::message::backend::Message;
use postgres_protocol2::message::frontend;
use std::future::Future;
use std::pin::Pin;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -22,6 +24,14 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
// Range types weren't added until Postgres 9.2, so pg_range may not exist
const TYPEINFO_FALLBACK_QUERY: &str = "\
SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid
FROM pg_catalog.pg_type t
INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
const TYPEINFO_ENUM_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
@@ -29,6 +39,14 @@ WHERE enumtypid = $1
ORDER BY enumsortorder
";
// Postgres 9.0 didn't have enumsortorder
const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
WHERE enumtypid = $1
ORDER BY oid
";
pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
SELECT attname, atttypid
FROM pg_catalog.pg_attribute
@@ -38,13 +56,15 @@ AND attnum > 0
ORDER BY attnum
";
static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
pub async fn prepare(
client: &Arc<InnerClient>,
name: &'static str,
query: &str,
types: &[Type],
) -> Result<Statement, Error> {
let buf = encode(client, name, query, types)?;
let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst));
let buf = encode(client, &name, query, types)?;
let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
match responses.next().await? {
@@ -85,11 +105,10 @@ pub async fn prepare(
fn prepare_rec<'a>(
client: &'a Arc<InnerClient>,
name: &'static str,
query: &'a str,
types: &'a [Type],
) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
Box::pin(prepare(client, name, query, types))
Box::pin(prepare(client, query, types))
}
fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
@@ -173,8 +192,13 @@ async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Erro
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
Ok(stmt) => stmt,
Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
}
Err(e) => return Err(e),
};
client.set_typeinfo(&stmt);
Ok(stmt)
@@ -195,8 +219,13 @@ async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement,
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_enum";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;
let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
Ok(stmt) => stmt,
Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
}
Err(e) => return Err(e),
};
client.set_typeinfo_enum(&stmt);
Ok(stmt)
@@ -226,8 +255,7 @@ async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<State
return Ok(stmt);
}
let typeinfo = "neon_proxy_typeinfo_composite";
let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
client.set_typeinfo_composite(&stmt);
Ok(stmt)

View File

@@ -157,6 +157,49 @@ where
})
}
pub async fn execute<'a, I>(
client: &InnerClient,
statement: Statement,
params: I,
) -> Result<u64, Error>
where
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
I::IntoIter: ExactSizeIterator,
{
let buf = if log_enabled!(Level::Debug) {
let params = params.into_iter().collect::<Vec<_>>();
debug!(
"executing statement {} with parameters: {:?}",
statement.name(),
BorrowToSqlParamsDebug(params.as_slice()),
);
encode(client, &statement, params)?
} else {
encode(client, &statement, params)?
};
let mut responses = start(client, buf).await?;
let mut rows = 0;
loop {
match responses.next().await? {
Message::DataRow(_) => {}
Message::CommandComplete(body) => {
rows = body
.tag()
.map_err(Error::parse)?
.rsplit(' ')
.next()
.unwrap()
.parse()
.unwrap_or(0);
}
Message::EmptyQueryResponse => rows = 0,
Message::ReadyForQuery(_) => return Ok(rows),
_ => return Err(Error::unexpected_message()),
}
}
}
async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

View File

@@ -13,7 +13,7 @@ use std::{
struct StatementInner {
client: Weak<InnerClient>,
name: &'static str,
name: String,
params: Vec<Type>,
columns: Vec<Column>,
}
@@ -22,7 +22,7 @@ impl Drop for StatementInner {
fn drop(&mut self) {
if let Some(client) = self.client.upgrade() {
let buf = client.with_buf(|buf| {
frontend::close(b'S', self.name, buf).unwrap();
frontend::close(b'S', &self.name, buf).unwrap();
frontend::sync(buf);
buf.split().freeze()
});
@@ -40,7 +40,7 @@ pub struct Statement(Arc<StatementInner>);
impl Statement {
pub(crate) fn new(
inner: &Arc<InnerClient>,
name: &'static str,
name: String,
params: Vec<Type>,
columns: Vec<Column>,
) -> Statement {
@@ -55,14 +55,14 @@ impl Statement {
pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
Statement(Arc::new(StatementInner {
client: Weak::new(),
name: "<anonymous>",
name: String::new(),
params,
columns,
}))
}
pub(crate) fn name(&self) -> &str {
self.0.name
&self.0.name
}
/// Returns the expected types of the statement's parameters.

View File

@@ -0,0 +1,57 @@
use crate::to_statement::private::{Sealed, ToStatementType};
use crate::Statement;
mod private {
use crate::{Client, Error, Statement};
pub trait Sealed {}
pub enum ToStatementType<'a> {
Statement(&'a Statement),
Query(&'a str),
}
impl ToStatementType<'_> {
pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
match self {
ToStatementType::Statement(s) => Ok(s.clone()),
ToStatementType::Query(s) => client.prepare(s).await,
}
}
}
}
/// A trait abstracting over prepared and unprepared statements.
///
/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
/// was prepared previously.
///
/// This trait is "sealed" and cannot be implemented by anything outside this crate.
pub trait ToStatement: Sealed {
#[doc(hidden)]
fn __convert(&self) -> ToStatementType<'_>;
}
impl ToStatement for Statement {
fn __convert(&self) -> ToStatementType<'_> {
ToStatementType::Statement(self)
}
}
impl Sealed for Statement {}
impl ToStatement for str {
fn __convert(&self) -> ToStatementType<'_> {
ToStatementType::Query(self)
}
}
impl Sealed for str {}
impl ToStatement for String {
fn __convert(&self) -> ToStatementType<'_> {
ToStatementType::Query(self)
}
}
impl Sealed for String {}

View File

@@ -21,17 +21,23 @@ bytes.workspace = true
camino.workspace = true
chrono.workspace = true
diatomic-waker.workspace = true
flate2.workspace = true
git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper0 = { workspace = true, features = ["full"] }
inferno.workspace = true
itertools.workspace = true
fail.workspace = true
futures = { workspace = true }
jemalloc_pprof.workspace = true
jsonwebtoken.workspace = true
nix = {workspace = true, features = [ "ioctl" ] }
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
pprof.workspace = true
regex.workspace = true
routerify.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
@@ -48,6 +54,8 @@ rand.workspace = true
scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
pq_proto.workspace = true
@@ -56,6 +64,12 @@ metrics.workspace = true
const_format.workspace = true
# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.
tokio-stream = { version = "0.1.14" }
serde_path_to_error.workspace = true
[dev-dependencies]
byteorder.workspace = true
bytes.workspace = true

View File

@@ -10,7 +10,7 @@ use jsonwebtoken::{
};
use serde::{Deserialize, Serialize};
use crate::id::TenantId;
use crate::{http::error::ApiError, id::TenantId};
/// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -90,6 +90,15 @@ impl Display for AuthError {
}
}
impl From<AuthError> for ApiError {
fn from(_value: AuthError) -> Self {
// Don't pass on the value of the AuthError as a precautionary measure.
// Being intentionally vague in public error communication hurts debugability
// but it is more secure.
ApiError::Forbidden("JWT authentication error".to_string())
}
}
pub struct JwtAuth {
decoding_keys: Vec<DecodingKey>,
validation: Validation,

View File

@@ -1,6 +1,13 @@
//! Failpoint support code shared between pageserver and safekeepers.
use crate::http::{
error::ApiError,
json::{json_request, json_response},
};
use hyper::{Body, Request, Response, StatusCode};
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::*;
/// Declare a failpoint that can use to `pause` failpoint action.
/// We don't want to block the executor thread, hence, spawn_blocking + await.
@@ -177,3 +184,45 @@ fn exit_failpoint() {
tracing::info!("Exit requested by failpoint");
std::process::exit(1);
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
/// Configure failpoints through http.
pub async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot manage failpoints because neon was compiled without failpoints support"
)));
}
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
for fp in failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Failed to configure failpoints: {err_msg}"
)));
}
}
json_response(StatusCode::OK, ())
}

View File

@@ -1,6 +1,7 @@
use crate::error::{api_error_handler, route_error_handler, ApiError};
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use crate::http::request::{get_query_param, parse_query_param};
use crate::pprof;
use crate::request::{get_query_param, parse_query_param};
use ::pprof::protos::Message as _;
use ::pprof::ProfilerGuardBuilder;
use anyhow::{anyhow, Context};
@@ -18,7 +19,6 @@ use tokio::sync::{mpsc, Mutex, Notify};
use tokio_stream::wrappers::ReceiverStream;
use tokio_util::io::ReaderStream;
use tracing::{debug, info, info_span, warn, Instrument};
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
use std::future::Future;
use std::io::Write as _;
@@ -718,9 +718,9 @@ pub fn check_permission_with(
#[cfg(test)]
mod tests {
use super::*;
use futures::future::poll_fn;
use hyper::service::Service;
use routerify::RequestServiceBuilder;
use std::future::poll_fn;
use std::net::{IpAddr, SocketAddr};
#[tokio::test]

View File

@@ -5,8 +5,6 @@ use std::error::Error as StdError;
use thiserror::Error;
use tracing::{error, info, warn};
use utils::auth::AuthError;
#[derive(Debug, Error)]
pub enum ApiError {
#[error("Bad request: {0:#?}")]
@@ -98,15 +96,6 @@ impl ApiError {
}
}
impl From<AuthError> for ApiError {
fn from(_value: AuthError) -> Self {
// Don't pass on the value of the AuthError as a precautionary measure.
// Being intentionally vague in public error communication hurts debugability
// but it is more secure.
ApiError::Forbidden("JWT authentication error".to_string())
}
}
#[derive(Serialize, Deserialize)]
pub struct HttpErrorBody {
pub msg: String,

View File

@@ -1,12 +1,8 @@
pub mod endpoint;
pub mod error;
pub mod failpoints;
pub mod json;
pub mod pprof;
pub mod request;
extern crate hyper0 as hyper;
/// Current fast way to apply simple http routing in various Neon binaries.
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};

View File

@@ -2,6 +2,8 @@
//! between other crates in this repository.
#![deny(clippy::undocumented_unsafe_blocks)]
extern crate hyper0 as hyper;
pub mod backoff;
/// `Lsn` type implements common tasks on Log Sequence Numbers
@@ -31,6 +33,9 @@ pub mod shard;
mod hex;
pub use hex::Hex;
// http endpoint utils
pub mod http;
// definition of the Generation type for pageserver attachment APIs
pub mod generation;
@@ -91,10 +96,9 @@ pub mod circuit_breaker;
pub mod try_rcu;
pub mod guard_arc_swap;
pub mod pprof;
#[cfg(target_os = "linux")]
pub mod linux_socket_ioctl;
pub mod guard_arc_swap;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]

View File

@@ -1,35 +0,0 @@
//! Linux-specific socket ioctls.
//!
//! <https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27>
use std::{
io,
mem::MaybeUninit,
os::{fd::RawFd, raw::c_int},
};
use nix::libc::{FIONREAD, TIOCOUTQ};
unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> {
let mut inq: MaybeUninit<c_int> = MaybeUninit::uninit();
let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr());
if err == 0 {
Ok(inq.assume_init())
} else {
Err(io::Error::last_os_error())
}
}
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> {
do_ioctl(socket_fd, FIONREAD)
}
/// # Safety
///
/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> {
do_ioctl(socket_fd, TIOCOUTQ)
}

View File

@@ -79,7 +79,6 @@ pq_proto.workspace = true
remote_storage.workspace = true
storage_broker.workspace = true
tenant_size_model.workspace = true
http-utils.workspace = true
utils.workspace = true
workspace_hack.workspace = true
reqwest.workspace = true

View File

@@ -11,7 +11,6 @@ testing = [ "pageserver_api/testing" ]
pageserver_api.workspace = true
thiserror.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
http-utils.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,12 +1,11 @@
use std::{collections::HashMap, error::Error as _};
use bytes::Bytes;
use reqwest::{IntoUrl, Method, StatusCode};
use detach_ancestor::AncestorDetached;
use http_utils::error::HttpErrorBody;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};

View File

@@ -13,7 +13,7 @@
use anyhow::{anyhow, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{rel_block_to_key, Key};
use pageserver_api::key::Key;
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::{Instant, SystemTime};
@@ -42,8 +42,8 @@ use utils::lsn::Lsn;
pub enum BasebackupError {
#[error("basebackup pageserver error {0:#}")]
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#} when {1}")]
Client(#[source] io::Error, &'static str),
#[error("basebackup client error {0:#}")]
Client(#[source] io::Error),
}
/// Create basebackup with non-rel data in it.
@@ -234,7 +234,7 @@ where
self.ar
.append(&header, self.buf.as_slice())
.await
.map_err(|e| BasebackupError::Client(e, "flush"))?;
.map_err(BasebackupError::Client)?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -273,9 +273,9 @@ where
for dir in subdirs.iter() {
let header = new_tar_header_dir(dir)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball"))?;
.context("could not add directory to basebackup tarball")?;
}
// Send config files.
@@ -286,13 +286,13 @@ where
self.ar
.append(&header, data)
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?;
.context("could not add config file to basebackup tarball")?;
} else {
let header = new_tar_header(filepath, 0)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?;
.context("could not add config file to basebackup tarball")?;
}
}
if !lazy_slru_download {
@@ -406,7 +406,7 @@ where
self.ar
.append(&header, &*content)
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?;
.context("could not add aux file to basebackup tarball")?;
}
if min_restart_lsn != Lsn::MAX {
@@ -419,7 +419,7 @@ where
self.ar
.append(&header, &data[..])
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?;
.context("could not add restart.lsn file to basebackup tarball")?;
}
for xid in self
.timeline
@@ -451,9 +451,9 @@ where
let crc32 = crc32c::crc32c(&content);
content.extend_from_slice(&crc32.to_le_bytes());
let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
self.ar.append(&header, &*content).await.map_err(|e| {
BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint")
})?;
self.ar.append(&header, &*content).await.context(
"could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
)?;
}
fail_point!("basebackup-before-control-file", |_| {
@@ -464,10 +464,7 @@ where
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
self.ar
.finish()
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
debug!("all tarred up!");
Ok(())
}
@@ -485,9 +482,9 @@ where
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?;
.map_err(BasebackupError::Client)?;
return Ok(());
}
@@ -501,9 +498,13 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
// TODO: investigate using get_vectored for the entire startblk..endblk range.
// But this code path is not on the critical path for most basebackups (?).
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
.get_rel_page_at_lsn(
src,
blknum,
Version::Lsn(self.lsn),
self.ctx,
self.io_concurrency.clone(),
)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
segment_data.extend_from_slice(&img[..]);
@@ -514,7 +515,7 @@ where
self.ar
.append(&header, segment_data.as_slice())
.await
.map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?;
.map_err(BasebackupError::Client)?;
seg += 1;
startblk = endblk;
@@ -565,7 +566,7 @@ where
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?;
.map_err(BasebackupError::Client)?;
info!("timeline.pg_version {}", self.timeline.pg_version);
@@ -575,7 +576,7 @@ where
self.ar
.append(&header, &img[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?;
.map_err(BasebackupError::Client)?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -611,9 +612,9 @@ where
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;
.map_err(BasebackupError::Client)?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -626,14 +627,14 @@ where
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;
.map_err(BasebackupError::Client)?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?;
.map_err(BasebackupError::Client)?;
}
};
Ok(())
@@ -662,7 +663,7 @@ where
self.ar
.append(&header, &buf[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?;
.map_err(BasebackupError::Client)?;
Ok(())
}
@@ -692,7 +693,7 @@ where
zenith_signal.as_bytes(),
)
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
.map_err(BasebackupError::Client)?;
let checkpoint_bytes = self
.timeline
@@ -717,7 +718,7 @@ where
self.ar
.append(&header, &pg_control_bytes[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?;
.map_err(BasebackupError::Client)?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -741,7 +742,7 @@ where
self.ar
.append(&header, &wal_seg[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?;
.map_err(BasebackupError::Client)?;
Ok(())
}
}

View File

@@ -592,7 +592,7 @@ fn start_pageserver(
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
.build()
.map_err(|err| anyhow!(err))?;
let service = http_utils::RouterService::new(router).unwrap();
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper0::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown({

View File

@@ -140,6 +140,10 @@ pub struct PageServerConf {
/// not terrible.
pub background_task_maximum_delay: Duration,
/// If true, use a separate semaphore for compaction tasks instead of the common background task
/// semaphore. Defaults to false.
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<Url>,
/// JWT token for use with the control plane API.
@@ -336,6 +340,7 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_api_token,
control_plane_emergency_mode,
@@ -390,6 +395,7 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_emergency_mode,
heatmap_upload_concurrency,

View File

@@ -1080,10 +1080,7 @@ components:
type: integer
state:
type: string
min_readable_lsn:
type: string
format: hex
applied_gc_cutoff_lsn:
latest_gc_cutoff_lsn:
type: string
format: hex

View File

@@ -13,12 +13,6 @@ use enumset::EnumSet;
use futures::future::join_all;
use futures::StreamExt;
use futures::TryFutureExt;
use http_utils::endpoint::{
profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
};
use http_utils::failpoints::failpoints_handler;
use http_utils::request::must_parse_query_param;
use http_utils::request::{get_request_param, must_get_query_param, parse_query_param};
use humantime::format_rfc3339;
use hyper::header;
use hyper::StatusCode;
@@ -66,6 +60,13 @@ use tokio::time::Instant;
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::{
profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
};
use utils::http::request::must_parse_query_param;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -103,13 +104,6 @@ use crate::tenant::OffloadedTimeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::DEFAULT_PG_VERSION;
use crate::{disk_usage_eviction_task, tenant};
use http_utils::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
error::{ApiError, HttpErrorBody},
json::{json_request, json_request_maybe, json_response},
request::parse_request_param,
RequestExt, RouterBuilder,
};
use pageserver_api::models::{
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
TimelineInfo,
@@ -117,6 +111,13 @@ use pageserver_api::models::{
use utils::{
auth::SwappableJwtAuth,
generation::Generation,
http::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
error::{ApiError, HttpErrorBody},
json::{json_request, json_request_maybe, json_response},
request::parse_request_param,
RequestExt, RouterBuilder,
},
id::{TenantId, TimelineId},
lsn::Lsn,
};
@@ -482,11 +483,6 @@ async fn build_timeline_info_common(
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
let min_readable_lsn = std::cmp::max(
timeline.get_gc_cutoff_lsn(),
*timeline.get_applied_gc_cutoff_lsn(),
);
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
@@ -498,12 +494,7 @@ async fn build_timeline_info_common(
initdb_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
// Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally
// we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we
// actually trimmed data to), which can pass each other when PITR is changed.
latest_gc_cutoff_lsn: min_readable_lsn,
min_readable_lsn,
applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
current_logical_size_is_accurate: match current_logical_size.accuracy() {
tenant::timeline::logical_size::Accuracy::Approximate => false,
@@ -570,7 +561,7 @@ async fn reload_auth_validation_keys_handler(
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
match utils::auth::JwtAuth::from_key_path(key_path) {
match JwtAuth::from_key_path(key_path) {
Ok(new_auth) => {
shared_auth.swap(new_auth);
json_response(StatusCode::OK, ())
@@ -2161,7 +2152,6 @@ async fn timeline_compact_handler(
let state = get_state(&request);
let mut flags = EnumSet::empty();
flags |= CompactFlags::NoYield; // run compaction to completion
if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
flags |= CompactFlags::ForceL0Compaction;

View File

@@ -1,6 +1,5 @@
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::os::fd::RawFd;
use std::pin::Pin;
use std::sync::atomic::AtomicU64;
use std::sync::{Arc, Mutex};
@@ -130,7 +129,7 @@ pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
"Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
&["tenant_id", "shard_id", "timeline_id"],
// Low resolution to reduce cardinality.
vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
@@ -1367,7 +1366,10 @@ impl SmgrOpTimer {
/// The first callers receives Some, subsequent ones None.
///
/// See [`SmgrOpTimerState`] for more context.
pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option<SmgrOpFlushInProgress> {
pub(crate) fn observe_execution_end_flush_start(
&mut self,
at: Instant,
) -> Option<SmgrOpFlushInProgress> {
// NB: unlike the other observe_* methods, this one take()s.
#[allow(clippy::question_mark)] // maintain similar code pattern.
let Some(mut inner) = self.0.take() else {
@@ -1401,6 +1403,7 @@ impl SmgrOpTimer {
..
} = inner;
Some(SmgrOpFlushInProgress {
flush_started_at: at,
global_micros: global_flush_in_progress_micros,
per_timeline_micros: per_timeline_flush_in_progress_micros,
})
@@ -1416,6 +1419,7 @@ impl SmgrOpTimer {
/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
/// and remove this struct from the code base.
pub(crate) struct SmgrOpFlushInProgress {
flush_started_at: Instant,
global_micros: IntCounter,
per_timeline_micros: IntCounter,
}
@@ -1434,72 +1438,32 @@ impl Drop for SmgrOpTimer {
self.observe_throttle_start(now);
self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
self.observe_execution_start(now);
let maybe_flush_timer = self.observe_execution_end(now);
drop(maybe_flush_timer);
self.observe_execution_end_flush_start(now);
}
}
impl SmgrOpFlushInProgress {
/// The caller must guarantee that `socket_fd`` outlives this function.
pub(crate) async fn measure<Fut, O>(
self,
started_at: Instant,
mut fut: Fut,
socket_fd: RawFd,
) -> O
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
where
Fut: std::future::Future<Output = O>,
{
let mut fut = std::pin::pin!(fut);
let mut logged = false;
let mut last_counter_increment_at = started_at;
// Whenever observe_guard gets called, or dropped,
// it adds the time elapsed since its last call to metrics.
// Last call is tracked in `now`.
let mut observe_guard = scopeguard::guard(
|is_timeout| {
|| {
let now = Instant::now();
// Increment counter
{
let elapsed_since_last_observe = now - last_counter_increment_at;
self.global_micros
.inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
self.per_timeline_micros
.inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
last_counter_increment_at = now;
}
// Log something on every timeout, and on completion but only if we hit a timeout.
if is_timeout || logged {
logged = true;
let elapsed_total = now - started_at;
let msg = if is_timeout {
"slow flush ongoing"
} else {
"slow flush completed or cancelled"
};
let (inq, outq) = {
// SAFETY: caller guarantees that `socket_fd` outlives this function.
#[cfg(target_os = "linux")]
unsafe {
(
utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
)
}
#[cfg(not(target_os = "linux"))]
{
_ = socket_fd; // appease unused lint on macOS
(-1, -1)
}
};
let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
tracing::info!(elapsed_total_secs, inq, outq, msg);
}
let elapsed = now - self.flush_started_at;
self.global_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.per_timeline_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.flush_started_at = now;
},
|mut observe| {
observe(false);
observe();
},
);
@@ -1507,7 +1471,7 @@ impl SmgrOpFlushInProgress {
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
Ok(v) => return v,
Err(_timeout) => {
(*observe_guard)(true);
(*observe_guard)();
}
}
}

View File

@@ -73,7 +73,6 @@ use pageserver_api::models::PageTraceEvent;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
/// is not yet in state [`TenantState::Active`].
@@ -237,7 +236,7 @@ pub async fn libpq_listener_main(
type ConnectionHandlerResult = anyhow::Result<()>;
#[instrument(skip_all, fields(peer_addr, application_name))]
#[instrument(skip_all, fields(peer_addr))]
#[allow(clippy::too_many_arguments)]
async fn page_service_conn_main(
conf: &'static PageServerConf,
@@ -258,8 +257,6 @@ async fn page_service_conn_main(
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
let socket_fd = socket.as_raw_fd();
let peer_addr = socket.peer_addr().context("get peer address")?;
tracing::Span::current().record("peer_addr", field::display(peer_addr));
@@ -308,7 +305,7 @@ async fn page_service_conn_main(
cancel.clone(),
gate_guard,
);
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend.run(&mut conn_handler, &cancel).await {
Ok(()) => {
@@ -917,7 +914,7 @@ impl PageServerHandler {
&shard,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
&shard.get_latest_gc_cutoff_lsn(),
ctx,
)
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
@@ -1077,7 +1074,7 @@ impl PageServerHandler {
};
// invoke handler function
let (mut handler_results, span): (
let (handler_results, span): (
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
_,
) = match batch {
@@ -1204,7 +1201,7 @@ impl PageServerHandler {
}
};
// We purposefully don't count flush time into the smgr operation timer.
// We purposefully don't count flush time into the smgr operaiton timer.
//
// The reason is that current compute client will not perform protocol processing
// if the postgres backend process is doing things other than `->smgr_read()`.
@@ -1221,32 +1218,17 @@ impl PageServerHandler {
// call, which (all unmeasured) adds syscall overhead but reduces time to first byte
// and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
// TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
let flush_timers = {
let flushing_start_time = Instant::now();
let mut flush_timers = Vec::with_capacity(handler_results.len());
for handler_result in &mut handler_results {
let flush_timer = match handler_result {
Ok((_, timer)) => Some(
timer
.observe_execution_end(flushing_start_time)
.expect("we are the first caller"),
),
Err(_) => {
// TODO: measure errors
None
}
};
flush_timers.push(flush_timer);
}
assert_eq!(flush_timers.len(), handler_results.len());
flush_timers
};
//
// Since we're flushing multiple times in the loop, but only have access to the per-op
// timers inside the loop, we capture the flush start time here and reuse it to finish
// each op timer.
let flushing_start_time = Instant::now();
// Map handler result to protocol behavior.
// Some handler errors cause exit from pagestream protocol.
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
let response_msg = match handler_result {
for handler_result in handler_results {
let (response_msg, timer) = match handler_result {
Err(e) => match &e.err {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1270,14 +1252,16 @@ impl PageServerHandler {
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
PagestreamBeMessage::Error(PagestreamErrorResponse {
req: e.req,
message: e.err.to_string(),
})
(
PagestreamBeMessage::Error(PagestreamErrorResponse {
req: e.req,
message: e.err.to_string(),
}),
None, // TODO: measure errors
)
}
},
Ok((response_msg, _op_timer_already_observed)) => response_msg,
Ok((response_msg, timer)) => (response_msg, Some(timer)),
};
//
@@ -1288,16 +1272,19 @@ impl PageServerHandler {
&response_msg.serialize(protocol_version),
))?;
let flushing_timer = timer.map(|mut timer| {
timer
.observe_execution_end_flush_start(flushing_start_time)
.expect("we are the first caller")
});
// what we want to do
let socket_fd = pgb_writer.socket_fd;
let flush_fut = pgb_writer.flush();
// metric for how long flushing takes
let flush_fut = match flushing_timer {
Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
Instant::now(),
flush_fut,
socket_fd,
)),
Some(flushing_timer) => {
futures::future::Either::Left(flushing_timer.measure(flush_fut))
}
None => futures::future::Either::Right(flush_fut),
};
// do it while respecting cancellation
@@ -1816,7 +1803,7 @@ impl PageServerHandler {
req: &PagestreamExistsRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1843,7 +1830,7 @@ impl PageServerHandler {
req: &PagestreamNblocksRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1870,7 +1857,7 @@ impl PageServerHandler {
req: &PagestreamDbSizeRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1960,7 +1947,7 @@ impl PageServerHandler {
req: &PagestreamGetSlruSegmentRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -2056,8 +2043,7 @@ impl PageServerHandler {
{
fn map_basebackup_error(err: BasebackupError) -> QueryError {
match err {
// TODO: passthrough the error site to the final error message?
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
}
}
@@ -2078,7 +2064,7 @@ impl PageServerHandler {
//return Err(QueryError::NotFound("timeline is archived".into()))
}
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
@@ -2158,12 +2144,10 @@ impl PageServerHandler {
.await
.map_err(map_basebackup_error)?;
}
writer.flush().await.map_err(|e| {
map_basebackup_error(BasebackupError::Client(
e,
"handle_basebackup_request,flush",
))
})?;
writer
.flush()
.await
.map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?;
}
pgb.write_message_noflush(&BeMessage::CopyDone)
@@ -2463,16 +2447,9 @@ where
fn startup(
&mut self,
_pgb: &mut PostgresBackend<IO>,
sm: &FeStartupPacket,
_sm: &FeStartupPacket,
) -> Result<(), QueryError> {
fail::fail_point!("ps::connection-start::startup-packet");
if let FeStartupPacket::StartupMessage { params, .. } = sm {
if let Some(app_name) = params.get("application_name") {
Span::current().record("application_name", field::display(app_name));
}
};
Ok(())
}

View File

@@ -23,14 +23,13 @@ use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range,
slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key,
twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY,
CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::key::{rel_tag_sparse_key, Key};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -491,33 +490,12 @@ impl Timeline {
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
return Ok(false);
}
// Read path: first read the new reldir keyspace. Early return if the relation exists.
// Otherwise, read the old reldir keyspace.
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
if self.get_rel_size_v2_enabled() {
// fetch directory listing (new)
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
.map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
let exists_v2 = buf == RelDirExists::Exists;
// Fast path: if the relation exists in the new format, return true.
// TODO: we should have a verification mode that checks both keyspaces
// to ensure the relation only exists in one of them.
if exists_v2 {
return Ok(true);
}
}
// fetch directory listing (old)
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = version.get(self, key, ctx).await?;
let dir = RelDirectory::des(&buf)?;
let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum));
Ok(exists_v1)
Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
}
/// Get a list of all existing relations in given tablespace and database.
@@ -535,12 +513,12 @@ impl Timeline {
version: Version<'_>,
ctx: &RequestContext,
) -> Result<HashSet<RelTag>, PageReconstructError> {
// fetch directory listing (old)
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let dir = RelDirectory::des(&buf)?;
let rels_v1: HashSet<RelTag> =
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
@@ -548,46 +526,6 @@ impl Timeline {
forknum: *forknum,
}));
if !self.get_rel_size_v2_enabled() {
return Ok(rels_v1);
}
// scan directory listing (new), merge with the old results
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
);
let results = self
.scan(
KeySpace::single(key_range),
version.get_lsn(),
ctx,
io_concurrency,
)
.await?;
let mut rels = rels_v1;
for (key, val) in results {
let val = RelDirExists::decode(&val?)
.map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
assert_eq!(key.field6, 1);
assert_eq!(key.field2, spcnode);
assert_eq!(key.field3, dbnode);
let tag = RelTag {
spcnode,
dbnode,
relnode: key.field4,
forknum: key.field5,
};
if val == RelDirExists::Removed {
debug_assert!(!rels.contains(&tag), "removed reltag in v2");
continue;
}
let did_not_contain = rels.insert(tag);
debug_assert!(did_not_contain, "duplicate reltag in v2");
}
Ok(rels)
}
@@ -673,7 +611,7 @@ impl Timeline {
) -> Result<LsnForTimestamp, PageReconstructError> {
pausable_failpoint!("find-lsn-for-timestamp-pausable");
let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let gc_cutoff_planned = {
let gc_info = self.gc_info.read().unwrap();
gc_info.min_cutoff()
@@ -1206,11 +1144,7 @@ impl Timeline {
let dense_keyspace = result.to_keyspace();
let sparse_keyspace = SparseKeySpace(KeySpace {
ranges: vec![
Key::metadata_aux_key_range(),
repl_origin_key_range(),
Key::rel_dir_sparse_key_range(),
],
ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
});
if cfg!(debug_assertions) {
@@ -1340,22 +1274,12 @@ pub struct DatadirModification<'a> {
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
pending_metadata_bytes: usize,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MetricsUpdate {
/// Set the metrics to this value
Set(u64),
/// Increment the metrics by this value
Add(u64),
/// Decrement the metrics by this value
Sub(u64),
}
impl DatadirModification<'_> {
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
@@ -1435,8 +1359,7 @@ impl DatadirModification<'_> {
let buf = DbDirectory::ser(&DbDirectory {
dbdirs: HashMap::new(),
})?;
self.pending_directory_entries
.push((DirectoryKind::Db, MetricsUpdate::Set(0)));
self.pending_directory_entries.push((DirectoryKind::Db, 0));
self.put(DBDIR_KEY, Value::Image(buf.into()));
let buf = if self.tline.pg_version >= 17 {
@@ -1449,7 +1372,7 @@ impl DatadirModification<'_> {
})
}?;
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0)));
.push((DirectoryKind::TwoPhase, 0));
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
@@ -1459,23 +1382,17 @@ impl DatadirModification<'_> {
// harmless but they'd just be dropped on later compaction.
if self.tline.tenant_shard_id.is_shard_zero() {
self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(SlruKind::Clog),
MetricsUpdate::Set(0),
));
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
self.put(
slru_dir_to_key(SlruKind::MultiXactMembers),
empty_dir.clone(),
);
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(SlruKind::Clog),
MetricsUpdate::Set(0),
));
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets),
MetricsUpdate::Set(0),
));
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
}
Ok(())
@@ -1741,16 +1658,10 @@ impl DatadirModification<'_> {
}
if r.is_none() {
// Create RelDirectory
// TODO: if we have fully migrated to v2, no need to create this directory
let buf = RelDirectory::ser(&RelDirectory {
rels: HashSet::new(),
})?;
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
if self.tline.get_rel_size_v2_enabled() {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
}
self.pending_directory_entries.push((DirectoryKind::Rel, 0));
self.put(
rel_dir_to_key(spcnode, dbnode),
Value::Image(Bytes::from(buf)),
@@ -1774,10 +1685,8 @@ impl DatadirModification<'_> {
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid = xid as u32;
@@ -1785,10 +1694,8 @@ impl DatadirModification<'_> {
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectory::ser(&dir)?)
};
self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -1837,10 +1744,8 @@ impl DatadirModification<'_> {
let mut dir = DbDirectory::des(&buf)?;
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
let buf = DbDirectory::ser(&dir)?;
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dir.dbdirs.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::Db, dir.dbdirs.len()));
self.put(DBDIR_KEY, Value::Image(buf.into()));
} else {
warn!(
@@ -1873,85 +1778,39 @@ impl DatadirModification<'_> {
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let dbdir_exists =
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
self.put(DBDIR_KEY, Value::Image(buf.into()));
false
} else {
true
};
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if !dbdir_exists {
// Create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
if self.tline.get_rel_size_v2_enabled() {
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
let val = self
.sparse_get(sparse_rel_dir_key, ctx)
.await
.map_err(|e| RelationError::Other(e.into()))?;
let val = RelDirExists::decode_option(val)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
if val == RelDirExists::Exists {
return Err(RelationError::AlreadyExists);
}
self.put(
sparse_rel_dir_key,
Value::Image(RelDirExists::Exists.encode()),
);
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
// We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
// TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
// will be key not found errors if we don't create an empty one for rel_size_v2.
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
)),
);
}
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
} else {
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
}
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
}
self.pending_directory_entries
.push((DirectoryKind::Rel, rel_dir.rels.len()));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
@@ -2037,34 +1896,9 @@ impl DatadirModification<'_> {
let mut dirty = false;
for rel_tag in rel_tags {
let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
dirty = true;
true
} else if self.tline.get_rel_size_v2_enabled() {
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
// logic).
let key =
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
if val == RelDirExists::Exists {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
// put tombstone
self.put(key, Value::Image(RelDirExists::Removed.encode()));
// no need to set dirty to true
true
} else {
false
}
} else {
false
};
if found {
// update logical size
let size_key = rel_size_to_key(rel_tag);
let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -2080,6 +1914,8 @@ impl DatadirModification<'_> {
if dirty {
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
self.pending_directory_entries
.push((DirectoryKind::Rel, dir.rels.len()));
}
}
@@ -2103,10 +1939,8 @@ impl DatadirModification<'_> {
if !dir.segments.insert(segno) {
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
}
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
MetricsUpdate::Set(dir.segments.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
self.put(
dir_key,
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -2153,10 +1987,8 @@ impl DatadirModification<'_> {
if !dir.segments.remove(&segno) {
warn!("slru segment {:?}/{} does not exist", kind, segno);
}
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
MetricsUpdate::Set(dir.segments.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
self.put(
dir_key,
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -2188,10 +2020,8 @@ impl DatadirModification<'_> {
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid: u32 = u32::try_from(xid)?;
@@ -2200,10 +2030,8 @@ impl DatadirModification<'_> {
if !dir.xids.remove(&xid) {
warn!("twophase file for xid {} does not exist", xid);
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
MetricsUpdate::Set(dir.xids.len() as u64),
));
self.pending_directory_entries
.push((DirectoryKind::TwoPhase, dir.xids.len()));
Bytes::from(TwoPhaseDirectory::ser(&dir)?)
};
self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -2319,7 +2147,7 @@ impl DatadirModification<'_> {
}
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
writer.update_directory_entries_count(kind, count);
writer.update_directory_entries_count(kind, count as u64);
}
Ok(())
@@ -2405,7 +2233,7 @@ impl DatadirModification<'_> {
}
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
writer.update_directory_entries_count(kind, count);
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_metadata_bytes = 0;
@@ -2469,22 +2297,6 @@ impl DatadirModification<'_> {
self.tline.get(key, lsn, ctx).await
}
/// Get a key from the sparse keyspace. Automatically converts the missing key error
/// and the empty value into None.
async fn sparse_get(
&self,
key: Key,
ctx: &RequestContext,
) -> Result<Option<Bytes>, PageReconstructError> {
let val = self.get(key, ctx).await;
match val {
Ok(val) if val.is_empty() => Ok(None),
Ok(val) => Ok(Some(val)),
Err(PageReconstructError::MissingKey(_)) => Ok(None),
Err(e) => Err(e),
}
}
fn put(&mut self, key: Key, val: Value) {
if Self::is_data_key(&key) {
self.put_data(key.to_compact(), val)
@@ -2567,23 +2379,6 @@ impl Version<'_> {
}
}
/// Get a key from the sparse keyspace. Automatically converts the missing key error
/// and the empty value into None.
async fn sparse_get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Option<Bytes>, PageReconstructError> {
let val = self.get(timeline, key, ctx).await;
match val {
Ok(val) if val.is_empty() => Ok(None),
Ok(val) => Ok(Some(val)),
Err(PageReconstructError::MissingKey(_)) => Ok(None),
Err(e) => Err(e),
}
}
fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
@@ -2643,7 +2438,6 @@ pub(crate) enum DirectoryKind {
Rel,
AuxFiles,
SlruSegment(SlruKind),
RelV2,
}
impl DirectoryKind {

View File

@@ -40,8 +40,6 @@ use remote_timeline_client::manifest::{
use remote_timeline_client::UploadQueueNotReadyError;
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
use secondary::heatmap::HeatMapTenant;
use secondary::heatmap::HeatMapTimeline;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
@@ -54,10 +52,7 @@ use timeline::compaction::GcCompactionQueue;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::offload::OffloadError;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::PreviousHeatmap;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -265,7 +260,6 @@ struct TimelinePreload {
timeline_id: TimelineId,
client: RemoteTimelineClient,
index_part: Result<MaybeDeletedIndexPart, DownloadError>,
previous_heatmap: Option<PreviousHeatmap>,
}
pub(crate) struct TenantPreload {
@@ -1132,7 +1126,6 @@ impl Tenant {
resources: TimelineResources,
mut index_part: IndexPart,
metadata: TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
cause: LoadTimelineCause,
ctx: &RequestContext,
@@ -1163,7 +1156,6 @@ impl Tenant {
let timeline = self.create_timeline_struct(
timeline_id,
&metadata,
previous_heatmap,
ancestor.clone(),
resources,
CreateTimelineCause::Load,
@@ -1563,18 +1555,8 @@ impl Tenant {
}
}
// TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
// pulled the first heatmap. Not entirely necessary since the storage controller
// will kick the secondary in any case and cause a download.
let maybe_heatmap_at = self.read_on_disk_heatmap().await;
let timelines = self
.load_timelines_metadata(
remote_timeline_ids,
remote_storage,
maybe_heatmap_at,
cancel,
)
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.await?;
Ok(TenantPreload {
@@ -1587,26 +1569,6 @@ impl Tenant {
})
}
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
Err(err) => {
error!("Failed to deserialize old heatmap: {err}");
None
}
},
Err(err) => match err.kind() {
std::io::ErrorKind::NotFound => None,
_ => {
error!("Unexpected IO error reading old heatmap: {err}");
None
}
},
}
}
///
/// Background task that downloads all data for a tenant and brings it to Active state.
///
@@ -1694,10 +1656,7 @@ impl Tenant {
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
remote_index_and_client.insert(
timeline_id,
(index_part, preload.client, preload.previous_heatmap),
);
remote_index_and_client.insert(timeline_id, (index_part, preload.client));
}
MaybeDeletedIndexPart::Deleted(index_part) => {
info!(
@@ -1716,7 +1675,7 @@ impl Tenant {
// layer file.
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
for (timeline_id, remote_metadata) in sorted_timelines {
let (index_part, remote_client, previous_heatmap) = remote_index_and_client
let (index_part, remote_client) = remote_index_and_client
.remove(&timeline_id)
.expect("just put it in above");
@@ -1736,7 +1695,6 @@ impl Tenant {
timeline_id,
index_part,
remote_metadata,
previous_heatmap,
self.get_timeline_resources_for(remote_client),
LoadTimelineCause::Attach,
ctx,
@@ -1886,13 +1844,11 @@ impl Tenant {
}
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
#[allow(clippy::too_many_arguments)]
async fn load_remote_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
resources: TimelineResources,
cause: LoadTimelineCause,
ctx: &RequestContext,
@@ -1922,7 +1878,6 @@ impl Tenant {
resources,
index_part,
remote_metadata,
previous_heatmap,
ancestor,
cause,
ctx,
@@ -1934,29 +1889,14 @@ impl Tenant {
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
heatmap: Option<(HeatMapTenant, std::time::Instant)>,
cancel: CancellationToken,
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let cancel_clone = cancel.clone();
let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
heatmap: h,
read_at: hs.1,
})
});
part_downloads.spawn(
self.load_timeline_metadata(
timeline_id,
remote_storage.clone(),
previous_timeline_heatmap,
cancel_clone,
)
.instrument(info_span!("download_index_part", %timeline_id)),
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
@@ -2004,7 +1944,6 @@ impl Tenant {
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
previous_heatmap: Option<PreviousHeatmap>,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = self.build_timeline_client(timeline_id, remote_storage);
@@ -2020,7 +1959,6 @@ impl Tenant {
client,
timeline_id,
index_part,
previous_heatmap,
}
}
}
@@ -2132,12 +2070,7 @@ impl Tenant {
})?;
let timeline_preload = self
.load_timeline_metadata(
timeline_id,
self.remote_storage.clone(),
None,
cancel.clone(),
)
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
.await;
let index_part = match timeline_preload.index_part {
@@ -2171,7 +2104,6 @@ impl Tenant {
timeline_id,
index_part,
remote_metadata,
None,
timeline_resources,
LoadTimelineCause::Unoffload,
&ctx,
@@ -2887,7 +2819,7 @@ impl Tenant {
};
let metadata = index_part.metadata.clone();
self
.load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
.load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
create_guard: timeline_create_guard, activate, }, &ctx)
.await?
.ready_to_activate()
@@ -2966,194 +2898,150 @@ impl Tenant {
.await
}
/// Performs one compaction iteration. Called periodically from the compaction loop. Returns
/// whether another compaction is needed, if we still have pending work or if we yield for
/// immediate L0 compaction.
/// Perform one compaction iteration.
/// This function is periodically called by compactor task.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
///
/// Compaction can also be explicitly requested for a timeline via the HTTP API.
/// Returns whether we have pending compaction task.
async fn compaction_iteration(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
// Don't compact inactive tenants.
) -> Result<CompactionOutcome, timeline::CompactionError> {
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
if !self.is_active() {
return Ok(CompactionOutcome::Skipped);
return Ok(CompactionOutcome::Done);
}
// Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
// since we need to compact L0 even in AttachedMulti to bound read amplification.
let location = self.tenant_conf.load().location;
if !location.may_upload_layers_hint() {
info!("skipping compaction in location state {location:?}");
return Ok(CompactionOutcome::Skipped);
}
// Don't compact if the circuit breaker is tripped.
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("skipping compaction due to previous failures");
return Ok(CompactionOutcome::Skipped);
}
// Collect all timelines to compact, along with offload instructions and L0 counts.
let mut compact: Vec<Arc<Timeline>> = Vec::new();
let mut offload: HashSet<TimelineId> = HashSet::new();
let mut l0_counts: HashMap<TimelineId, usize> = HashMap::new();
{
let offload_enabled = self.get_timeline_offloading_enabled();
let conf = self.tenant_conf.load();
// Note that compaction usually requires deletions, but we don't respect
// may_delete_layers_hint here: that is because tenants in AttachedMulti
// should proceed with compaction even if they can't do deletion, to avoid
// accumulating dangerously deep stacks of L0 layers. Deletions will be
// enqueued inside RemoteTimelineClient, and executed layer if/when we transition
// to AttachedSingle state.
if !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(CompactionOutcome::Done);
}
}
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines_to_compact_or_offload;
{
let timelines = self.timelines.lock().unwrap();
for (&timeline_id, timeline) in timelines.iter() {
// Skip inactive timelines.
if !timeline.is_active() {
continue;
}
// Schedule the timeline for compaction.
compact.push(timeline.clone());
// Schedule the timeline for offloading if eligible.
let can_offload = offload_enabled
&& timeline.can_offload().0
&& !timelines
.iter()
.any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id));
if can_offload {
offload.insert(timeline_id);
}
}
} // release timelines lock
for timeline in &compact {
// Collect L0 counts. Can't await while holding lock above.
if let Ok(lm) = timeline.layers.read().await.layer_map() {
l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
}
}
// Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to
// bound read amplification.
//
// TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC
// compaction and offloading. We leave that as a potential problem to solve later. Consider
// splitting L0 and image/GC compaction to separate background jobs.
if self.get_compaction_l0_first() {
let compaction_threshold = self.get_compaction_threshold();
let compact_l0 = compact
timelines_to_compact_or_offload = timelines
.iter()
.map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0)))
.filter(|&(_, l0)| l0 >= compaction_threshold)
.sorted_by_key(|&(_, l0)| l0)
.rev()
.map(|(tli, _)| tli.clone())
.collect_vec();
let mut has_pending_l0 = false;
for timeline in compact_l0 {
let outcome = timeline
.compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
.instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
.await
.inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
match outcome {
CompactionOutcome::Done => {}
CompactionOutcome::Skipped => {}
CompactionOutcome::Pending => has_pending_l0 = true,
CompactionOutcome::YieldForL0 => has_pending_l0 = true,
}
}
if has_pending_l0 {
return Ok(CompactionOutcome::YieldForL0); // do another pass
}
.filter_map(|(timeline_id, timeline)| {
let (is_active, (can_offload, _)) =
(timeline.is_active(), timeline.can_offload());
let has_no_unoffloaded_children = {
!timelines
.iter()
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
};
let config_allows_offload = self.conf.timeline_offloading
|| self
.tenant_conf
.load()
.tenant_conf
.timeline_offloading
.unwrap_or_default();
let can_offload =
can_offload && has_no_unoffloaded_children && config_allows_offload;
if (is_active, can_offload) == (false, false) {
None
} else {
Some((*timeline_id, timeline.clone(), (is_active, can_offload)))
}
})
.collect::<Vec<_>>();
drop(timelines);
}
// Pass 2: image compaction and timeline offloading. If any timelines have accumulated
// more L0 layers, they may also be compacted here.
//
// NB: image compaction may yield if there is pending L0 compaction.
//
// TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
// different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
// We leave this for a later PR.
//
// TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
// amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
let mut has_pending = false;
for timeline in compact {
if !timeline.is_active() {
continue;
}
// Before doing any I/O work, check our circuit breaker
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("Skipping compaction due to previous failures");
return Ok(CompactionOutcome::Done);
}
let mut outcome = timeline
.compact(cancel, EnumSet::default(), ctx)
.instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
.await
.inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
let mut has_pending_task = false;
// If we're done compacting, check the scheduled GC compaction queue for more work.
if outcome == CompactionOutcome::Done {
let queue = self
.scheduled_compaction_tasks
.lock()
.unwrap()
.get(&timeline.timeline_id)
.cloned();
if let Some(queue) = queue {
outcome = queue
.iteration(cancel, ctx, &self.gc_block, &timeline)
.await?;
}
}
// If we're done compacting, offload the timeline if requested.
if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) {
pausable_failpoint!("before-timeline-auto-offload");
offload_timeline(self, &timeline)
.instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id))
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
{
// pending_task_left == None: cannot compact, maybe still pending tasks
// pending_task_left == Some(Pending): compaction task left
// pending_task_left == Some(Done): no compaction task left
let pending_task_left = if *can_compact {
let compaction_outcome = timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.or_else(|err| match err {
// Ignore this, we likely raced with unarchival.
OffloadError::NotArchived => Ok(()),
err => Err(err),
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Offload(_) => {
// Failures to offload timelines do not trip the circuit breaker, because
// they do not do lots of writes the way compaction itself does: it is cheap
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
}
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
}
})?;
}
match outcome {
CompactionOutcome::Done => {}
CompactionOutcome::Skipped => {}
CompactionOutcome::Pending => has_pending = true,
// This mostly makes sense when the L0-only pass above is enabled, since there's
// otherwise no guarantee that we'll start with the timeline that has high L0.
CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
if let CompactionOutcome::Pending = compaction_outcome {
Some(CompactionOutcome::Pending)
} else {
let queue = {
let guard = self.scheduled_compaction_tasks.lock().unwrap();
guard.get(timeline_id).cloned()
};
if let Some(queue) = queue {
let outcome = queue
.iteration(cancel, ctx, &self.gc_block, timeline)
.await?;
Some(outcome)
} else {
Some(CompactionOutcome::Done)
}
}
} else {
None
};
has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending);
if pending_task_left == Some(CompactionOutcome::Done) && *can_offload {
pausable_failpoint!("before-timeline-auto-offload");
match offload_timeline(self, timeline)
.instrument(info_span!("offload_timeline", %timeline_id))
.await
{
Err(OffloadError::NotArchived) => {
// Ignore this, we likely raced with unarchival
Ok(())
}
other => other,
}?;
}
}
// Success! Untrip the breaker if necessary.
self.compaction_circuit_breaker
.lock()
.unwrap()
.success(&CIRCUIT_BREAKERS_UNBROKEN);
match has_pending {
true => Ok(CompactionOutcome::Pending),
false => Ok(CompactionOutcome::Done),
}
}
/// Trips the compaction circuit breaker if appropriate.
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
match err {
CompactionError::ShuttingDown => (),
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
// shouldn't block compaction.
CompactionError::Offload(_) => {}
CompactionError::Other(err) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
}
Ok(if has_pending_task {
CompactionOutcome::Pending
} else {
CompactionOutcome::Done
})
}
/// Cancel scheduled compaction tasks
@@ -3924,13 +3812,6 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.rel_size_v2_enabled
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
pub fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
@@ -3938,13 +3819,6 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
}
pub fn get_compaction_l0_first(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_l0_first
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
}
pub fn get_gc_horizon(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
@@ -3999,16 +3873,6 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
}
pub fn get_timeline_offloading_enabled(&self) -> bool {
if self.conf.timeline_offloading {
return true;
}
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.timeline_offloading
.unwrap_or(self.conf.default_tenant_conf.timeline_offloading)
}
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
fn build_tenant_manifest(&self) -> TenantManifest {
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
@@ -4103,7 +3967,6 @@ impl Tenant {
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
cause: CreateTimelineCause,
@@ -4127,7 +3990,6 @@ impl Tenant {
self.conf,
Arc::clone(&self.tenant_conf),
new_metadata,
previous_heatmap,
ancestor,
new_timeline_id,
self.tenant_shard_id,
@@ -4770,24 +4632,24 @@ impl Tenant {
// We check it against both the planned GC cutoff stored in 'gc_info',
// and the 'latest_gc_cutoff' of the last GC that was performed. The
// planned GC cutoff in 'gc_info' is normally larger than
// 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
// 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
// changed the GC settings for the tenant to make the PITR window
// larger, but some of the data was already removed by an earlier GC
// iteration.
// check against last actual 'latest_gc_cutoff' first
let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
{
let gc_info = src_timeline.gc_info.read().unwrap();
let planned_cutoff = gc_info.min_cutoff();
if gc_info.lsn_covered_by_lease(start_lsn) {
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn);
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
} else {
src_timeline
.check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*applied_gc_cutoff_lsn,
*latest_gc_cutoff_lsn,
))
.map_err(CreateTimelineError::AncestorLsn)?;
@@ -4826,7 +4688,7 @@ impl Tenant {
dst_prev,
Some(src_id),
start_lsn,
*src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
src_timeline.pg_version,
);
@@ -5199,7 +5061,6 @@ impl Tenant {
.create_timeline_struct(
new_timeline_id,
new_metadata,
None,
ancestor,
resources,
CreateTimelineCause::Load,
@@ -5617,8 +5478,6 @@ pub(crate) mod harness {
compaction_threshold: Some(tenant_conf.compaction_threshold),
compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
compaction_l0_first: Some(tenant_conf.compaction_l0_first),
compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore),
l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload),
@@ -5647,7 +5506,7 @@ pub(crate) mod harness {
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),
wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled),
rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
gc_compaction_initial_threshold_kb: Some(
tenant_conf.gc_compaction_initial_threshold_kb,
@@ -6206,8 +6065,8 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
match tline.get(*TEST_KEY, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err.to_string().contains("not found at")),
@@ -8503,7 +8362,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -8611,7 +8470,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
@@ -8779,8 +8638,8 @@ mod tests {
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
info!(
"applied_gc_cutoff_lsn: {}",
*timeline.get_applied_gc_cutoff_lsn()
"latest_gc_cutoff_lsn: {}",
*timeline.get_latest_gc_cutoff_lsn()
);
timeline.force_set_disk_consistent_lsn(end_lsn);
@@ -8806,7 +8665,7 @@ mod tests {
// Make lease on a already GC-ed LSN.
// 0/80 does not have a valid lease + is below latest_gc_cutoff
assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn());
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
timeline
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
.expect_err("lease request on GC-ed LSN should fail");
@@ -8997,7 +8856,7 @@ mod tests {
};
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9084,7 +8943,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
@@ -9537,7 +9396,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9684,7 +9543,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x38))
.wait()
@@ -9785,7 +9644,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -10036,7 +9895,7 @@ mod tests {
{
parent_tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x10))
.wait()
@@ -10056,7 +9915,7 @@ mod tests {
{
branch_tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x50))
.wait()
@@ -10412,7 +10271,7 @@ mod tests {
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -10797,7 +10656,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -11048,7 +10907,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()

View File

@@ -285,14 +285,6 @@ pub struct TenantConfOpt {
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_l0_first: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_l0_semaphore: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub l0_flush_delay_threshold: Option<usize>,
@@ -424,12 +416,6 @@ impl TenantConfOpt {
.as_ref()
.unwrap_or(&global_conf.compaction_algorithm)
.clone(),
compaction_l0_first: self
.compaction_l0_first
.unwrap_or(global_conf.compaction_l0_first),
compaction_l0_semaphore: self
.compaction_l0_semaphore
.unwrap_or(global_conf.compaction_l0_semaphore),
l0_flush_delay_threshold: self
.l0_flush_delay_threshold
.or(global_conf.l0_flush_delay_threshold),
@@ -480,14 +466,12 @@ impl TenantConfOpt {
.lsn_lease_length_for_ts
.unwrap_or(global_conf.lsn_lease_length_for_ts),
timeline_offloading: self
.timeline_offloading
.lazy_slru_download
.unwrap_or(global_conf.timeline_offloading),
wal_receiver_protocol_override: self
.wal_receiver_protocol_override
.or(global_conf.wal_receiver_protocol_override),
rel_size_v2_enabled: self
.rel_size_v2_enabled
.unwrap_or(global_conf.rel_size_v2_enabled),
rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
gc_compaction_enabled: self
.gc_compaction_enabled
.unwrap_or(global_conf.gc_compaction_enabled),
@@ -509,8 +493,6 @@ impl TenantConfOpt {
mut compaction_threshold,
mut compaction_upper_limit,
mut compaction_algorithm,
mut compaction_l0_first,
mut compaction_l0_semaphore,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
mut l0_flush_wait_upload,
@@ -556,10 +538,6 @@ impl TenantConfOpt {
.compaction_upper_limit
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.compaction_l0_first.apply(&mut compaction_l0_first);
patch
.compaction_l0_semaphore
.apply(&mut compaction_l0_semaphore);
patch
.l0_flush_delay_threshold
.apply(&mut l0_flush_delay_threshold);
@@ -641,8 +619,6 @@ impl TenantConfOpt {
compaction_threshold,
compaction_upper_limit,
compaction_algorithm,
compaction_l0_first,
compaction_l0_semaphore,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
l0_flush_wait_upload,
@@ -705,8 +681,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
compaction_period: value.compaction_period.map(humantime),
compaction_threshold: value.compaction_threshold,
compaction_upper_limit: value.compaction_upper_limit,
compaction_l0_first: value.compaction_l0_first,
compaction_l0_semaphore: value.compaction_l0_semaphore,
l0_flush_delay_threshold: value.l0_flush_delay_threshold,
l0_flush_stall_threshold: value.l0_flush_stall_threshold,
l0_flush_wait_upload: value.l0_flush_wait_upload,

View File

@@ -130,10 +130,7 @@ struct TimelineMetadataBodyV2 {
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<TimelineId>,
ancestor_lsn: Lsn,
// The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`].
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
}

View File

@@ -2816,8 +2816,8 @@ where
}
use {
crate::tenant::gc_result::GcResult, http_utils::error::ApiError,
pageserver_api::models::TimelineGcRequest,
crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};
#[cfg(test)]

View File

@@ -1,4 +1,4 @@
use std::{collections::HashMap, time::SystemTime};
use std::time::SystemTime;
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
@@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
use utils::{generation::Generation, id::TimelineId};
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTenant {
pub(super) struct HeatMapTenant {
/// Generation of the attached location that uploaded the heatmap: this is not required
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
@@ -25,17 +25,8 @@ pub(crate) struct HeatMapTenant {
pub(super) upload_period_ms: Option<u128>,
}
impl HeatMapTenant {
pub(crate) fn into_timelines_index(self) -> HashMap<TimelineId, HeatMapTimeline> {
self.timelines
.into_iter()
.map(|htl| (htl.timeline_id, htl))
.collect()
}
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(crate) timeline_id: TimelineId,
@@ -44,13 +35,13 @@ pub(crate) struct HeatMapTimeline {
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(crate) name: LayerName,
pub(crate) metadata: LayerFileMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(crate) access_time: SystemTime,
pub(super) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}

View File

@@ -394,7 +394,7 @@ pub(super) async fn gather_inputs(
ancestor_lsn,
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(),
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
next_pitr_cutoff,
retention_param_cutoff,
lease_points,

View File

@@ -136,22 +136,6 @@ pub(crate) fn local_layer_path(
}
}
pub(crate) enum LastEviction {
Never,
At(std::time::Instant),
Evicting,
}
impl LastEviction {
pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
match self {
LastEviction::Never => false,
LastEviction::At(evicted_at) => evicted_at > &timepoint,
LastEviction::Evicting => true,
}
}
}
impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
@@ -369,6 +353,7 @@ impl Layer {
/// while the guard exists.
///
/// Returns None if the layer is currently evicted or becoming evicted.
#[cfg(test)]
pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
@@ -421,17 +406,6 @@ impl Layer {
self.0.metadata()
}
pub(crate) fn last_evicted_at(&self) -> LastEviction {
match self.0.last_evicted_at.try_lock() {
Ok(lock) => match *lock {
None => LastEviction::Never,
Some(at) => LastEviction::At(at),
},
Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
}
}
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
self.0
.timeline
@@ -556,6 +530,7 @@ impl ResidentOrWantedEvicted {
/// This is not used on the read path (anything that calls
/// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
/// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
#[cfg(test)]
fn get(&self) -> Option<Arc<DownloadedLayer>> {
match self {
ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
@@ -683,9 +658,7 @@ struct LayerInner {
/// When the Layer was last evicted but has not been downloaded since.
///
/// This is used for skipping evicted layers from the previous heatmap (see
/// `[Timeline::generate_heatmap]`) and for updating metrics
/// (see [`LayerImplMetrics::redownload_after`]).
/// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
#[cfg(test)]

View File

@@ -4,7 +4,7 @@ use std::cmp::max;
use std::future::Future;
use std::ops::{ControlFlow, RangeInclusive};
use std::pin::pin;
use std::sync::Arc;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use once_cell::sync::Lazy;
@@ -15,7 +15,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::compaction::CompactionOutcome;
@@ -25,6 +25,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
use utils::backoff::exponential_backoff_duration;
use utils::completion::Barrier;
use utils::pausable_failpoint;
use utils::rate_limit::RateLimit;
/// Semaphore limiting concurrent background tasks (across all tenants).
///
@@ -37,17 +38,17 @@ static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
Semaphore::new(permits)
});
/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if
/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled.
/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by
/// default, see `use_compaction_semaphore`.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
///
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less
/// important (e.g. due to page images in delta layers) and can wait for other background tasks.
/// to avoid high read amp during heavy write workloads.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note
/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same
/// thread pool.
static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks.
/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction.
static CONCURRENT_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
@@ -58,7 +59,7 @@ static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
/// Background jobs.
///
/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
/// do any significant IO or CPU work.
/// do any significant IO.
#[derive(
Debug,
PartialEq,
@@ -71,9 +72,6 @@ static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
/// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop. It is
/// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics.
L0Compaction,
Compaction,
Gc,
Eviction,
@@ -93,22 +91,37 @@ pub struct BackgroundLoopSemaphorePermit<'a> {
/// Acquires a semaphore permit, to limit concurrent background jobs.
pub(crate) async fn acquire_concurrency_permit(
loop_kind: BackgroundLoopKind,
use_compaction_semaphore: bool,
_ctx: &RequestContext,
) -> BackgroundLoopSemaphorePermit<'static> {
let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
// TODO: use a lower threshold and remove the pacer once we resolve some blockage.
const WARN_THRESHOLD: Duration = Duration::from_secs(600);
static WARN_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
pausable_failpoint!("initial-size-calculation-permit-pause");
}
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
let semaphore = match loop_kind {
BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS,
_ => &CONCURRENT_BACKGROUND_TASKS,
};
let permit = semaphore.acquire().await.expect("should never close");
let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore {
CONCURRENT_COMPACTION_TASKS.acquire().await
} else {
assert!(!use_compaction_semaphore);
CONCURRENT_BACKGROUND_TASKS.acquire().await
}
.expect("should never close");
recorder.acquired();
let waited = recorder.acquired();
if waited >= WARN_THRESHOLD {
let waited = waited.as_secs_f64();
WARN_PACER
.lock()
.unwrap()
.call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
}
BackgroundLoopSemaphorePermit {
_permit: permit,
@@ -255,12 +268,13 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
match output {
Ok(outcome) => {
error_run = 0;
// If there's more compaction work, L0 or not, schedule an immediate run.
match outcome {
CompactionOutcome::Done => {}
CompactionOutcome::Skipped => {}
CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(),
CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(),
// If there's more compaction work pending, reschedule immediately. This isn't
// necessarily L0 compaction, but that's fine for now.
//
// TODO: differentiate between L0 compaction and other compaction. The former needs
// to be responsive, the latter doesn't.
if outcome == CompactionOutcome::Pending {
tenant.l0_compaction_trigger.notify_one();
}
}
@@ -576,7 +590,7 @@ pub(crate) fn warn_when_period_overrun(
?task,
"task iteration took longer than the configured period"
);
metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
.with_label_values(&[task.into(), &format!("{}", period.as_secs())])
.inc();
}

View File

@@ -117,7 +117,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
@@ -150,15 +150,16 @@ use super::{
config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
MaybeOffloaded,
};
use super::{
debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
storage_layer::ReadableLayer,
};
use super::{secondary::heatmap::HeatMapLayer, GcError};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};
#[cfg(test)]
use pageserver_api::value::Value;
@@ -327,7 +328,6 @@ pub struct Timeline {
// in `crate::page_service` writes these metrics.
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM],
directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
/// Ensures layers aren't frozen by checkpointer between
@@ -352,11 +352,8 @@ pub struct Timeline {
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
// The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which
// we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it.
// Because PITR interval is mutable, it's possible for this LSN to be earlier or later than
// the planned GC cutoff.
pub applied_gc_cutoff_lsn: Rcu<Lsn>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
@@ -465,16 +462,6 @@ pub struct Timeline {
/// If Some, collects GetPage metadata for an ongoing PageTrace.
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
previous_heatmap: ArcSwapOption<PreviousHeatmap>,
}
pub(crate) enum PreviousHeatmap {
Active {
heatmap: HeatMapTimeline,
read_at: std::time::Instant,
},
Obsolete,
}
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -889,12 +876,8 @@ pub(crate) enum CompactFlags {
ForceRepartition,
ForceImageLayerCreation,
ForceL0Compaction,
OnlyL0Compaction,
EnhancedGcBottomMostCompaction,
DryRun,
/// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting
/// compaction via HTTP API.
NoYield,
}
#[serde_with::serde_as]
@@ -1090,15 +1073,9 @@ impl Timeline {
(history, gc_info.within_ancestor_pitr)
}
/// Read timeline's GC cutoff: this is the LSN at which GC has started to happen
pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.applied_gc_cutoff_lsn.read()
}
/// Read timeline's planned GC cutoff: this is the logical end of history that users
/// are allowed to read (based on configured PITR), even if physically we have more history.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
self.gc_info.read().unwrap().cutoffs.time
/// Lock and get timeline's GC cutoff
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
}
/// Look up given page version.
@@ -1576,7 +1553,6 @@ impl Timeline {
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
let mut gc_info = self.gc_info.write().unwrap();
let planned_cutoff = gc_info.min_cutoff();
let valid_until = SystemTime::now() + length;
@@ -1597,7 +1573,7 @@ impl Timeline {
existing_lease.clone()
}
Entry::Vacant(vacant) => {
// Reject already GC-ed LSN if we are in AttachedSingle and
// Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
// not blocked by the lsn lease deadline.
let validate = {
let conf = self.tenant_conf.load();
@@ -1606,12 +1582,9 @@ impl Timeline {
};
if init || validate {
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
if lsn < *latest_gc_cutoff_lsn {
bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
if lsn < planned_cutoff {
bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff);
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
}
@@ -1813,48 +1786,36 @@ impl Timeline {
.await
}
/// Outermost timeline compaction operation; downloads needed layers.
///
/// NB: the cancellation token is usually from a background task, but can also come from a
/// request task.
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
/// compaction tasks.
pub(crate) async fn compact_with_options(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
// Acquire the compaction lock and task semaphore.
//
// L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved
// out by other background tasks (including image compaction). We request this via
// `BackgroundLoopKind::L0Compaction`.
//
// If this is a regular compaction pass, and L0-only compaction is enabled in the config,
// then we should yield for immediate L0 compaction if necessary while we're waiting for the
// background task semaphore. There's no point yielding otherwise, since we'd just end up
// right back here.
let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction);
let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() {
true => BackgroundLoopKind::L0Compaction,
false => BackgroundLoopKind::Compaction,
};
let yield_for_l0 = !is_l0_only
&& self.get_compaction_l0_first()
&& !options.flags.contains(CompactFlags::NoYield);
// most likely the cancellation token is from background task, but in tests it could be the
// request task as well.
let acquire = async move {
let prepare = async move {
let guard = self.compaction_lock.lock().await;
let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await;
let permit = super::tasks::acquire_concurrency_permit(
BackgroundLoopKind::Compaction,
self.conf.use_compaction_semaphore,
ctx,
)
.await;
(guard, permit)
};
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
(guard, permit) = acquire => (guard, permit),
_ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {
return Ok(CompactionOutcome::YieldForL0);
}
_ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
_ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
tuple = prepare => { tuple },
_ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
_ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
};
let last_record_lsn = self.get_last_record_lsn();
@@ -1862,7 +1823,7 @@ impl Timeline {
// Last record Lsn could be zero in case the timeline was just created
if !last_record_lsn.is_valid() {
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
return Ok(CompactionOutcome::Skipped);
return Ok(CompactionOutcome::Done);
}
let result = match self.get_compaction_algorithm_settings().kind {
@@ -2356,14 +2317,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.rel_size_v2_enabled
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2372,20 +2325,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
}
pub fn get_compaction_l0_first(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_l0_first
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
}
pub fn get_compaction_l0_semaphore(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_l0_semaphore
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore)
}
fn get_l0_flush_delay_threshold(&self) -> Option<usize> {
// Disable L0 flushes by default. This and compaction needs further tuning.
const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3
@@ -2586,7 +2525,6 @@ impl Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
metadata: &TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -2673,7 +2611,6 @@ impl Timeline {
),
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
@@ -2688,7 +2625,7 @@ impl Timeline {
LastImageLayerCreationStatus::default(),
)),
applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
current_logical_size: if disk_consistent_lsn.is_valid() {
@@ -2750,8 +2687,6 @@ impl Timeline {
create_idempotency,
page_trace: Default::default(),
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
};
result.repartition_threshold =
@@ -3207,6 +3142,7 @@ impl Timeline {
async move {
let wait_for_permit = super::tasks::acquire_concurrency_permit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
false,
background_ctx,
);
@@ -3440,42 +3376,8 @@ impl Timeline {
}
}
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
// TODO: this directory metrics is not correct -- we could have multiple reldirs in the system
// for each of the database, but we only store one value, and therefore each pgdirmodification
// would overwrite the previous value if they modify different databases.
match count {
MetricsUpdate::Set(count) => {
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed);
}
MetricsUpdate::Add(count) => {
// TODO: these operations are not atomic; but we only have one writer to the metrics, so
// it's fine.
if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
// The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub
// the value reliably.
self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed);
}
// Otherwise, ignore this update
}
MetricsUpdate::Sub(count) => {
// TODO: these operations are not atomic; but we only have one writer to the metrics, so
// it's fine.
if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
// The metrics has been initialized with `MetricsUpdate::Set` before.
// The operation could overflow so we need to normalize the value.
let prev_val =
self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed);
let res = prev_val.saturating_sub(count);
self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed);
}
// Otherwise, ignore this update
}
};
// TODO: remove this, there's no place in the code that updates this aux metrics.
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
let aux_metric =
self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
@@ -3524,52 +3426,12 @@ impl Timeline {
let guard = self.layers.read().await;
// Firstly, if there's any heatmap left over from when this location
// was a secondary, take that into account. Keep layers that are:
// * present in the layer map
// * visible
// * non-resident
// * not evicted since we read the heatmap
//
// Without this, a new cold, attached location would clobber the previous
// heatamp.
let previous_heatmap = self.previous_heatmap.load();
let visible_non_resident = match previous_heatmap.as_deref() {
Some(PreviousHeatmap::Active { heatmap, read_at }) => {
Some(heatmap.layers.iter().filter_map(|hl| {
let desc: PersistentLayerDesc = hl.name.clone().into();
let layer = guard.try_get_from_key(&desc.key())?;
if layer.visibility() == LayerVisibilityHint::Covered {
return None;
}
if layer.is_likely_resident() {
return None;
}
if layer.last_evicted_at().happened_after(*read_at) {
return None;
}
Some((desc, hl.metadata.clone(), hl.access_time))
}))
}
Some(PreviousHeatmap::Obsolete) => None,
None => None,
};
// Secondly, all currently visible, resident layers are included.
let resident = guard.likely_resident_layers().filter_map(|layer| {
match layer.visibility() {
LayerVisibilityHint::Visible => {
// Layer is visible to one or more read LSNs: elegible for inclusion in layer map
let last_activity_ts = layer.latest_activity();
Some((
layer.layer_desc().clone(),
layer.metadata(),
last_activity_ts,
))
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
}
LayerVisibilityHint::Covered => {
// Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -3578,18 +3440,7 @@ impl Timeline {
}
});
let mut layers = match visible_non_resident {
Some(non_resident) => {
let mut non_resident = non_resident.peekable();
if non_resident.peek().is_none() {
self.previous_heatmap
.store(Some(PreviousHeatmap::Obsolete.into()));
}
non_resident.chain(resident).collect::<Vec<_>>()
}
None => resident.collect::<Vec<_>>(),
};
let mut layers = resident.collect::<Vec<_>>();
// Sort layers in order of which to download first. For a large set of layers to download, we
// want to prioritize those layers which are most likely to still be in the resident many minutes
@@ -3693,9 +3544,7 @@ impl Timeline {
// space. If that's not the case, we had at least one key encounter a gap in the image layer
// and stop the search as a result of that.
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
// Do not fire missing key error and end early for sparse keys. Note that we hava already removed
// non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
// figuring out what is the inherited key range and do a fine-grained pruning.
// Do not fire missing key error for sparse keys.
removed.remove_overlapping_with(&KeySpace {
ranges: vec![SPARSE_RANGE],
});
@@ -3780,7 +3629,7 @@ impl Timeline {
// the timeline, then it will remove layers that are required for fulfilling
// the current get request (read-path cannot "look back" and notice the new
// image layer).
let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();
let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();
// See `compaction::compact_with_gc` for why we need this.
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
@@ -4338,7 +4187,6 @@ impl Timeline {
ImageLayerCreationMode::Initial,
ctx,
LastImageLayerCreationStatus::Initial,
false, // don't yield for L0, we're flushing L0
)
.await?;
debug_assert!(
@@ -4467,7 +4315,7 @@ impl Timeline {
let update = crate::tenant::metadata::MetadataUpdate::new(
disk_consistent_lsn,
ondisk_prev_record_lsn,
*self.applied_gc_cutoff_lsn.read(),
*self.latest_gc_cutoff_lsn.read(),
);
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -4911,7 +4759,6 @@ impl Timeline {
mode: ImageLayerCreationMode,
ctx: &RequestContext,
last_status: LastImageLayerCreationStatus,
yield_for_l0: bool,
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
@@ -5108,7 +4955,7 @@ impl Timeline {
if let ImageLayerCreationMode::Try = mode {
// We have at least made some progress
if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 {
if batch_image_writer.pending_layer_num() >= 1 {
// The `Try` mode is currently only used on the compaction path. We want to avoid
// image layer generation taking too long time and blocking L0 compaction. So in this
// mode, we also inspect the current number of L0 layers and skip image layer generation
@@ -5695,7 +5542,7 @@ impl Timeline {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_applied_gc_cutoff_lsn(),
time: *self.get_latest_gc_cutoff_lsn(),
space: space_cutoff,
}
}
@@ -5816,7 +5663,7 @@ impl Timeline {
let mut result: GcResult = GcResult::default();
// Nothing to GC. Return early.
let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
if latest_gc_cutoff >= new_gc_cutoff {
info!(
"Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
@@ -5830,7 +5677,7 @@ impl Timeline {
//
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.applied_gc_cutoff_lsn.lock_for_write();
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
@@ -6770,32 +6617,18 @@ fn is_send() {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use tracing::Instrument;
use utils::{id::TimelineId, lsn::Lsn};
use crate::tenant::{
harness::{test_img, TenantHarness},
layer_map::LayerMap,
storage_layer::{Layer, LayerName, LayerVisibilityHint},
storage_layer::{Layer, LayerName},
timeline::{DeltaLayerTestDesc, EvictionError},
PreviousHeatmap, Timeline,
Timeline,
};
use super::HeatMapTimeline;
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
assert_eq!(lhs.layers.len(), rhs.layers.len());
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
for (l, r) in lhs_rhs {
assert_eq!(l.name, r.name);
assert_eq!(l.metadata, r.metadata);
}
}
#[tokio::test]
async fn test_heatmap_generation() {
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
@@ -6869,7 +6702,7 @@ mod tests {
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
let mut last_lsn = Lsn::MAX;
for layer in &heatmap.layers {
for layer in heatmap.layers {
// Covered layer should be omitted
assert!(layer.name != covered_delta.layer_name());
@@ -6884,144 +6717,6 @@ mod tests {
last_lsn = layer_lsn;
}
}
// Evict all the layers and stash the old heatmap in the timeline.
// This simulates a migration to a cold secondary location.
let guard = timeline.layers.read().await;
let mut all_layers = Vec::new();
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
all_layers.push(layer.clone());
layer.evict_and_wait(forever).await.unwrap();
}
drop(guard);
timeline
.previous_heatmap
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
})));
// Generate a new heatmap and assert that it contains the same layers as the old one.
let post_migration_heatmap = timeline.generate_heatmap().await.unwrap();
assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap);
// Download each layer one by one. Generate the heatmap at each step and check
// that it's stable.
for layer in all_layers {
if layer.visibility() == LayerVisibilityHint::Covered {
continue;
}
eprintln!("Downloading {layer} and re-generating heatmap");
let _resident = layer
.download_and_keep_resident()
.instrument(tracing::info_span!(
parent: None,
"download_layer",
tenant_id = %timeline.tenant_shard_id.tenant_id,
shard_id = %timeline.tenant_shard_id.shard_slug(),
timeline_id = %timeline.timeline_id
))
.await
.unwrap();
let post_download_heatmap = timeline.generate_heatmap().await.unwrap();
assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap);
}
// Everything from the post-migration heatmap is now resident.
// Check that we drop it from memory.
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)
));
}
#[tokio::test]
async fn test_previous_heatmap_obsoletion() {
let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion")
.await
.unwrap();
let l0_delta = DeltaLayerTestDesc::new(
Lsn(0x20)..Lsn(0x30),
Key::from_hex("000000000000000000000000000000000000").unwrap()
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
vec![(
Key::from_hex("720000000033333333444444445500000000").unwrap(),
Lsn(0x25),
Value::Image(test_img("foo")),
)],
);
let image_layer = (
Lsn(0x40),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
test_img("bar"),
)],
);
let delta_layers = vec![l0_delta];
let image_layers = vec![image_layer];
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
&ctx,
delta_layers,
image_layers,
Lsn(0x100),
)
.await
.unwrap();
// Layer visibility is an input to heatmap generation, so refresh it first
timeline.update_layer_visibility().await.unwrap();
let heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
// Both layers should be in the heatmap
assert!(!heatmap.layers.is_empty());
// Now simulate a migration.
timeline
.previous_heatmap
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
})));
// Evict all the layers in the previous heatmap
let guard = timeline.layers.read().await;
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
layer.evict_and_wait(forever).await.unwrap();
}
drop(guard);
// Generate a new heatmap and check that the previous heatmap
// has been marked obsolete.
let post_eviction_heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
assert!(post_eviction_heatmap.layers.is_empty());
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)
));
}
#[tokio::test]

View File

@@ -609,11 +609,6 @@ pub enum CompactionOutcome {
/// Still has pending layers to be compacted after this round. Ideally, the scheduler
/// should immediately schedule another compaction.
Pending,
/// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only
/// guaranteed when `compaction_l0_first` is enabled).
YieldForL0,
/// Compaction was skipped, because the timeline is ineligible for compaction.
Skipped,
}
impl Timeline {
@@ -706,11 +701,10 @@ impl Timeline {
.unwrap_or(self.get_disk_consistent_lsn());
l0_min_lsn.max(self.get_ancestor_lsn())
};
// 1. L0 Compact
let l0_outcome = {
let l0_compaction_outcome = {
let timer = self.metrics.compact_time_histo.start_timer();
let l0_outcome = self
let l0_compaction_outcome = self
.compact_level0(
target_file_size,
options.flags.contains(CompactFlags::ForceL0Compaction),
@@ -718,19 +712,15 @@ impl Timeline {
)
.await?;
timer.stop_and_record();
l0_outcome
l0_compaction_outcome
};
if options.flags.contains(CompactFlags::OnlyL0Compaction) {
return Ok(l0_outcome);
}
// Yield if we have pending L0 compaction. The scheduler will do another pass.
if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
&& !options.flags.contains(CompactFlags::NoYield)
{
info!("image/ancestor compaction yielding for L0 compaction");
return Ok(CompactionOutcome::YieldForL0);
if let CompactionOutcome::Pending = l0_compaction_outcome {
// Yield and do not do any other kind of compaction. True means
// that we have pending L0 compaction tasks and the compaction scheduler
// will prioritize compacting this tenant/timeline again.
info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
return Ok(CompactionOutcome::Pending);
}
if l0_l1_boundary_lsn < self.partitioning.read().1 {
@@ -776,7 +766,6 @@ impl Timeline {
.load()
.as_ref()
.clone(),
!options.flags.contains(CompactFlags::NoYield),
)
.await
.inspect_err(|err| {
@@ -795,7 +784,7 @@ impl Timeline {
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
// Yield and do not do any other kind of compaction.
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
return Ok(CompactionOutcome::YieldForL0);
return Ok(CompactionOutcome::Pending);
}
}
Err(err) => {
@@ -852,7 +841,7 @@ impl Timeline {
//
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
// are rewriting layers.
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
@@ -2202,7 +2191,7 @@ impl Timeline {
// TODO: ensure the child branches will not use anything below the watermark, or consider
// them when computing the watermark.
gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn())
gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn())
}
/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.

Some files were not shown because too many files have changed in this diff Show More