Compare commits

..

1 Commits

Author SHA1 Message Date
Erik Grinaker
9f119d6f64 pageserver: simplify block_deletions handling 2025-02-12 15:03:12 +01:00
79 changed files with 717 additions and 2318 deletions

56
.github/workflows/_push-to-acr.yml vendored Normal file
View File

@@ -0,0 +1,56 @@
name: Push images to ACR
on:
workflow_call:
inputs:
client_id:
description: Client ID of Azure managed identity or Entra app
required: true
type: string
image_tag:
description: Tag for the container image
required: true
type: string
images:
description: Images to push
required: true
type: string
registry_name:
description: Name of the container registry
required: true
type: string
subscription_id:
description: Azure subscription ID
required: true
type: string
tenant_id:
description: Azure tenant ID
required: true
type: string
jobs:
push-to-acr:
runs-on: ubuntu-22.04
permissions:
contents: read # This is required for actions/checkout
id-token: write # This is required for Azure Login to work.
steps:
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ inputs.client_id }}
subscription-id: ${{ inputs.subscription_id }}
tenant-id: ${{ inputs.tenant_id }}
- name: Login to ACR
run: |
az acr login --name=${{ inputs.registry_name }}
- name: Copy docker images to ACR ${{ inputs.registry_name }}
run: |
images='${{ inputs.images }}'
for image in ${images}; do
docker buildx imagetools create \
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
neondatabase/${image}:${{ inputs.image_tag }}
done

View File

@@ -1,101 +0,0 @@
name: Push images to Container Registry
on:
workflow_call:
inputs:
# Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
image-map:
description: JSON map of images, mapping from a source image to an array of target images that should be pushed.
required: true
type: string
aws-region:
description: AWS region to log in to. Required when pushing to ECR.
required: false
type: string
aws-account-ids:
description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR.
required: false
type: string
azure-client-id:
description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR.
required: false
type: string
azure-subscription-id:
description: Azure subscription ID. Required when pushing to ACR.
required: false
type: string
azure-tenant-id:
description: Azure tenant ID. Required when pushing to ACR.
required: false
type: string
acr-registry-name:
description: ACR registry name. Required when pushing to ACR.
required: false
type: string
secrets:
docker-hub-username:
description: Docker Hub username. Required when pushing to Docker Hub.
required: false
docker-hub-password:
description: Docker Hub password. Required when pushing to Docker Hub.
required: false
aws-role-to-assume:
description: AWS role to assume. Required when pushing to ECR.
required: false
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
push-to-container-registry:
runs-on: ubuntu-22.04
permissions:
id-token: write # Required for aws/azure login
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: scripts/push_with_image_map.py
sparse-checkout-cone-mode: false
- name: Print image-map
run: echo '${{ inputs.image-map }}' | jq
- name: Configure AWS credentials
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: "${{ inputs.aws-region }}"
role-to-assume: "${{ secrets.aws-role-to-assume }}"
role-duration-seconds: 3600
- name: Login to ECR
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/amazon-ecr-login@v2
with:
registries: "${{ inputs.aws-account-ids }}"
- name: Configure Azure credentials
if: contains(inputs.image-map, 'azurecr.io/')
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ inputs.azure-client-id }}
subscription-id: ${{ inputs.azure-subscription-id }}
tenant-id: ${{ inputs.azure-tenant-id }}
- name: Login to ACR
if: contains(inputs.image-map, 'azurecr.io/')
run: |
az acr login --name=${{ inputs.acr-registry-name }}
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.docker-hub-username }}
password: ${{ secrets.docker-hub-password }}
- name: Copy docker images to target registries
run: python scripts/push_with_image_map.py
env:
IMAGE_MAP: ${{ inputs.image-map }}

View File

@@ -497,7 +497,7 @@ jobs:
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ]
needs: [ check-permissions, promote-images-dev, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
@@ -571,6 +571,21 @@ jobs:
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Push multi-arch image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, tag ]
permissions:
@@ -617,6 +632,16 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- uses: docker/login-action@v3
with:
registry: cache.neon.build
@@ -704,6 +729,21 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
@@ -836,109 +876,133 @@ jobs:
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
generate-image-maps:
needs: [ tag ]
promote-images-dev:
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
runs-on: ubuntu-22.04
outputs:
neon-dev: ${{ steps.generate.outputs.neon-dev }}
neon-prod: ${{ steps.generate.outputs.neon-prod }}
compute-dev: ${{ steps.generate.outputs.compute-dev }}
compute-prod: ${{ steps.generate.outputs.compute-prod }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
env:
VERSIONS: v14 v15 v16 v17
steps:
- uses: actions/checkout@v4
- uses: docker/login-action@v3
with:
sparse-checkout: scripts/generate_image_maps.py
sparse-checkout-cone-mode: false
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Generate Image Maps
id: generate
run: python scripts/generate_image_maps.py
env:
BUILD_TAG: "${{ needs.tag.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
push-neon-image-dev:
needs: [ generate-image-maps, neon-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
push-compute-image-dev:
needs: [ generate-image-maps, vm-compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Copy vm-compute-node images to ECR
run: |
for version in ${VERSIONS}; do
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
push-neon-image-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, neon-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
promote-images-prod:
needs: [ check-permissions, tag, test-images, promote-images-dev ]
runs-on: ubuntu-22.04
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
push-compute-image-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, vm-compute-node-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read
# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
env:
VERSIONS: v14 v15 v16 v17
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Add latest tag to images
if: github.ref_name == 'main'
run: |
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
- name: Configure AWS-prod credentials
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
mask-aws-account-id: true
role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}
- name: Login to prod ECR
uses: docker/login-action@v3
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
with:
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
- name: Copy all images to prod ECR
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
push-to-acr-dev:
if: github.ref_name == 'main'
needs: [ tag, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
needs: [ tag, promote-images-dev ]
uses: ./.github/workflows/_push-to-acr.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets:
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
push-to-acr-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ tag, promote-images-prod ]
uses: ./.github/workflows/_push-to-acr.yml
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
@@ -1020,7 +1084,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
permissions:
@@ -1273,7 +1337,7 @@ jobs:
done
pin-build-tools-image:
needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ]
needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
@@ -1298,8 +1362,7 @@ jobs:
- check-codestyle-rust
- check-dependencies-rust
- files-changed
- push-compute-image-dev
- push-neon-image-dev
- promote-images-dev
- test-images
- trigger-custom-extensions-build-and-wait
runs-on: ubuntu-22.04
@@ -1316,7 +1379,6 @@ jobs:
|| needs.check-codestyle-python.result == 'skipped'
|| needs.check-codestyle-rust.result == 'skipped'
|| needs.files-changed.result == 'skipped'
|| needs.push-compute-image-dev.result == 'skipped'
|| needs.push-neon-image-dev.result == 'skipped'
|| needs.promote-images-dev.result == 'skipped'
|| needs.test-images.result == 'skipped'
|| needs.trigger-custom-extensions-build-and-wait.result == 'skipped'

View File

@@ -15,14 +15,7 @@ env:
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}
cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04
@@ -36,7 +29,6 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
tag:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}
@@ -76,7 +68,7 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Wait for `push-{neon,compute}-image-dev` job to finish
- name: Wait for `promote-images-dev` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
timeout-minutes: 60
run: |
@@ -87,20 +79,20 @@ jobs:
# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json
if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then
break
fi
jq -c '.[]' jobs.json | while read -r job; do
case $(echo $job | jq .conclusion) in
failure | cancelled | skipped)
echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..."
exit 1
;;
esac
done
echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..."
sleep 60
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..."
exit 1
;;
*)
echo "The 'promote-images-dev' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
done
- name: Set e2e-platforms

14
Cargo.lock generated
View File

@@ -786,7 +786,7 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -815,7 +815,7 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"async-lock",
"async-trait",
@@ -834,7 +834,7 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"RustyXML",
"async-lock",
@@ -852,7 +852,7 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"RustyXML",
"azure_core",
@@ -872,7 +872,7 @@ dependencies = [
[[package]]
name = "azure_svc_blobstorage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
dependencies = [
"azure_core",
"bytes",
@@ -1293,7 +1293,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"jsonwebtoken",
"regex",
"remote_storage",
"serde",
@@ -1321,7 +1320,6 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"notify",
@@ -6466,8 +6464,6 @@ dependencies = [
"routerify",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"safekeeper_api",
"safekeeper_client",
"scoped-futures",
"scopeguard",
"serde",

View File

@@ -10,28 +10,6 @@ ARG STABLE_PG_VERSION=16
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
# Build Postgres
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
WORKDIR /home/nonroot
@@ -50,14 +28,6 @@ RUN set -e \
&& rm -rf pg_install/build \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
# Prepare cargo-chef recipe
FROM $REPOSITORY/$IMAGE:$TAG AS plan
WORKDIR /home/nonroot
COPY --chown=nonroot . .
RUN cargo chef prepare --recipe-path recipe.json
# Build neon binaries
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
@@ -71,15 +41,9 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
COPY --from=plan /home/nonroot/recipe.json recipe.json
ARG ADDITIONAL_RUSTFLAGS=""
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json
COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \
@@ -95,7 +59,7 @@ RUN set -e \
# Build final image
#
FROM $BASE_IMAGE_SHA
FROM debian:${DEBIAN_FLAVOR}
ARG DEFAULT_PG_VERSION
WORKDIR /data
@@ -148,3 +112,4 @@ EXPOSE 6400
EXPOSE 9898
CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]

View File

@@ -1,29 +1,6 @@
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
FROM $BASE_IMAGE_SHA AS pgcopydb_builder
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
@@ -32,7 +9,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
@@ -81,7 +58,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
fi
FROM $BASE_IMAGE_SHA AS build_tools
FROM debian:${DEBIAN_VERSION}-slim AS build_tools
ARG DEBIAN_VERSION
# Add nonroot user
@@ -98,7 +75,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
# System deps
@@ -161,8 +138,7 @@ RUN curl -fsSL \
--output sql_exporter.tar.gz \
&& mkdir /tmp/sql_exporter \
&& tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \
&& rm sql_exporter.tar.gz
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter
# protobuf-compiler (protoc)
ENV PROTOC_VERSION=25.1
@@ -300,7 +276,6 @@ ARG CARGO_HAKARI_VERSION=0.9.33
ARG CARGO_DENY_VERSION=0.16.2
ARG CARGO_HACK_VERSION=0.6.33
ARG CARGO_NEXTEST_VERSION=0.9.85
ARG CARGO_CHEF_VERSION=0.1.71
ARG CARGO_DIESEL_CLI_VERSION=2.2.6
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
@@ -315,7 +290,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \
--features postgres-bundled --no-default-features && \
rm -rf /home/nonroot/.cargo/registry && \

View File

@@ -83,28 +83,7 @@ ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}
ARG ALPINE_CURL_VERSION=8.11.1
# By default, build all PostgreSQL extensions. For quick local testing when you don't
# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal
@@ -115,7 +94,7 @@ ARG EXTENSIONS=all
# Layer "build-deps"
#
#########################################################################################
FROM $BASE_IMAGE_SHA AS build-deps
FROM debian:$DEBIAN_FLAVOR AS build-deps
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
@@ -124,7 +103,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
RUN case $DEBIAN_VERSION in \
@@ -148,7 +127,7 @@ RUN case $DEBIAN_VERSION in \
apt install --no-install-recommends --no-install-suggests -y \
ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
$VERSION_INSTALLS \
&& apt clean && rm -rf /var/lib/apt/lists/*
@@ -160,11 +139,11 @@ RUN case $DEBIAN_VERSION in \
#########################################################################################
FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION:?} postgres
COPY vendor/postgres-${PG_VERSION} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION:?}" != "v14" ]; then \
if [ "${PG_VERSION}" != "v14" ]; then \
# zstd is available only from PG15
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
fi && \
@@ -258,7 +237,7 @@ RUN case "${DEBIAN_VERSION}" in \
# Postgis 3.5.0 supports v17
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export POSTGIS_VERSION=3.5.0 \
export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \
@@ -333,7 +312,7 @@ FROM build-deps AS pgrouting-src
ARG DEBIAN_VERSION
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export PGROUTING_VERSION=3.6.2 \
export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \
@@ -379,7 +358,7 @@ COPY compute/patches/plv8-3.1.10.patch .
#
# Use new version only for v17
# because since v3.2, plv8 doesn't include plcoffee and plls extensions
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export PLV8_TAG=v3.2.3 \
;; \
@@ -393,7 +372,7 @@ RUN case "${PG_VERSION:?}" in \
git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
tar -czf plv8.tar.gz --exclude .git plv8-src && \
cd plv8-src && \
if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
FROM pg-build AS plv8-build
ARG PG_VERSION
@@ -413,7 +392,7 @@ RUN \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
# don't break computes with installed old version of plv8
cd /usr/local/pgsql/lib/ && \
case "${PG_VERSION:?}" in \
case "${PG_VERSION}" in \
"v17") \
ln -s plv8-3.2.3.so plv8-3.1.8.so && \
ln -s plv8-3.2.3.so plv8-3.1.5.so && \
@@ -750,7 +729,7 @@ FROM build-deps AS timescaledb-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export TIMESCALEDB_VERSION=2.10.1 \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
@@ -788,7 +767,7 @@ ARG PG_VERSION
# version-specific, has separate releases for each version
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14") \
export PG_HINT_PLAN_VERSION=14_1_4_1 \
export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
@@ -864,7 +843,7 @@ ARG PG_VERSION
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export RDKIT_VERSION=Release_2024_09_1 \
export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \
@@ -991,7 +970,7 @@ ARG PG_VERSION
#
# last release v0.40.0 - Jul 22, 2024
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
export SEMVER_VERSION=0.40.0 \
export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \
@@ -1027,7 +1006,7 @@ ARG PG_VERSION
# This is our extension, support stopped in favor of pgvector
# TODO: deprecate it
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
@@ -1060,7 +1039,7 @@ ARG PG_VERSION
# This is an experimental extension, never got to real production.
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in "v17") \
RUN case "${PG_VERSION}" in "v17") \
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
@@ -1112,7 +1091,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
FROM pg-build-nonroot-with-cargo AS rust-extensions-build
ARG PG_VERSION
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
'v17') \
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
esac && \
@@ -1291,7 +1270,7 @@ FROM build-deps AS pgx_ulid-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16") \
;; \
*) \
@@ -1323,7 +1302,7 @@ FROM build-deps AS pgx_ulid-pgrx12-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
RUN case "${PG_VERSION}" in \
"v17") \
;; \
*) \
@@ -1451,8 +1430,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
FROM build-deps AS pg_mooncake-src
ARG PG_VERSION
WORKDIR /ext-src
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \
echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \
echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \
chmod a+x neon-test.sh
@@ -1615,7 +1594,7 @@ RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
#
#########################################################################################
FROM $BASE_IMAGE_SHA AS pgbouncer
FROM debian:$DEBIAN_FLAVOR AS pgbouncer
RUN set -e \
&& echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
&& apt update \
@@ -1645,12 +1624,13 @@ RUN set -e \
# Layer "exporters"
#
#########################################################################################
FROM build-deps AS exporters
FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters
ARG TARGETARCH
# Keep sql_exporter version same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py
# See comment on the top of the file regading `echo`, `-e` and `\n`
RUN if [ "$TARGETARCH" = "amd64" ]; then\
RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \
if [ "$TARGETARCH" = "amd64" ]; then\
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
@@ -1674,7 +1654,7 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\
# Layer "awscli"
#
#########################################################################################
FROM build-deps AS awscli
FROM alpine/curl:${ALPINE_CURL_VERSION} AS awscli
ARG TARGETARCH
RUN set -ex; \
if [ "${TARGETARCH}" = "amd64" ]; then \
@@ -1724,7 +1704,7 @@ USER nonroot
COPY --chown=nonroot compute compute
RUN make PG_VERSION="${PG_VERSION:?}" -C compute
RUN make PG_VERSION="${PG_VERSION}" -C compute
#########################################################################################
#
@@ -1750,15 +1730,15 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/
COPY --from=hypopg-src /ext-src/ /ext-src/
COPY --from=pg_hashids-src /ext-src/ /ext-src/
COPY --from=rum-src /ext-src/ /ext-src/
COPY --from=pgtap-src /ext-src/ /ext-src/
#COPY --from=pgtap-src /ext-src/ /ext-src/
COPY --from=ip4r-src /ext-src/ /ext-src/
COPY --from=prefix-src /ext-src/ /ext-src/
COPY --from=hll-src /ext-src/ /ext-src/
COPY --from=plpgsql_check-src /ext-src/ /ext-src/
#COPY --from=timescaledb-src /ext-src/ /ext-src/
COPY --from=pg_hint_plan-src /ext-src/ /ext-src/
COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch
COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
COPY --from=pg_cron-src /ext-src/ /ext-src/
#COPY --from=pgx_ulid-src /ext-src/ /ext-src/
#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/
@@ -1787,7 +1767,7 @@ ENV PGDATABASE=postgres
# Put it all together into the final image
#
#########################################################################################
FROM $BASE_IMAGE_SHA
FROM debian:$DEBIAN_FLAVOR
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early

View File

@@ -74,8 +74,8 @@ build: |
# At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2,
# and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset
# for debian version migration.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder
#
FROM debian:bookworm-slim as libcgroup-builder
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \

View File

@@ -68,8 +68,7 @@ build: |
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
# requires cgroup v2, so we'll build cgroup-tools ourselves.
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder
FROM debian:bullseye-slim as libcgroup-builder
ENV LIBCGROUP_VERSION=v2.0.3
RUN set -exu \

View File

@@ -24,7 +24,6 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
jsonwebtoken.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true

View File

@@ -55,7 +55,7 @@ use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
use url::Url;
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
@@ -281,7 +281,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: false,
});
}
@@ -291,7 +290,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
compute_ctl_config: ComputeCtlConfig::default(),
live_config_allowed: true,
});
}
@@ -301,9 +299,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
};
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
Ok(resp) => Ok(CliSpecParams {
spec: resp.0,
compute_ctl_config: resp.1,
Ok(spec) => Ok(CliSpecParams {
spec,
live_config_allowed: true,
}),
Err(e) => {
@@ -320,8 +317,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
#[allow(dead_code)]
compute_ctl_config: ComputeCtlConfig,
live_config_allowed: bool,
}
@@ -331,7 +326,6 @@ fn wait_spec(
CliSpecParams {
spec,
live_config_allowed,
compute_ctl_config: _,
}: CliSpecParams,
) -> Result<Arc<ComputeNode>> {
let mut new_state = ComputeState::new();

View File

@@ -1400,27 +1400,26 @@ impl ComputeNode {
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;
if !spec.skip_pg_catalog_updates {
let max_concurrent_connections = spec.reconfigure_concurrency;
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:
// creating new extensions, roles, etc.
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
self.pg_reload_conf()?;
let max_concurrent_connections = spec.reconfigure_concurrency;
if spec.mode == ComputeMode::Primary {
let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
conf.application_name("apply_config");
let conf = Arc::new(conf);
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:
// creating new extensions, roles, etc.
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
self.pg_reload_conf()?;
let spec = Arc::new(spec.clone());
if spec.mode == ComputeMode::Primary {
let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
conf.application_name("apply_config");
let conf = Arc::new(conf);
self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
}
let spec = Arc::new(spec.clone());
Ok(())
})?;
}
self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
}
Ok(())
})?;
self.pg_reload_conf()?;

View File

@@ -11,9 +11,7 @@ use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
use compute_api::responses::{
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
};
use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
use compute_api::spec::ComputeSpec;
// Do control plane request and return response if any. In case of error it
@@ -75,13 +73,14 @@ fn do_control_plane_request(
pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
};
let mut attempt = 1;
let mut spec: Result<Option<ComputeSpec>> = Ok(None);
info!("getting spec from control plane: {}", cp_uri);
@@ -91,7 +90,7 @@ pub fn get_spec_from_control_plane(
// - no spec for compute yet (Empty state) -> return Ok(None)
// - got spec -> return Ok(Some(spec))
while attempt < 4 {
let result = match do_control_plane_request(&cp_uri, &jwt) {
spec = match do_control_plane_request(&cp_uri, &jwt) {
Ok(spec_resp) => {
CPLANE_REQUESTS_TOTAL
.with_label_values(&[
@@ -100,10 +99,10 @@ pub fn get_spec_from_control_plane(
])
.inc();
match spec_resp.status {
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
ControlPlaneComputeStatus::Empty => Ok(None),
ControlPlaneComputeStatus::Attached => {
if let Some(spec) = spec_resp.spec {
Ok((Some(spec), spec_resp.compute_ctl_config))
Ok(Some(spec))
} else {
bail!("compute is attached, but spec is empty")
}
@@ -122,10 +121,10 @@ pub fn get_spec_from_control_plane(
}
};
if let Err(e) = &result {
if let Err(e) = &spec {
error!("attempt {} to get spec failed with: {}", attempt, e);
} else {
return result;
return spec;
}
attempt += 1;
@@ -133,9 +132,7 @@ pub fn get_spec_from_control_plane(
}
// All attempts failed, return error.
Err(anyhow::anyhow!(
"Exhausted all attempts to retrieve the spec from the control plane"
))
spec
}
/// Check `pg_hba.conf` and update if needed to allow external connections.

View File

@@ -48,8 +48,6 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
@@ -882,13 +880,10 @@ impl Endpoint {
self.external_http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.body(
serde_json::to_string(&ConfigurationRequest {
spec,
compute_ctl_config: ComputeCtlConfig::default(),
})
.unwrap(),
)
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()
.await?;

View File

@@ -362,11 +362,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'compaction_l0_first' as a bool")?,
compaction_l0_semaphore: settings
.remove("compaction_l0_semaphore")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'compaction_l0_semaphore' as a bool")?,
l0_flush_delay_threshold: settings
.remove("l0_flush_delay_threshold")
.map(|x| x.parse::<usize>())

View File

@@ -838,10 +838,7 @@ impl StorageController {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
node_id,
migration_config: None,
}),
Some(TenantShardMigrateRequest { node_id }),
)
.await
}

View File

@@ -609,10 +609,7 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
node_id: node,
migration_config: None,
};
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -626,10 +623,7 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
node_id: node,
migration_config: None,
};
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -1088,10 +1082,7 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
node_id: mv.to,
migration_config: None,
}),
Some(TenantShardMigrateRequest { node_id: mv.to }),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))

View File

@@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
# We are running tests now
rm -f testout.txt testout_contrib.txt
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
$TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0

View File

@@ -1,15 +0,0 @@
diff --git a/test/schedule/create.sql b/test/schedule/create.sql
index ba355ed..7e250f5 100644
--- a/test/schedule/create.sql
+++ b/test/schedule/create.sql
@@ -1,3 +1,2 @@
\unset ECHO
\i test/psql.sql
-CREATE EXTENSION pgtap;
diff --git a/test/schedule/main.sch b/test/schedule/main.sch
index a8a5fbc..0463fc4 100644
--- a/test/schedule/main.sch
+++ b/test/schedule/main.sch
@@ -1,2 +1 @@
-test: build
test: create

View File

@@ -1,6 +0,0 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --max-connections=86 --schedule test/schedule/main.sch --schedule test/build/run.sch --dbname contrib_regression --use-existing

View File

@@ -41,8 +41,7 @@ EXTENSIONS='[
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"},
{"extname": "pgtap", "extdir": "pgtap-src"}
{"extname": "pgjwt", "extdir": "pgjwt-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d

View File

@@ -7,7 +7,6 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
chrono.workspace = true
jsonwebtoken.workspace = true
serde.workspace = true
serde_json.workspace = true
regex.workspace = true

View File

@@ -1,20 +1,18 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use crate::{
privilege::Privilege,
responses::ComputeCtlConfig,
spec::{ComputeSpec, ExtVersion, PgIdent},
};
use serde::{Deserialize, Serialize};
use serde::Deserialize;
/// Request of the /configure API
///
/// We now pass only `spec` in the configuration request, but later we can
/// extend it and something like `restart: bool` or something else. So put
/// `spec` into a struct initially to be more flexible in the future.
#[derive(Debug, Deserialize, Serialize)]
#[derive(Deserialize, Debug)]
pub struct ConfigurationRequest {
pub spec: ComputeSpec,
pub compute_ctl_config: ComputeCtlConfig,
}
#[derive(Deserialize, Debug)]

View File

@@ -3,7 +3,6 @@
use std::fmt::Display;
use chrono::{DateTime, Utc};
use jsonwebtoken::jwk::JwkSet;
use serde::{Deserialize, Serialize, Serializer};
use crate::{
@@ -136,27 +135,13 @@ pub struct CatalogObjects {
pub databases: Vec<Database>,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct ComputeCtlConfig {
pub jwks: JwkSet,
}
impl Default for ComputeCtlConfig {
fn default() -> Self {
Self {
jwks: JwkSet {
keys: Vec::default(),
},
}
}
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving
/// to a different place.
#[derive(Deserialize, Debug)]
pub struct ControlPlaneSpecResponse {
pub spec: Option<ComputeSpec>,
pub status: ControlPlaneComputeStatus,
pub compute_ctl_config: ComputeCtlConfig,
}
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]

View File

@@ -94,6 +94,7 @@ pub struct ConfigToml {
pub ondemand_download_behavior_treat_error_as_warn: bool,
#[serde(with = "humantime_serde")]
pub background_task_maximum_delay: Duration,
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<reqwest::Url>,
pub control_plane_api_token: Option<String>,
pub control_plane_emergency_mode: bool,
@@ -265,9 +266,6 @@ pub struct TenantConfigToml {
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// If true, compact down L0 across all tenant timelines before doing regular compaction.
pub compaction_l0_first: bool,
/// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
/// has an effect if `compaction_l0_first` is `true`.
pub compaction_l0_semaphore: bool,
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
@@ -476,6 +474,7 @@ impl Default for ConfigToml {
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
use_compaction_semaphore: false,
control_plane_api: (None),
control_plane_api_token: (None),
@@ -549,7 +548,6 @@ pub mod tenant_conf_defaults {
// calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
@@ -600,7 +598,6 @@ impl Default for TenantConfigToml {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
l0_flush_delay_threshold: None,
l0_flush_stall_threshold: None,
l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,

View File

@@ -182,18 +182,6 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
#[serde(default)]
pub migration_config: Option<MigrationConfig>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
#[derive(Serialize, Clone, Debug)]

View File

@@ -466,8 +466,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_first: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_l0_semaphore: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_delay_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_stall_threshold: FieldPatch<usize>,
@@ -534,7 +532,6 @@ pub struct TenantConfig {
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub compaction_l0_first: Option<bool>,
pub compaction_l0_semaphore: Option<bool>,
pub l0_flush_delay_threshold: Option<usize>,
pub l0_flush_stall_threshold: Option<usize>,
pub l0_flush_wait_upload: Option<bool>,
@@ -574,7 +571,6 @@ impl TenantConfig {
mut compaction_upper_limit,
mut compaction_algorithm,
mut compaction_l0_first,
mut compaction_l0_semaphore,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
mut l0_flush_wait_upload,
@@ -615,9 +611,6 @@ impl TenantConfig {
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.compaction_l0_first.apply(&mut compaction_l0_first);
patch
.compaction_l0_semaphore
.apply(&mut compaction_l0_semaphore);
patch
.l0_flush_delay_threshold
.apply(&mut l0_flush_delay_threshold);
@@ -682,7 +675,6 @@ impl TenantConfig {
compaction_upper_limit,
compaction_algorithm,
compaction_l0_first,
compaction_l0_semaphore,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
l0_flush_wait_upload,
@@ -1136,24 +1128,7 @@ pub struct TimelineInfo {
pub ancestor_lsn: Option<Lsn>,
pub last_record_lsn: Lsn,
pub prev_record_lsn: Option<Lsn>,
/// Legacy field for compat with control plane. Synonym of `min_readable_lsn`.
/// TODO: remove once control plane no longer reads it.
pub latest_gc_cutoff_lsn: Lsn,
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
/// as it is easier to reason about.
pub applied_gc_cutoff_lsn: Lsn,
/// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval.
/// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest
/// LSN at which it is legal to create a branch or ephemeral endpoint.
///
/// Note that holders of valid LSN leases may be able to create branches and read pages earlier
/// than this LSN, but new leases may not be taken out earlier than this LSN.
pub min_readable_lsn: Lsn,
pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage

View File

@@ -42,8 +42,8 @@ use utils::lsn::Lsn;
pub enum BasebackupError {
#[error("basebackup pageserver error {0:#}")]
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#} when {1}")]
Client(#[source] io::Error, &'static str),
#[error("basebackup client error {0:#}")]
Client(#[source] io::Error),
}
/// Create basebackup with non-rel data in it.
@@ -234,7 +234,7 @@ where
self.ar
.append(&header, self.buf.as_slice())
.await
.map_err(|e| BasebackupError::Client(e, "flush"))?;
.map_err(BasebackupError::Client)?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -273,9 +273,9 @@ where
for dir in subdirs.iter() {
let header = new_tar_header_dir(dir)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball"))?;
.context("could not add directory to basebackup tarball")?;
}
// Send config files.
@@ -286,13 +286,13 @@ where
self.ar
.append(&header, data)
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?;
.context("could not add config file to basebackup tarball")?;
} else {
let header = new_tar_header(filepath, 0)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?;
.context("could not add config file to basebackup tarball")?;
}
}
if !lazy_slru_download {
@@ -406,7 +406,7 @@ where
self.ar
.append(&header, &*content)
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?;
.context("could not add aux file to basebackup tarball")?;
}
if min_restart_lsn != Lsn::MAX {
@@ -419,7 +419,7 @@ where
self.ar
.append(&header, &data[..])
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?;
.context("could not add restart.lsn file to basebackup tarball")?;
}
for xid in self
.timeline
@@ -451,9 +451,9 @@ where
let crc32 = crc32c::crc32c(&content);
content.extend_from_slice(&crc32.to_le_bytes());
let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
self.ar.append(&header, &*content).await.map_err(|e| {
BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint")
})?;
self.ar.append(&header, &*content).await.context(
"could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
)?;
}
fail_point!("basebackup-before-control-file", |_| {
@@ -464,10 +464,7 @@ where
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
self.ar
.finish()
.await
.map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
debug!("all tarred up!");
Ok(())
}
@@ -485,9 +482,9 @@ where
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?;
.map_err(BasebackupError::Client)?;
return Ok(());
}
@@ -518,7 +515,7 @@ where
self.ar
.append(&header, segment_data.as_slice())
.await
.map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?;
.map_err(BasebackupError::Client)?;
seg += 1;
startblk = endblk;
@@ -569,7 +566,7 @@ where
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?;
.map_err(BasebackupError::Client)?;
info!("timeline.pg_version {}", self.timeline.pg_version);
@@ -579,7 +576,7 @@ where
self.ar
.append(&header, &img[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?;
.map_err(BasebackupError::Client)?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -615,9 +612,9 @@ where
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, io::empty())
.append(&header, &mut io::empty())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;
.map_err(BasebackupError::Client)?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -630,14 +627,14 @@ where
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;
.map_err(BasebackupError::Client)?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?;
.map_err(BasebackupError::Client)?;
}
};
Ok(())
@@ -666,7 +663,7 @@ where
self.ar
.append(&header, &buf[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?;
.map_err(BasebackupError::Client)?;
Ok(())
}
@@ -696,7 +693,7 @@ where
zenith_signal.as_bytes(),
)
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
.map_err(BasebackupError::Client)?;
let checkpoint_bytes = self
.timeline
@@ -721,7 +718,7 @@ where
self.ar
.append(&header, &pg_control_bytes[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?;
.map_err(BasebackupError::Client)?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -745,7 +742,7 @@ where
self.ar
.append(&header, &wal_seg[..])
.await
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?;
.map_err(BasebackupError::Client)?;
Ok(())
}
}

View File

@@ -140,6 +140,10 @@ pub struct PageServerConf {
/// not terrible.
pub background_task_maximum_delay: Duration,
/// If true, use a separate semaphore for compaction tasks instead of the common background task
/// semaphore. Defaults to false.
pub use_compaction_semaphore: bool,
pub control_plane_api: Option<Url>,
/// JWT token for use with the control plane API.
@@ -336,6 +340,7 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_api_token,
control_plane_emergency_mode,
@@ -390,6 +395,7 @@ impl PageServerConf {
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
use_compaction_semaphore,
control_plane_api,
control_plane_emergency_mode,
heatmap_upload_concurrency,

View File

@@ -1080,10 +1080,7 @@ components:
type: integer
state:
type: string
min_readable_lsn:
type: string
format: hex
applied_gc_cutoff_lsn:
latest_gc_cutoff_lsn:
type: string
format: hex

View File

@@ -482,11 +482,6 @@ async fn build_timeline_info_common(
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
let min_readable_lsn = std::cmp::max(
timeline.get_gc_cutoff_lsn(),
*timeline.get_applied_gc_cutoff_lsn(),
);
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
@@ -498,12 +493,7 @@ async fn build_timeline_info_common(
initdb_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
// Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally
// we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we
// actually trimmed data to), which can pass each other when PITR is changed.
latest_gc_cutoff_lsn: min_readable_lsn,
min_readable_lsn,
applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
current_logical_size_is_accurate: match current_logical_size.accuracy() {
tenant::timeline::logical_size::Accuracy::Approximate => false,
@@ -2161,7 +2151,6 @@ async fn timeline_compact_handler(
let state = get_state(&request);
let mut flags = EnumSet::empty();
flags |= CompactFlags::NoYield; // run compaction to completion
if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
flags |= CompactFlags::ForceL0Compaction;

View File

@@ -914,7 +914,7 @@ impl PageServerHandler {
&shard,
req.hdr.request_lsn,
req.hdr.not_modified_since,
&shard.get_applied_gc_cutoff_lsn(),
&shard.get_latest_gc_cutoff_lsn(),
ctx,
)
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
@@ -1810,7 +1810,7 @@ impl PageServerHandler {
req: &PagestreamExistsRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1837,7 +1837,7 @@ impl PageServerHandler {
req: &PagestreamNblocksRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1864,7 +1864,7 @@ impl PageServerHandler {
req: &PagestreamDbSizeRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -1954,7 +1954,7 @@ impl PageServerHandler {
req: &PagestreamGetSlruSegmentRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.hdr.request_lsn,
@@ -2050,8 +2050,7 @@ impl PageServerHandler {
{
fn map_basebackup_error(err: BasebackupError) -> QueryError {
match err {
// TODO: passthrough the error site to the final error message?
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
}
}
@@ -2072,7 +2071,7 @@ impl PageServerHandler {
//return Err(QueryError::NotFound("timeline is archived".into()))
}
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
@@ -2152,12 +2151,10 @@ impl PageServerHandler {
.await
.map_err(map_basebackup_error)?;
}
writer.flush().await.map_err(|e| {
map_basebackup_error(BasebackupError::Client(
e,
"handle_basebackup_request,flush",
))
})?;
writer
.flush()
.await
.map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?;
}
pgb.write_message_noflush(&BeMessage::CopyDone)

View File

@@ -611,7 +611,7 @@ impl Timeline {
) -> Result<LsnForTimestamp, PageReconstructError> {
pausable_failpoint!("find-lsn-for-timestamp-pausable");
let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let gc_cutoff_planned = {
let gc_info = self.gc_info.read().unwrap();
gc_info.min_cutoff()

View File

@@ -40,8 +40,6 @@ use remote_timeline_client::manifest::{
use remote_timeline_client::UploadQueueNotReadyError;
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
use secondary::heatmap::HeatMapTenant;
use secondary::heatmap::HeatMapTimeline;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
@@ -57,7 +55,6 @@ use timeline::offload::OffloadError;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::PreviousHeatmap;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -265,7 +262,6 @@ struct TimelinePreload {
timeline_id: TimelineId,
client: RemoteTimelineClient,
index_part: Result<MaybeDeletedIndexPart, DownloadError>,
previous_heatmap: Option<PreviousHeatmap>,
}
pub(crate) struct TenantPreload {
@@ -283,53 +279,6 @@ pub(crate) enum SpawnMode {
Lazy,
}
/// A notifier that can be used to trigger compaction shared by all timelines of a tenant.
///
/// It is used to notify the compaction loop that there is work to be done. It is also used
/// to balance the load of compaction between timelines. If there are pending L0 compaction in
/// one of the timeline, it could preempt long-running compaction jobs (e.g., image compaction)
/// on other timelines.
pub struct CompactionNotifier {
notify: Notify,
l0_count: std::sync::Mutex<HashMap<TimelineId, usize>>,
}
impl CompactionNotifier {
#[allow(clippy::new_without_default)]
pub fn new() -> Self {
Self {
notify: Notify::new(),
l0_count: std::sync::Mutex::new(HashMap::new()),
}
}
pub fn notify_one(&self) {
self.notify.notify_one();
}
pub fn notified(&self) -> tokio::sync::futures::Notified<'_> {
self.notify.notified()
}
pub fn on_l0_update(&self, timeline_id: TimelineId, l0_count: usize) {
let mut guard = self.l0_count.lock().unwrap();
guard.insert(timeline_id, l0_count);
}
pub fn on_shutdown(&self, timeline_id: TimelineId) {
let mut guard = self.l0_count.lock().unwrap();
guard.remove(&timeline_id);
}
pub fn get_max_l0_count(&self) -> Option<(usize, TimelineId)> {
let guard = self.l0_count.lock().unwrap();
guard
.iter()
.max_by_key(|(_, count)| *count)
.map(|(timeline_id, count)| (*count, *timeline_id))
}
}
///
/// Tenant consists of multiple timelines. Keep them in a hash table.
///
@@ -405,7 +354,7 @@ pub struct Tenant {
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// Signals the tenant compaction loop that there is L0 compaction work to be done.
pub(crate) l0_compaction_trigger: Arc<CompactionNotifier>,
pub(crate) l0_compaction_trigger: Arc<Notify>,
/// Scheduled gc-compaction tasks.
scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
@@ -1179,7 +1128,6 @@ impl Tenant {
resources: TimelineResources,
mut index_part: IndexPart,
metadata: TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
cause: LoadTimelineCause,
ctx: &RequestContext,
@@ -1210,7 +1158,6 @@ impl Tenant {
let timeline = self.create_timeline_struct(
timeline_id,
&metadata,
previous_heatmap,
ancestor.clone(),
resources,
CreateTimelineCause::Load,
@@ -1610,18 +1557,8 @@ impl Tenant {
}
}
// TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
// pulled the first heatmap. Not entirely necessary since the storage controller
// will kick the secondary in any case and cause a download.
let maybe_heatmap_at = self.read_on_disk_heatmap().await;
let timelines = self
.load_timelines_metadata(
remote_timeline_ids,
remote_storage,
maybe_heatmap_at,
cancel,
)
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.await?;
Ok(TenantPreload {
@@ -1634,26 +1571,6 @@ impl Tenant {
})
}
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
Err(err) => {
error!("Failed to deserialize old heatmap: {err}");
None
}
},
Err(err) => match err.kind() {
std::io::ErrorKind::NotFound => None,
_ => {
error!("Unexpected IO error reading old heatmap: {err}");
None
}
},
}
}
///
/// Background task that downloads all data for a tenant and brings it to Active state.
///
@@ -1741,10 +1658,7 @@ impl Tenant {
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
remote_index_and_client.insert(
timeline_id,
(index_part, preload.client, preload.previous_heatmap),
);
remote_index_and_client.insert(timeline_id, (index_part, preload.client));
}
MaybeDeletedIndexPart::Deleted(index_part) => {
info!(
@@ -1763,7 +1677,7 @@ impl Tenant {
// layer file.
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
for (timeline_id, remote_metadata) in sorted_timelines {
let (index_part, remote_client, previous_heatmap) = remote_index_and_client
let (index_part, remote_client) = remote_index_and_client
.remove(&timeline_id)
.expect("just put it in above");
@@ -1783,7 +1697,6 @@ impl Tenant {
timeline_id,
index_part,
remote_metadata,
previous_heatmap,
self.get_timeline_resources_for(remote_client),
LoadTimelineCause::Attach,
ctx,
@@ -1933,13 +1846,11 @@ impl Tenant {
}
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
#[allow(clippy::too_many_arguments)]
async fn load_remote_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
resources: TimelineResources,
cause: LoadTimelineCause,
ctx: &RequestContext,
@@ -1969,7 +1880,6 @@ impl Tenant {
resources,
index_part,
remote_metadata,
previous_heatmap,
ancestor,
cause,
ctx,
@@ -1981,29 +1891,14 @@ impl Tenant {
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
heatmap: Option<(HeatMapTenant, std::time::Instant)>,
cancel: CancellationToken,
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let cancel_clone = cancel.clone();
let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
heatmap: h,
read_at: hs.1,
})
});
part_downloads.spawn(
self.load_timeline_metadata(
timeline_id,
remote_storage.clone(),
previous_timeline_heatmap,
cancel_clone,
)
.instrument(info_span!("download_index_part", %timeline_id)),
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
@@ -2051,7 +1946,6 @@ impl Tenant {
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
previous_heatmap: Option<PreviousHeatmap>,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = self.build_timeline_client(timeline_id, remote_storage);
@@ -2067,7 +1961,6 @@ impl Tenant {
client,
timeline_id,
index_part,
previous_heatmap,
}
}
}
@@ -2179,12 +2072,7 @@ impl Tenant {
})?;
let timeline_preload = self
.load_timeline_metadata(
timeline_id,
self.remote_storage.clone(),
None,
cancel.clone(),
)
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
.await;
let index_part = match timeline_preload.index_part {
@@ -2218,7 +2106,6 @@ impl Tenant {
timeline_id,
index_part,
remote_metadata,
None,
timeline_resources,
LoadTimelineCause::Unoffload,
&ctx,
@@ -2934,7 +2821,7 @@ impl Tenant {
};
let metadata = index_part.metadata.clone();
self
.load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
.load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
create_guard: timeline_create_guard, activate, }, &ctx)
.await?
.ready_to_activate()
@@ -4143,7 +4030,6 @@ impl Tenant {
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
cause: CreateTimelineCause,
@@ -4167,7 +4053,6 @@ impl Tenant {
self.conf,
Arc::clone(&self.tenant_conf),
new_metadata,
previous_heatmap,
ancestor,
new_timeline_id,
self.tenant_shard_id,
@@ -4289,7 +4174,7 @@ impl Tenant {
// use an extremely long backoff.
Some(Duration::from_secs(3600 * 24)),
)),
l0_compaction_trigger: Arc::new(CompactionNotifier::new()),
l0_compaction_trigger: Arc::new(Notify::new()),
scheduled_compaction_tasks: Mutex::new(Default::default()),
activate_now_sem: tokio::sync::Semaphore::new(0),
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
@@ -4810,24 +4695,24 @@ impl Tenant {
// We check it against both the planned GC cutoff stored in 'gc_info',
// and the 'latest_gc_cutoff' of the last GC that was performed. The
// planned GC cutoff in 'gc_info' is normally larger than
// 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
// 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
// changed the GC settings for the tenant to make the PITR window
// larger, but some of the data was already removed by an earlier GC
// iteration.
// check against last actual 'latest_gc_cutoff' first
let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
{
let gc_info = src_timeline.gc_info.read().unwrap();
let planned_cutoff = gc_info.min_cutoff();
if gc_info.lsn_covered_by_lease(start_lsn) {
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn);
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
} else {
src_timeline
.check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*applied_gc_cutoff_lsn,
*latest_gc_cutoff_lsn,
))
.map_err(CreateTimelineError::AncestorLsn)?;
@@ -4866,7 +4751,7 @@ impl Tenant {
dst_prev,
Some(src_id),
start_lsn,
*src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
src_timeline.pg_version,
);
@@ -5239,7 +5124,6 @@ impl Tenant {
.create_timeline_struct(
new_timeline_id,
new_metadata,
None,
ancestor,
resources,
CreateTimelineCause::Load,
@@ -5658,7 +5542,6 @@ pub(crate) mod harness {
compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
compaction_l0_first: Some(tenant_conf.compaction_l0_first),
compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore),
l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload),
@@ -6246,8 +6129,8 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
match tline.get(*TEST_KEY, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err.to_string().contains("not found at")),
@@ -8543,7 +8426,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -8651,7 +8534,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
@@ -8819,8 +8702,8 @@ mod tests {
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
info!(
"applied_gc_cutoff_lsn: {}",
*timeline.get_applied_gc_cutoff_lsn()
"latest_gc_cutoff_lsn: {}",
*timeline.get_latest_gc_cutoff_lsn()
);
timeline.force_set_disk_consistent_lsn(end_lsn);
@@ -8846,7 +8729,7 @@ mod tests {
// Make lease on a already GC-ed LSN.
// 0/80 does not have a valid lease + is below latest_gc_cutoff
assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn());
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
timeline
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
.expect_err("lease request on GC-ed LSN should fail");
@@ -9037,7 +8920,7 @@ mod tests {
};
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9124,7 +9007,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
@@ -9577,7 +9460,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -9724,7 +9607,7 @@ mod tests {
// increase GC horizon and compact again
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x38))
.wait()
@@ -9825,7 +9708,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -10076,7 +9959,7 @@ mod tests {
{
parent_tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x10))
.wait()
@@ -10096,7 +9979,7 @@ mod tests {
{
branch_tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x50))
.wait()
@@ -10452,7 +10335,7 @@ mod tests {
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -10837,7 +10720,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
@@ -11088,7 +10971,7 @@ mod tests {
.await?;
{
tline
.applied_gc_cutoff_lsn
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()

View File

@@ -289,10 +289,6 @@ pub struct TenantConfOpt {
#[serde(default)]
pub compaction_l0_first: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_l0_semaphore: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub l0_flush_delay_threshold: Option<usize>,
@@ -427,9 +423,6 @@ impl TenantConfOpt {
compaction_l0_first: self
.compaction_l0_first
.unwrap_or(global_conf.compaction_l0_first),
compaction_l0_semaphore: self
.compaction_l0_semaphore
.unwrap_or(global_conf.compaction_l0_semaphore),
l0_flush_delay_threshold: self
.l0_flush_delay_threshold
.or(global_conf.l0_flush_delay_threshold),
@@ -508,7 +501,6 @@ impl TenantConfOpt {
mut compaction_upper_limit,
mut compaction_algorithm,
mut compaction_l0_first,
mut compaction_l0_semaphore,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
mut l0_flush_wait_upload,
@@ -555,9 +547,6 @@ impl TenantConfOpt {
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.compaction_l0_first.apply(&mut compaction_l0_first);
patch
.compaction_l0_semaphore
.apply(&mut compaction_l0_semaphore);
patch
.l0_flush_delay_threshold
.apply(&mut l0_flush_delay_threshold);
@@ -640,7 +629,6 @@ impl TenantConfOpt {
compaction_upper_limit,
compaction_algorithm,
compaction_l0_first,
compaction_l0_semaphore,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
l0_flush_wait_upload,
@@ -704,7 +692,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
compaction_threshold: value.compaction_threshold,
compaction_upper_limit: value.compaction_upper_limit,
compaction_l0_first: value.compaction_l0_first,
compaction_l0_semaphore: value.compaction_l0_semaphore,
l0_flush_delay_threshold: value.l0_flush_delay_threshold,
l0_flush_stall_threshold: value.l0_flush_stall_threshold,
l0_flush_wait_upload: value.l0_flush_wait_upload,

View File

@@ -130,10 +130,7 @@ struct TimelineMetadataBodyV2 {
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<TimelineId>,
ancestor_lsn: Lsn,
// The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`].
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
}

View File

@@ -195,7 +195,7 @@ use utils::backoff::{
use utils::pausable_failpoint;
use utils::shard::ShardNumber;
use std::collections::{HashMap, HashSet, VecDeque};
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex, OnceLock};
use std::time::Duration;
@@ -219,7 +219,7 @@ use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::download::download_retry;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable};
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
config::PageServerConf,
@@ -499,37 +499,21 @@ impl RemoteTimelineClient {
/// Notify this client of a change to its parent tenant's config, as this may cause us to
/// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle)
pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) {
pub(super) fn update_config(self: &Arc<Self>, location_conf: &AttachedLocationConfig) {
let new_conf = RemoteTimelineClientConfig::from(location_conf);
let unblocked = !new_conf.block_deletions;
// Update config before draining deletions, so that we don't race with more being
// inserted. This can result in deletions happening our of order, but that does not
// violate any invariants: deletions only need to be ordered relative to upload of the index
// that dereferences the deleted objects, and we are not changing that order.
let mut block_deletions_below = Generation::None;
if new_conf.block_deletions {
block_deletions_below = self.generation;
}
*self.config.write().unwrap() = new_conf;
if unblocked {
// If we may now delete layers, drain any that were blocked in our old
// configuration state
let mut queue_locked = self.upload_queue.lock().unwrap();
if let Ok(queue) = queue_locked.initialized_mut() {
let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
for d in blocked_deletions {
if let Err(e) = self.deletion_queue_client.push_layers(
self.tenant_shard_id,
self.timeline_id,
self.generation,
d.layers,
) {
// This could happen if the pageserver is shut down while a tenant
// is transitioning from a deletion-blocked state: we will leak some
// S3 objects in this case.
warn!("Failed to drain blocked deletions: {}", e);
break;
}
}
// If the conf change may unblock any deletions, try to launch them.
if let Ok(queue) = self.upload_queue.lock().unwrap().initialized_mut() {
if block_deletions_below != queue.block_deletions_below {
queue.block_deletions_below = block_deletions_below;
self.launch_queued_tasks(queue);
}
}
}
@@ -1185,7 +1169,7 @@ impl RemoteTimelineClient {
"scheduled layer file upload {layer}",
);
let op = UploadOp::UploadLayer(layer, metadata, None);
let op = UploadOp::UploadLayer(layer, metadata);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
@@ -1411,13 +1395,6 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue);
Ok(())
}
fn schedule_barrier0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
@@ -1892,31 +1869,16 @@ impl RemoteTimelineClient {
while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
debug!("starting op: {next_op}");
// Prepare upload.
// Handle barriers and shutdowns.
match &mut next_op {
UploadOp::UploadLayer(layer, meta, mode) => {
if upload_queue
.recently_deleted
.remove(&(layer.layer_desc().layer_name().clone(), meta.generation))
{
*mode = Some(OpType::FlushDeletion);
} else {
*mode = Some(OpType::MayReorder)
}
}
UploadOp::UploadMetadata { .. } => {}
UploadOp::Delete(Delete { layers }) => {
for (name, meta) in layers {
upload_queue
.recently_deleted
.insert((name.clone(), meta.generation));
}
}
UploadOp::Barrier(sender) => {
sender.send_replace(());
continue;
}
UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"),
UploadOp::UploadLayer(..) => {}
UploadOp::UploadMetadata { .. } => {}
UploadOp::Delete(Delete { .. }) => {}
};
// Assign unique ID to this task
@@ -1988,7 +1950,7 @@ impl RemoteTimelineClient {
// Assert that we don't modify a layer that's referenced by the current index.
if cfg!(debug_assertions) {
let modified = match &task.op {
UploadOp::UploadLayer(layer, layer_metadata, _) => {
UploadOp::UploadLayer(layer, layer_metadata) => {
vec![(layer.layer_desc().layer_name(), layer_metadata)]
}
UploadOp::Delete(delete) => {
@@ -2010,68 +1972,7 @@ impl RemoteTimelineClient {
}
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
// TODO: check if this mechanism can be removed now that can_bypass() performs
// conflict checks during scheduling.
if let Some(OpType::FlushDeletion) = mode {
if self.config.read().unwrap().block_deletions {
// Of course, this is not efficient... but usually the queue should be empty.
let mut queue_locked = self.upload_queue.lock().unwrap();
let mut detected = false;
if let Ok(queue) = queue_locked.initialized_mut() {
for list in queue.blocked_deletions.iter_mut() {
list.layers.retain(|(name, meta)| {
if name == &layer.layer_desc().layer_name()
&& meta.generation == layer_metadata.generation
{
detected = true;
// remove the layer from deletion queue
false
} else {
// keep the layer
true
}
});
}
}
if detected {
info!(
"cancelled blocked deletion of layer {} at gen {:?}",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
}
} else {
// TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions
// that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted,
// which is not possible in the current system.
info!(
"waiting for deletion queue flush to complete before uploading layer {} at gen {:?}",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
{
// We are going to flush, we can clean up the recently deleted list.
let mut queue_locked = self.upload_queue.lock().unwrap();
if let Ok(queue) = queue_locked.initialized_mut() {
queue.recently_deleted.clear();
}
}
if let Err(e) = self.deletion_queue_client.flush_execute().await {
warn!(
"failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
} else {
info!(
"done flushing deletion queue before uploading layer {} at gen {:?}",
layer.layer_desc().layer_name(),
layer_metadata.generation
);
}
}
}
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let local_path = layer.local_path();
// We should only be uploading layers created by this `Tenant`'s lifetime, so
@@ -2136,23 +2037,17 @@ impl RemoteTimelineClient {
res
}
UploadOp::Delete(delete) => {
if self.config.read().unwrap().block_deletions {
let mut queue_locked = self.upload_queue.lock().unwrap();
if let Ok(queue) = queue_locked.initialized_mut() {
queue.blocked_deletions.push(delete.clone());
}
Ok(())
} else {
pausable_failpoint!("before-delete-layer-pausable");
self.deletion_queue_client
.push_layers(
self.tenant_shard_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.map_err(|e| anyhow::anyhow!(e))
}
pausable_failpoint!("before-delete-layer-pausable");
// TODO: these can't go through the deletion queue, since it's asynchronous and
// may clobber a later write to this later.
self.deletion_queue_client
.push_layers(
self.tenant_shard_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.map_err(|e| anyhow::anyhow!(e))
}
unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
// unreachable. Barrier operations are handled synchronously in
@@ -2234,7 +2129,7 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
let lsn_update = match task.op {
UploadOp::UploadLayer(_, _, _) => None,
UploadOp::UploadLayer(_, _) => None,
UploadOp::UploadMetadata { ref uploaded } => {
// the task id is reused as a monotonicity check for storing the "clean"
// IndexPart.
@@ -2309,7 +2204,7 @@ impl RemoteTimelineClient {
)> {
use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
let res = match op {
UploadOp::UploadLayer(_, m, _) => (
UploadOp::UploadLayer(_, m) => (
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
@@ -2399,12 +2294,11 @@ impl RemoteTimelineClient {
.clone(),
inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(),
block_deletions_below: Generation::None,
#[cfg(feature = "testing")]
dangling_files: HashMap::default(),
blocked_deletions: Vec::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
recently_deleted: HashSet::new(),
};
let upload_queue = std::mem::replace(

View File

@@ -1,4 +1,4 @@
use std::{collections::HashMap, time::SystemTime};
use std::time::SystemTime;
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
@@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
use utils::{generation::Generation, id::TimelineId};
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTenant {
pub(super) struct HeatMapTenant {
/// Generation of the attached location that uploaded the heatmap: this is not required
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
@@ -25,17 +25,8 @@ pub(crate) struct HeatMapTenant {
pub(super) upload_period_ms: Option<u128>,
}
impl HeatMapTenant {
pub(crate) fn into_timelines_index(self) -> HashMap<TimelineId, HeatMapTimeline> {
self.timelines
.into_iter()
.map(|htl| (htl.timeline_id, htl))
.collect()
}
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(crate) timeline_id: TimelineId,
@@ -44,13 +35,13 @@ pub(crate) struct HeatMapTimeline {
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(crate) name: LayerName,
pub(crate) metadata: LayerFileMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(crate) access_time: SystemTime,
pub(super) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}

View File

@@ -394,7 +394,7 @@ pub(super) async fn gather_inputs(
ancestor_lsn,
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(),
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
next_pitr_cutoff,
retention_param_cutoff,
lease_points,

View File

@@ -136,22 +136,6 @@ pub(crate) fn local_layer_path(
}
}
pub(crate) enum LastEviction {
Never,
At(std::time::Instant),
Evicting,
}
impl LastEviction {
pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
match self {
LastEviction::Never => false,
LastEviction::At(evicted_at) => evicted_at > &timepoint,
LastEviction::Evicting => true,
}
}
}
impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
@@ -421,17 +405,6 @@ impl Layer {
self.0.metadata()
}
pub(crate) fn last_evicted_at(&self) -> LastEviction {
match self.0.last_evicted_at.try_lock() {
Ok(lock) => match *lock {
None => LastEviction::Never,
Some(at) => LastEviction::At(at),
},
Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
}
}
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
self.0
.timeline
@@ -683,9 +656,7 @@ struct LayerInner {
/// When the Layer was last evicted but has not been downloaded since.
///
/// This is used for skipping evicted layers from the previous heatmap (see
/// `[Timeline::generate_heatmap]`) and for updating metrics
/// (see [`LayerImplMetrics::redownload_after`]).
/// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
#[cfg(test)]

View File

@@ -4,7 +4,7 @@ use std::cmp::max;
use std::future::Future;
use std::ops::{ControlFlow, RangeInclusive};
use std::pin::pin;
use std::sync::Arc;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use once_cell::sync::Lazy;
@@ -15,7 +15,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::compaction::CompactionOutcome;
@@ -25,6 +25,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
use utils::backoff::exponential_backoff_duration;
use utils::completion::Barrier;
use utils::pausable_failpoint;
use utils::rate_limit::RateLimit;
/// Semaphore limiting concurrent background tasks (across all tenants).
///
@@ -37,17 +38,17 @@ static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
Semaphore::new(permits)
});
/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if
/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled.
/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by
/// default, see `use_compaction_semaphore`.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
///
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less
/// important (e.g. due to page images in delta layers) and can wait for other background tasks.
/// to avoid high read amp during heavy write workloads.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note
/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same
/// thread pool.
static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks.
/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction.
static CONCURRENT_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
@@ -58,7 +59,7 @@ static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
/// Background jobs.
///
/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
/// do any significant IO or CPU work.
/// do any significant IO.
#[derive(
Debug,
PartialEq,
@@ -71,9 +72,6 @@ static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
/// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop. It is
/// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics.
L0Compaction,
Compaction,
Gc,
Eviction,
@@ -93,22 +91,37 @@ pub struct BackgroundLoopSemaphorePermit<'a> {
/// Acquires a semaphore permit, to limit concurrent background jobs.
pub(crate) async fn acquire_concurrency_permit(
loop_kind: BackgroundLoopKind,
use_compaction_semaphore: bool,
_ctx: &RequestContext,
) -> BackgroundLoopSemaphorePermit<'static> {
let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
// TODO: use a lower threshold and remove the pacer once we resolve some blockage.
const WARN_THRESHOLD: Duration = Duration::from_secs(600);
static WARN_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
pausable_failpoint!("initial-size-calculation-permit-pause");
}
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
let semaphore = match loop_kind {
BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS,
_ => &CONCURRENT_BACKGROUND_TASKS,
};
let permit = semaphore.acquire().await.expect("should never close");
let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore {
CONCURRENT_COMPACTION_TASKS.acquire().await
} else {
assert!(!use_compaction_semaphore);
CONCURRENT_BACKGROUND_TASKS.acquire().await
}
.expect("should never close");
recorder.acquired();
let waited = recorder.acquired();
if waited >= WARN_THRESHOLD {
let waited = waited.as_secs_f64();
WARN_PACER
.lock()
.unwrap()
.call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
}
BackgroundLoopSemaphorePermit {
_permit: permit,
@@ -576,7 +589,7 @@ pub(crate) fn warn_when_period_overrun(
?task,
"task iteration took longer than the configured period"
);
metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
.with_label_values(&[task.into(), &format!("{}", period.as_secs())])
.inc();
}

View File

@@ -47,7 +47,7 @@ use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::runtime::Handle;
use tokio::sync::mpsc::Sender;
use tokio::sync::{oneshot, watch};
use tokio::sync::{oneshot, watch, Notify};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::critical;
@@ -150,16 +150,16 @@ use super::{
config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
MaybeOffloaded,
};
use super::{
debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, CompactionNotifier,
HeatMapTimeline,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
storage_layer::ReadableLayer,
};
use super::{secondary::heatmap::HeatMapLayer, GcError};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};
#[cfg(test)]
use pageserver_api::value::Value;
@@ -225,7 +225,7 @@ pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
pub l0_compaction_trigger: Arc<CompactionNotifier>,
pub l0_compaction_trigger: Arc<Notify>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
@@ -352,11 +352,8 @@ pub struct Timeline {
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
// The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which
// we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it.
// Because PITR interval is mutable, it's possible for this LSN to be earlier or later than
// the planned GC cutoff.
pub applied_gc_cutoff_lsn: Rcu<Lsn>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
@@ -429,7 +426,7 @@ pub struct Timeline {
compaction_failed: AtomicBool,
/// Notifies the tenant compaction loop that there is pending L0 compaction work.
l0_compaction_trigger: Arc<CompactionNotifier>,
l0_compaction_trigger: Arc<Notify>,
/// Make sure we only have one running gc at a time.
///
@@ -465,16 +462,6 @@ pub struct Timeline {
/// If Some, collects GetPage metadata for an ongoing PageTrace.
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
previous_heatmap: ArcSwapOption<PreviousHeatmap>,
}
pub(crate) enum PreviousHeatmap {
Active {
heatmap: HeatMapTimeline,
read_at: std::time::Instant,
},
Obsolete,
}
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -892,9 +879,6 @@ pub(crate) enum CompactFlags {
OnlyL0Compaction,
EnhancedGcBottomMostCompaction,
DryRun,
/// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting
/// compaction via HTTP API.
NoYield,
}
#[serde_with::serde_as]
@@ -1090,15 +1074,9 @@ impl Timeline {
(history, gc_info.within_ancestor_pitr)
}
/// Read timeline's GC cutoff: this is the LSN at which GC has started to happen
pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.applied_gc_cutoff_lsn.read()
}
/// Read timeline's planned GC cutoff: this is the logical end of history that users
/// are allowed to read (based on configured PITR), even if physically we have more history.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
self.gc_info.read().unwrap().cutoffs.time
/// Lock and get timeline's GC cutoff
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
}
/// Look up given page version.
@@ -1576,7 +1554,6 @@ impl Timeline {
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
let mut gc_info = self.gc_info.write().unwrap();
let planned_cutoff = gc_info.min_cutoff();
let valid_until = SystemTime::now() + length;
@@ -1597,7 +1574,7 @@ impl Timeline {
existing_lease.clone()
}
Entry::Vacant(vacant) => {
// Reject already GC-ed LSN if we are in AttachedSingle and
// Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
// not blocked by the lsn lease deadline.
let validate = {
let conf = self.tenant_conf.load();
@@ -1606,12 +1583,9 @@ impl Timeline {
};
if init || validate {
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
if lsn < *latest_gc_cutoff_lsn {
bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
if lsn < planned_cutoff {
bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff);
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
}
@@ -1813,46 +1787,34 @@ impl Timeline {
.await
}
/// Outermost timeline compaction operation; downloads needed layers.
///
/// NB: the cancellation token is usually from a background task, but can also come from a
/// request task.
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
/// compaction tasks.
pub(crate) async fn compact_with_options(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
// Acquire the compaction lock and task semaphore.
//
// L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved
// out by other background tasks (including image compaction). We request this via
// `BackgroundLoopKind::L0Compaction`.
//
// If this is a regular compaction pass, and L0-only compaction is enabled in the config,
// then we should yield for immediate L0 compaction if necessary while we're waiting for the
// background task semaphore. There's no point yielding otherwise, since we'd just end up
// right back here.
let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction);
let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() {
true => BackgroundLoopKind::L0Compaction,
false => BackgroundLoopKind::Compaction,
};
let yield_for_l0 = !is_l0_only
&& self.get_compaction_l0_first()
&& !options.flags.contains(CompactFlags::NoYield);
// most likely the cancellation token is from background task, but in tests it could be the
// request task as well.
let acquire = async move {
let prepare = async move {
let guard = self.compaction_lock.lock().await;
let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await;
let permit = super::tasks::acquire_concurrency_permit(
BackgroundLoopKind::Compaction,
self.conf.use_compaction_semaphore,
ctx,
)
.await;
(guard, permit)
};
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
(guard, permit) = acquire => (guard, permit),
_ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {
return Ok(CompactionOutcome::YieldForL0);
}
tuple = prepare => { tuple },
_ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
_ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
};
@@ -1966,8 +1928,6 @@ impl Timeline {
// ... and inform any waiters for newer LSNs that there won't be any.
self.last_record_lsn.shutdown();
self.l0_compaction_trigger.on_shutdown(self.timeline_id);
if let ShutdownMode::FreezeAndFlush = mode {
let do_flush = if let Some((open, frozen)) = self
.layers
@@ -2366,20 +2326,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
}
pub fn get_compaction_l0_first(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_l0_first
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
}
pub fn get_compaction_l0_semaphore(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_l0_semaphore
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore)
}
fn get_l0_flush_delay_threshold(&self) -> Option<usize> {
// Disable L0 flushes by default. This and compaction needs further tuning.
const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3
@@ -2580,7 +2526,6 @@ impl Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
metadata: &TimelineMetadata,
previous_heatmap: Option<PreviousHeatmap>,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -2681,7 +2626,7 @@ impl Timeline {
LastImageLayerCreationStatus::default(),
)),
applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
current_logical_size: if disk_consistent_lsn.is_valid() {
@@ -2743,8 +2688,6 @@ impl Timeline {
create_idempotency,
page_trace: Default::default(),
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
};
result.repartition_threshold =
@@ -3008,31 +2951,6 @@ impl Timeline {
.schedule_layer_file_deletion(&needs_cleanup)?;
self.remote_client
.schedule_index_upload_for_file_changes()?;
// This barrier orders above DELETEs before any later operations.
// This is critical because code executing after the barrier might
// create again objects with the same key that we just scheduled for deletion.
// For example, if we just scheduled deletion of an image layer "from the future",
// later compaction might run again and re-create the same image layer.
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
// "same" here means same key range and LSN.
//
// Without a barrier between above DELETEs and the re-creation's PUTs,
// the upload queue may execute the PUT first, then the DELETE.
// In our example, we will end up with an IndexPart referencing a non-existent object.
//
// 1. a future image layer is created and uploaded
// 2. ps restart
// 3. the future layer from (1) is deleted during load layer map
// 4. image layer is re-created and uploaded
// 5. deletion queue would like to delete (1) but actually deletes (4)
// 6. delete by name works as expected, but it now deletes the wrong (later) version
//
// See https://github.com/neondatabase/neon/issues/5878
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
// TODO: this is basically a no-op now, should we remove it?
self.remote_client.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -3200,6 +3118,7 @@ impl Timeline {
async move {
let wait_for_permit = super::tasks::acquire_concurrency_permit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
false,
background_ctx,
);
@@ -3483,52 +3402,12 @@ impl Timeline {
let guard = self.layers.read().await;
// Firstly, if there's any heatmap left over from when this location
// was a secondary, take that into account. Keep layers that are:
// * present in the layer map
// * visible
// * non-resident
// * not evicted since we read the heatmap
//
// Without this, a new cold, attached location would clobber the previous
// heatamp.
let previous_heatmap = self.previous_heatmap.load();
let visible_non_resident = match previous_heatmap.as_deref() {
Some(PreviousHeatmap::Active { heatmap, read_at }) => {
Some(heatmap.layers.iter().filter_map(|hl| {
let desc: PersistentLayerDesc = hl.name.clone().into();
let layer = guard.try_get_from_key(&desc.key())?;
if layer.visibility() == LayerVisibilityHint::Covered {
return None;
}
if layer.is_likely_resident() {
return None;
}
if layer.last_evicted_at().happened_after(*read_at) {
return None;
}
Some((desc, hl.metadata.clone(), hl.access_time))
}))
}
Some(PreviousHeatmap::Obsolete) => None,
None => None,
};
// Secondly, all currently visible, resident layers are included.
let resident = guard.likely_resident_layers().filter_map(|layer| {
match layer.visibility() {
LayerVisibilityHint::Visible => {
// Layer is visible to one or more read LSNs: elegible for inclusion in layer map
let last_activity_ts = layer.latest_activity();
Some((
layer.layer_desc().clone(),
layer.metadata(),
last_activity_ts,
))
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
}
LayerVisibilityHint::Covered => {
// Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -3537,18 +3416,7 @@ impl Timeline {
}
});
let mut layers = match visible_non_resident {
Some(non_resident) => {
let mut non_resident = non_resident.peekable();
if non_resident.peek().is_none() {
self.previous_heatmap
.store(Some(PreviousHeatmap::Obsolete.into()));
}
non_resident.chain(resident).collect::<Vec<_>>()
}
None => resident.collect::<Vec<_>>(),
};
let mut layers = resident.collect::<Vec<_>>();
// Sort layers in order of which to download first. For a large set of layers to download, we
// want to prioritize those layers which are most likely to still be in the resident many minutes
@@ -3737,7 +3605,7 @@ impl Timeline {
// the timeline, then it will remove layers that are required for fulfilling
// the current get request (read-path cannot "look back" and notice the new
// image layer).
let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();
let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();
// See `compaction::compact_with_gc` for why we need this.
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
@@ -4122,8 +3990,6 @@ impl Timeline {
if l0_count >= self.get_compaction_threshold() {
self.l0_compaction_trigger.notify_one();
}
self.l0_compaction_trigger
.on_l0_update(self.timeline_id, l0_count);
// Delay the next flush to backpressure if compaction can't keep up. We delay by the
// flush duration such that the flush takes 2x as long. This is propagated up to WAL
@@ -4297,7 +4163,6 @@ impl Timeline {
ImageLayerCreationMode::Initial,
ctx,
LastImageLayerCreationStatus::Initial,
false, // don't yield for L0, we're flushing L0
)
.await?;
debug_assert!(
@@ -4426,7 +4291,7 @@ impl Timeline {
let update = crate::tenant::metadata::MetadataUpdate::new(
disk_consistent_lsn,
ondisk_prev_record_lsn,
*self.applied_gc_cutoff_lsn.read(),
*self.latest_gc_cutoff_lsn.read(),
);
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -4870,7 +4735,6 @@ impl Timeline {
mode: ImageLayerCreationMode,
ctx: &RequestContext,
last_status: LastImageLayerCreationStatus,
yield_for_l0: bool,
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
@@ -5067,27 +4931,25 @@ impl Timeline {
if let ImageLayerCreationMode::Try = mode {
// We have at least made some progress
if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 {
if batch_image_writer.pending_layer_num() >= 1 {
// The `Try` mode is currently only used on the compaction path. We want to avoid
// image layer generation taking too long time and blocking L0 compaction. So in this
// mode, we also inspect the current number of L0 layers and skip image layer generation
// if there are too many of them.
if let Some((max_num_of_l0_layers, timeline_id)) =
self.l0_compaction_trigger.get_max_l0_count()
{
let image_preempt_threshold = self.get_image_creation_preempt_threshold()
* self.get_compaction_threshold();
if image_preempt_threshold != 0
&& max_num_of_l0_layers >= image_preempt_threshold
{
tracing::info!(
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {} on timeline {}",
partition.start().unwrap(), partition.end().unwrap(), max_num_of_l0_layers, timeline_id
);
last_partition_processed = Some(partition.clone());
all_generated = false;
break;
}
let num_of_l0_layers = {
let layers = self.layers.read().await;
layers.layer_map()?.level0_deltas().len()
};
let image_preempt_threshold = self.get_image_creation_preempt_threshold()
* self.get_compaction_threshold();
if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
tracing::info!(
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
);
last_partition_processed = Some(partition.clone());
all_generated = false;
break;
}
}
}
@@ -5466,13 +5328,8 @@ impl Timeline {
self.remote_client
.schedule_compaction_update(&remove_layers, new_deltas)?;
let l0_count = guard.layer_map()?.level0_deltas().len();
drop_wlock(guard);
self.l0_compaction_trigger
.on_l0_update(self.timeline_id, l0_count);
Ok(())
}
@@ -5661,7 +5518,7 @@ impl Timeline {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_applied_gc_cutoff_lsn(),
time: *self.get_latest_gc_cutoff_lsn(),
space: space_cutoff,
}
}
@@ -5782,7 +5639,7 @@ impl Timeline {
let mut result: GcResult = GcResult::default();
// Nothing to GC. Return early.
let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
if latest_gc_cutoff >= new_gc_cutoff {
info!(
"Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
@@ -5796,7 +5653,7 @@ impl Timeline {
//
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.applied_gc_cutoff_lsn.lock_for_write();
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
@@ -6736,32 +6593,18 @@ fn is_send() {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use tracing::Instrument;
use utils::{id::TimelineId, lsn::Lsn};
use crate::tenant::{
harness::{test_img, TenantHarness},
layer_map::LayerMap,
storage_layer::{Layer, LayerName, LayerVisibilityHint},
storage_layer::{Layer, LayerName},
timeline::{DeltaLayerTestDesc, EvictionError},
PreviousHeatmap, Timeline,
Timeline,
};
use super::HeatMapTimeline;
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
assert_eq!(lhs.layers.len(), rhs.layers.len());
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
for (l, r) in lhs_rhs {
assert_eq!(l.name, r.name);
assert_eq!(l.metadata, r.metadata);
}
}
#[tokio::test]
async fn test_heatmap_generation() {
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
@@ -6835,7 +6678,7 @@ mod tests {
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
let mut last_lsn = Lsn::MAX;
for layer in &heatmap.layers {
for layer in heatmap.layers {
// Covered layer should be omitted
assert!(layer.name != covered_delta.layer_name());
@@ -6850,144 +6693,6 @@ mod tests {
last_lsn = layer_lsn;
}
}
// Evict all the layers and stash the old heatmap in the timeline.
// This simulates a migration to a cold secondary location.
let guard = timeline.layers.read().await;
let mut all_layers = Vec::new();
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
all_layers.push(layer.clone());
layer.evict_and_wait(forever).await.unwrap();
}
drop(guard);
timeline
.previous_heatmap
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
})));
// Generate a new heatmap and assert that it contains the same layers as the old one.
let post_migration_heatmap = timeline.generate_heatmap().await.unwrap();
assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap);
// Download each layer one by one. Generate the heatmap at each step and check
// that it's stable.
for layer in all_layers {
if layer.visibility() == LayerVisibilityHint::Covered {
continue;
}
eprintln!("Downloading {layer} and re-generating heatmap");
let _resident = layer
.download_and_keep_resident()
.instrument(tracing::info_span!(
parent: None,
"download_layer",
tenant_id = %timeline.tenant_shard_id.tenant_id,
shard_id = %timeline.tenant_shard_id.shard_slug(),
timeline_id = %timeline.timeline_id
))
.await
.unwrap();
let post_download_heatmap = timeline.generate_heatmap().await.unwrap();
assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap);
}
// Everything from the post-migration heatmap is now resident.
// Check that we drop it from memory.
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)
));
}
#[tokio::test]
async fn test_previous_heatmap_obsoletion() {
let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion")
.await
.unwrap();
let l0_delta = DeltaLayerTestDesc::new(
Lsn(0x20)..Lsn(0x30),
Key::from_hex("000000000000000000000000000000000000").unwrap()
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
vec![(
Key::from_hex("720000000033333333444444445500000000").unwrap(),
Lsn(0x25),
Value::Image(test_img("foo")),
)],
);
let image_layer = (
Lsn(0x40),
vec![(
Key::from_hex("620000000033333333444444445500000000").unwrap(),
test_img("bar"),
)],
);
let delta_layers = vec![l0_delta];
let image_layers = vec![image_layer];
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline_with_layers(
TimelineId::generate(),
Lsn(0x10),
14,
&ctx,
delta_layers,
image_layers,
Lsn(0x100),
)
.await
.unwrap();
// Layer visibility is an input to heatmap generation, so refresh it first
timeline.update_layer_visibility().await.unwrap();
let heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
// Both layers should be in the heatmap
assert!(!heatmap.layers.is_empty());
// Now simulate a migration.
timeline
.previous_heatmap
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
})));
// Evict all the layers in the previous heatmap
let guard = timeline.layers.read().await;
let forever = std::time::Duration::from_secs(120);
for layer in guard.likely_resident_layers() {
layer.evict_and_wait(forever).await.unwrap();
}
drop(guard);
// Generate a new heatmap and check that the previous heatmap
// has been marked obsolete.
let post_eviction_heatmap = timeline
.generate_heatmap()
.await
.expect("Infallible while timeline is not shut down");
assert!(post_eviction_heatmap.layers.is_empty());
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)
));
}
#[tokio::test]

View File

@@ -726,9 +726,7 @@ impl Timeline {
}
// Yield if we have pending L0 compaction. The scheduler will do another pass.
if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
&& !options.flags.contains(CompactFlags::NoYield)
{
if l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0 {
info!("image/ancestor compaction yielding for L0 compaction");
return Ok(CompactionOutcome::YieldForL0);
}
@@ -776,7 +774,6 @@ impl Timeline {
.load()
.as_ref()
.clone(),
!options.flags.contains(CompactFlags::NoYield),
)
.await
.inspect_err(|err| {
@@ -852,7 +849,7 @@ impl Timeline {
//
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
// are rewriting layers.
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
@@ -2202,7 +2199,7 @@ impl Timeline {
// TODO: ensure the child branches will not use anything below the watermark, or consider
// them when computing the watermark.
gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn())
gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn())
}
/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.

View File

@@ -294,7 +294,6 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
None, // Previous heatmap is not needed for deletion
tenant.get_timeline_resources_for(remote_client),
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -332,8 +332,11 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
let acquire_permit =
crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);
let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit(
BackgroundLoopKind::Eviction,
false,
ctx,
);
tokio::select! {
permit = acquire_permit => ControlFlow::Continue(permit),

View File

@@ -1,23 +1,21 @@
use std::collections::{HashMap, HashSet, VecDeque};
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use std::sync::atomic::AtomicU32;
use std::sync::Arc;
use super::remote_timeline_client::is_same_remote_layer_path;
use super::storage_layer::AsLayerDesc as _;
use super::storage_layer::LayerName;
use super::storage_layer::ResidentLayer;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use utils::generation::Generation;
use utils::lsn::{AtomicLsn, Lsn};
use chrono::NaiveDateTime;
use once_cell::sync::Lazy;
use tracing::info;
use super::remote_timeline_client::is_same_remote_layer_path;
use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer};
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata};
use utils::generation::Generation;
use utils::lsn::{AtomicLsn, Lsn};
/// Kill switch for upload queue reordering in case it causes problems.
/// NB: enabling this will block all uploads in AttachedMulti tenants with a blocked deletion.
/// TODO: remove this once we have confidence in it.
static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy<bool> =
Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true"));
@@ -48,12 +46,6 @@ impl UploadQueue {
}
}
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum OpType {
MayReorder,
FlushDeletion,
}
/// This keeps track of queued and in-progress tasks.
pub struct UploadQueueInitialized {
/// Maximum number of inprogress tasks to schedule. 0 is no limit.
@@ -94,6 +86,9 @@ pub struct UploadQueueInitialized {
/// preceding layer file uploads have completed.
pub queued_operations: VecDeque<UploadOp>,
/// If set to a valid generation, deletions at generations below this are blocked.
pub(crate) block_deletions_below: Generation,
/// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around
/// for error logging.
///
@@ -102,12 +97,6 @@ pub struct UploadQueueInitialized {
#[cfg(feature = "testing")]
pub(crate) dangling_files: HashMap<LayerName, Generation>,
/// Ensure we order file operations correctly.
pub(crate) recently_deleted: HashSet<(LayerName, Generation)>,
/// Deletions that are blocked by the tenant configuration
pub(crate) blocked_deletions: Vec<Delete>,
/// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
pub(crate) shutting_down: bool,
@@ -210,6 +199,21 @@ impl UploadQueueInitialized {
/// The position must be valid for the queue size.
fn is_ready(&self, pos: usize) -> bool {
let candidate = self.queued_operations.get(pos).expect("invalid position");
// Don't allow deletions for past generations if blocked.
// TODO: document motivation.
if self.block_deletions_below != Generation::None {
if let UploadOp::Delete(delete) = candidate {
if delete
.layers
.iter()
.any(|(_, metadata)| metadata.generation < self.block_deletions_below)
{
return false;
}
}
}
self
// Look at in-progress operations, in random order.
.inprogress_tasks
@@ -248,7 +252,7 @@ impl UploadQueueInitialized {
pub(crate) fn num_inprogress_layer_uploads(&self) -> usize {
self.inprogress_tasks
.iter()
.filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _)))
.filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _)))
.count()
}
@@ -358,10 +362,9 @@ impl UploadQueue {
task_counter: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
block_deletions_below: Generation::None,
#[cfg(feature = "testing")]
dangling_files: HashMap::new(),
recently_deleted: HashSet::new(),
blocked_deletions: Vec::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
};
@@ -399,10 +402,9 @@ impl UploadQueue {
task_counter: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
block_deletions_below: Generation::None,
#[cfg(feature = "testing")]
dangling_files: HashMap::new(),
recently_deleted: HashSet::new(),
blocked_deletions: Vec::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
};
@@ -463,7 +465,7 @@ pub struct Delete {
#[derive(Clone, Debug)]
pub enum UploadOp {
/// Upload a layer file. The last field indicates the last operation for thie file.
UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),
UploadLayer(ResidentLayer, LayerFileMetadata),
/// Upload a index_part.json file
UploadMetadata {
@@ -485,11 +487,11 @@ pub enum UploadOp {
impl std::fmt::Display for UploadOp {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
UploadOp::UploadLayer(layer, metadata, mode) => {
UploadOp::UploadLayer(layer, metadata) => {
write!(
f,
"UploadLayer({}, size={:?}, gen={:?}, mode={:?})",
layer, metadata.file_size, metadata.generation, mode
"UploadLayer({}, size={:?}, gen={:?})",
layer, metadata.file_size, metadata.generation
)
}
UploadOp::UploadMetadata { uploaded, .. } => {
@@ -519,13 +521,13 @@ impl UploadOp {
(UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false,
// Uploads and deletes can bypass each other unless they're for the same file.
(UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => {
(UploadOp::UploadLayer(a, ameta), UploadOp::UploadLayer(b, bmeta)) => {
let aname = &a.layer_desc().layer_name();
let bname = &b.layer_desc().layer_name();
!is_same_remote_layer_path(aname, ameta, bname, bmeta)
}
(UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d))
| (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => {
(UploadOp::UploadLayer(u, umeta), UploadOp::Delete(d))
| (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta)) => {
d.layers.iter().all(|(dname, dmeta)| {
!is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta)
})
@@ -541,8 +543,8 @@ impl UploadOp {
// Similarly, index uploads can bypass uploads and deletes as long as neither the
// uploaded index nor the active index references the file (the latter would be
// incorrect use by the caller).
(UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i })
| (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => {
(UploadOp::UploadLayer(u, umeta), UploadOp::UploadMetadata { uploaded: i })
| (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta)) => {
let uname = u.layer_desc().layer_name();
!i.references(&uname, umeta) && !index.references(&uname, umeta)
}
@@ -570,6 +572,7 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use itertools::Itertools as _;
use std::str::FromStr as _;
use utils::generation::Generation;
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
/// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq.
@@ -577,10 +580,9 @@ mod tests {
fn assert_same_op(a: &UploadOp, b: &UploadOp) {
use UploadOp::*;
match (a, b) {
(UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => {
(UploadLayer(a, ameta), UploadLayer(b, bmeta)) => {
assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name());
assert_eq!(ameta, bmeta);
assert_eq!(atype, btype);
}
(Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers),
(UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b),
@@ -698,7 +700,7 @@ mod tests {
// Enqueue non-conflicting upload, delete, and index before and after a barrier.
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::Delete(Delete {
layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
}),
@@ -706,7 +708,7 @@ mod tests {
uploaded: index.clone(),
},
UploadOp::Barrier(barrier),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata()),
UploadOp::Delete(Delete {
layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
}),
@@ -807,9 +809,9 @@ mod tests {
let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3);
let ops = [
UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None),
UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None),
UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None),
UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata()),
UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata()),
UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata()),
];
queue.queued_operations.extend(ops.clone());
@@ -840,14 +842,14 @@ mod tests {
let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::Delete(Delete {
layers: vec![
(layer0.layer_desc().layer_name(), layer0.metadata()),
(layer1.layer_desc().layer_name(), layer1.metadata()),
],
}),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata()),
];
queue.queued_operations.extend(ops.clone());
@@ -884,15 +886,15 @@ mod tests {
let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::Delete(Delete {
layers: vec![
(layer0.layer_desc().layer_name(), layer0.metadata()),
(layer1.layer_desc().layer_name(), layer1.metadata()),
],
}),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata()),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata()),
UploadOp::Delete(Delete {
layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
}),
@@ -921,9 +923,9 @@ mod tests {
let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata()),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata()),
];
queue.queued_operations.extend(ops.clone());
@@ -989,15 +991,15 @@ mod tests {
let index2 = index_with(&index1, &layer2);
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::UploadMetadata {
uploaded: index0.clone(),
},
UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata()),
UploadOp::UploadMetadata {
uploaded: index1.clone(),
},
UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata()),
UploadOp::UploadMetadata {
uploaded: index2.clone(),
},
@@ -1053,7 +1055,7 @@ mod tests {
let ops = [
// Initial upload, with a barrier to prevent index coalescing.
UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
UploadOp::UploadLayer(layer.clone(), layer.metadata()),
UploadOp::UploadMetadata {
uploaded: index_upload.clone(),
},
@@ -1099,7 +1101,7 @@ mod tests {
let ops = [
// Initial upload, with a barrier to prevent index coalescing.
UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
UploadOp::UploadLayer(layer.clone(), layer.metadata()),
UploadOp::UploadMetadata {
uploaded: index_upload.clone(),
},
@@ -1109,7 +1111,7 @@ mod tests {
uploaded: index_deref.clone(),
},
// Replace and reference the layer.
UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
UploadOp::UploadLayer(layer.clone(), layer.metadata()),
UploadOp::UploadMetadata {
uploaded: index_ref.clone(),
},
@@ -1145,7 +1147,7 @@ mod tests {
// Enqueue non-conflicting upload, delete, and index before and after a shutdown.
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::Delete(Delete {
layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
}),
@@ -1153,7 +1155,7 @@ mod tests {
uploaded: index.clone(),
},
UploadOp::Shutdown,
UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata()),
UploadOp::Delete(Delete {
layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
}),
@@ -1203,10 +1205,10 @@ mod tests {
let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
let ops = [
UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None),
UploadOp::UploadLayer(layer0.clone(), layer0.metadata()),
UploadOp::UploadLayer(layer1.clone(), layer1.metadata()),
UploadOp::UploadLayer(layer2.clone(), layer2.metadata()),
UploadOp::UploadLayer(layer3.clone(), layer3.metadata()),
];
queue.queued_operations.extend(ops.clone());
@@ -1257,7 +1259,7 @@ mod tests {
.layer_metadata
.insert(layer.layer_desc().layer_name(), layer.metadata());
vec![
UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
UploadOp::UploadLayer(layer.clone(), layer.metadata()),
UploadOp::Delete(Delete {
layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
}),

View File

@@ -14,7 +14,7 @@
#include "utils/guc.h"
#include "extension_server.h"
#include "extension_server.h"
#include "neon_utils.h"
static int extension_server_port = 0;
@@ -45,7 +45,7 @@ neon_download_extension_file_http(const char *filename, bool is_library)
handle = alloc_curl_handle();
curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ );
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
}
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",

View File

@@ -3765,7 +3765,7 @@ neon_dbsize(Oid dbNode)
* neon_truncate() -- Truncate relation to specified number of blocks.
*/
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
XLogRecPtr lsn;
@@ -3780,7 +3780,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
mdtruncate(reln, forknum, old_blocks, nblocks);
mdtruncate(reln, forknum, nblocks);
return;
default:
@@ -3818,7 +3818,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdtruncate(reln, forknum, old_blocks, nblocks);
mdtruncate(reln, forknum, nblocks);
#endif
}

View File

@@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber old_blocks, BlockNumber nblocks);
BlockNumber nblocks);
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
#if PG_MAJORVERSION_NUM >= 17
static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
@@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
* inmem_truncate() -- Truncate relation to specified number of blocks.
*/
static void
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
}

View File

@@ -501,7 +501,7 @@ impl Session {
_guard: Metrics::get()
.proxy
.cancel_channel_size
.guard(RedisMsgKind::HDel),
.guard(RedisMsgKind::HSet),
};
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {

View File

@@ -5,7 +5,7 @@
use http_utils::error::HttpErrorBody;
use reqwest::{IntoUrl, Method, StatusCode};
use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus};
use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus};
use std::error::Error as _;
use utils::{
id::{NodeId, TenantId, TimelineId},
@@ -32,9 +32,6 @@ pub enum Error {
/// Status is not ok; parsed error in body as `HttpErrorBody`.
#[error("safekeeper API: {1}")]
ApiError(StatusCode, String),
#[error("Cancelled")]
Cancelled,
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -127,10 +124,9 @@ impl Client {
self.get(&uri).await
}
pub async fn utilization(&self) -> Result<SafekeeperUtilization> {
pub async fn utilization(&self) -> Result<reqwest::Response> {
let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
let resp = self.get(&uri).await?;
resp.json().await.map_err(Error::ReceiveBody)
self.get(&uri).await
}
async fn post<B: serde::Serialize, U: IntoUrl>(

View File

@@ -310,12 +310,9 @@ impl WalBackupTask {
retry_attempt = 0;
}
Err(e) => {
// We might have managed to upload some segment even though
// some later in the range failed, so log backup_lsn
// separately.
error!(
"failed while offloading range {}-{}, backup_lsn {}: {:?}",
backup_lsn, commit_lsn, backup_lsn, e
"failed while offloading range {}-{}: {:?}",
backup_lsn, commit_lsn, e
);
retry_attempt = retry_attempt.saturating_add(1);
@@ -341,13 +338,6 @@ async fn backup_lsn_range(
let start_lsn = *backup_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
info!(
"offloading segnos {:?} of range [{}-{})",
segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
start_lsn,
end_lsn,
);
// Pool of concurrent upload tasks. We use `FuturesOrdered` to
// preserve order of uploads, and update `backup_lsn` only after
// all previous uploads are finished.
@@ -384,10 +374,10 @@ async fn backup_lsn_range(
}
info!(
"offloaded segnos {:?} of range [{}-{})",
"offloaded segnos {:?} up to {}, previous backup_lsn {}",
segments.iter().map(|&s| s.seg_no).collect::<Vec<_>>(),
start_lsn,
end_lsn,
start_lsn,
);
Ok(())
}

View File

@@ -1,58 +0,0 @@
import itertools
import json
import os
build_tag = os.environ["BUILD_TAG"]
branch = os.environ["BRANCH"]
dev_acr = os.environ["DEV_ACR"]
prod_acr = os.environ["PROD_ACR"]
components = {
"neon": ["neon"],
"compute": [
"compute-node-v14",
"compute-node-v15",
"compute-node-v16",
"compute-node-v17",
"vm-compute-node-v14",
"vm-compute-node-v15",
"vm-compute-node-v16",
"vm-compute-node-v17",
],
}
registries = {
"dev": [
"docker.io/neondatabase",
"369495373322.dkr.ecr.eu-central-1.amazonaws.com",
f"{dev_acr}.azurecr.io/neondatabase",
],
"prod": [
"093970136003.dkr.ecr.eu-central-1.amazonaws.com",
f"{prod_acr}.azurecr.io/neondatabase",
],
}
outputs: dict[str, dict[str, list[str]]] = {}
target_tags = [build_tag, "latest"] if branch == "main" else [build_tag]
target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"]
for component_name, component_images in components.items():
for stage in target_stages:
outputs[f"{component_name}-{stage}"] = dict(
[
(
f"docker.io/neondatabase/{component_image}:{build_tag}",
[
f"{combo[0]}/{component_image}:{combo[1]}"
for combo in itertools.product(registries[stage], target_tags)
],
)
for component_image in component_images
]
)
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
for key, value in outputs.items():
f.write(f"{key}={json.dumps(value)}\n")

View File

@@ -1,22 +0,0 @@
import json
import os
import subprocess
image_map = os.getenv("IMAGE_MAP")
if not image_map:
raise ValueError("IMAGE_MAP environment variable is not set")
try:
parsed_image_map: dict[str, list[str]] = json.loads(image_map)
except json.JSONDecodeError as e:
raise ValueError("Failed to parse IMAGE_MAP as JSON") from e
for source, targets in parsed_image_map.items():
for target in targets:
cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
print(f"Running: {' '.join(cmd)}")
result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
if result.returncode != 0:
print(f"Error: {result.stdout}")
raise RuntimeError(f"Command failed: {' '.join(cmd)}")

View File

@@ -32,8 +32,6 @@ postgres_connection.workspace = true
rand.workspace = true
reqwest = { workspace = true, features = ["stream"] }
routerify.workspace = true
safekeeper_api.workspace = true
safekeeper_client.workspace = true
rustls-native-certs.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -1,10 +1,6 @@
use futures::{stream::FuturesUnordered, StreamExt};
use safekeeper_api::models::SafekeeperUtilization;
use safekeeper_client::mgmt_api;
use std::{
collections::HashMap,
fmt::Debug,
future::Future,
sync::Arc,
time::{Duration, Instant},
};
@@ -13,15 +9,15 @@ use tokio_util::sync::CancellationToken;
use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
use thiserror::Error;
use utils::{id::NodeId, logging::SecretString};
use utils::id::NodeId;
use crate::{node::Node, safekeeper::Safekeeper};
use crate::node::Node;
struct HeartbeaterTask<Server, State> {
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest<Server, State>>,
struct HeartbeaterTask {
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
cancel: CancellationToken,
state: HashMap<NodeId, State>,
state: HashMap<NodeId, PageserverState>,
max_offline_interval: Duration,
max_warming_up_interval: Duration,
@@ -40,17 +36,8 @@ pub(crate) enum PageserverState {
Offline,
}
#[derive(Debug, Clone)]
pub(crate) enum SafekeeperState {
Available {
last_seen_at: Instant,
utilization: SafekeeperUtilization,
},
Offline,
}
#[derive(Debug)]
pub(crate) struct AvailablityDeltas<State>(pub Vec<(NodeId, State)>);
pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
#[derive(Debug, Error)]
pub(crate) enum HeartbeaterError {
@@ -58,28 +45,23 @@ pub(crate) enum HeartbeaterError {
Cancel,
}
struct HeartbeatRequest<Server, State> {
servers: Arc<HashMap<NodeId, Server>>,
reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas<State>, HeartbeaterError>>,
struct HeartbeatRequest {
pageservers: Arc<HashMap<NodeId, Node>>,
reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
}
pub(crate) struct Heartbeater<Server, State> {
sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest<Server, State>>,
pub(crate) struct Heartbeater {
sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
}
#[allow(private_bounds)]
impl<Server: Send + Sync + 'static, State: Debug + Send + 'static> Heartbeater<Server, State>
where
HeartbeaterTask<Server, State>: HeartBeat<Server, State>,
{
impl Heartbeater {
pub(crate) fn new(
jwt_token: Option<String>,
max_offline_interval: Duration,
max_warming_up_interval: Duration,
cancel: CancellationToken,
) -> Self {
let (sender, receiver) =
tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest<Server, State>>();
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
let mut heartbeater = HeartbeaterTask::new(
receiver,
jwt_token,
@@ -94,12 +76,12 @@ where
pub(crate) async fn heartbeat(
&self,
servers: Arc<HashMap<NodeId, Server>>,
) -> Result<AvailablityDeltas<State>, HeartbeaterError> {
pageservers: Arc<HashMap<NodeId, Node>>,
) -> Result<AvailablityDeltas, HeartbeaterError> {
let (sender, receiver) = tokio::sync::oneshot::channel();
self.sender
.send(HeartbeatRequest {
servers,
pageservers,
reply: sender,
})
.map_err(|_| HeartbeaterError::Cancel)?;
@@ -111,12 +93,9 @@ where
}
}
impl<Server, State: Debug> HeartbeaterTask<Server, State>
where
HeartbeaterTask<Server, State>: HeartBeat<Server, State>,
{
impl HeartbeaterTask {
fn new(
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest<Server, State>>,
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
jwt_token: Option<String>,
max_offline_interval: Duration,
max_warming_up_interval: Duration,
@@ -131,13 +110,14 @@ where
jwt_token,
}
}
async fn run(&mut self) {
loop {
tokio::select! {
request = self.receiver.recv() => {
match request {
Some(req) => {
let res = self.heartbeat(req.servers).await;
let res = self.heartbeat(req.pageservers).await;
req.reply.send(res).unwrap();
},
None => { return; }
@@ -147,20 +127,11 @@ where
}
}
}
}
pub(crate) trait HeartBeat<Server, State> {
fn heartbeat(
&mut self,
pageservers: Arc<HashMap<NodeId, Server>>,
) -> impl Future<Output = Result<AvailablityDeltas<State>, HeartbeaterError>> + Send;
}
impl HeartBeat<Node, PageserverState> for HeartbeaterTask<Node, PageserverState> {
async fn heartbeat(
&mut self,
pageservers: Arc<HashMap<NodeId, Node>>,
) -> Result<AvailablityDeltas<PageserverState>, HeartbeaterError> {
) -> Result<AvailablityDeltas, HeartbeaterError> {
let mut new_state = HashMap::new();
let mut heartbeat_futs = FuturesUnordered::new();
@@ -301,121 +272,3 @@ impl HeartBeat<Node, PageserverState> for HeartbeaterTask<Node, PageserverState>
Ok(AvailablityDeltas(deltas))
}
}
impl HeartBeat<Safekeeper, SafekeeperState> for HeartbeaterTask<Safekeeper, SafekeeperState> {
async fn heartbeat(
&mut self,
safekeepers: Arc<HashMap<NodeId, Safekeeper>>,
) -> Result<AvailablityDeltas<SafekeeperState>, HeartbeaterError> {
let mut new_state = HashMap::new();
let mut heartbeat_futs = FuturesUnordered::new();
for (node_id, sk) in &*safekeepers {
heartbeat_futs.push({
let jwt_token = self
.jwt_token
.as_ref()
.map(|t| SecretString::from(t.to_owned()));
let cancel = self.cancel.clone();
async move {
let response = sk
.with_client_retries(
|client| async move { client.get_utilization().await },
&jwt_token,
3,
3,
Duration::from_secs(1),
&cancel,
)
.await;
let status = match response {
Ok(utilization) => SafekeeperState::Available {
last_seen_at: Instant::now(),
utilization,
},
Err(mgmt_api::Error::Cancelled) => {
// This indicates cancellation of the request.
// We ignore the node in this case.
return None;
}
Err(_) => SafekeeperState::Offline,
};
Some((*node_id, status))
}
});
loop {
let maybe_status = tokio::select! {
next = heartbeat_futs.next() => {
match next {
Some(result) => result,
None => { break; }
}
},
_ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
};
if let Some((node_id, status)) = maybe_status {
new_state.insert(node_id, status);
}
}
}
let mut offline = 0;
for state in new_state.values() {
match state {
SafekeeperState::Offline { .. } => offline += 1,
SafekeeperState::Available { .. } => {}
}
}
tracing::info!(
"Heartbeat round complete for {} safekeepers, {} offline",
new_state.len(),
offline
);
let mut deltas = Vec::new();
let now = Instant::now();
for (node_id, sk_state) in new_state.iter_mut() {
use std::collections::hash_map::Entry::*;
let entry = self.state.entry(*node_id);
let mut needs_update = false;
match entry {
Occupied(ref occ) => match (occ.get(), &sk_state) {
(SafekeeperState::Offline, SafekeeperState::Offline) => {}
(SafekeeperState::Available { last_seen_at, .. }, SafekeeperState::Offline) => {
if now - *last_seen_at >= self.max_offline_interval {
deltas.push((*node_id, sk_state.clone()));
needs_update = true;
}
}
_ => {
deltas.push((*node_id, sk_state.clone()));
needs_update = true;
}
},
Vacant(_) => {
// This is a new node. Don't generate a delta for it.
deltas.push((*node_id, sk_state.clone()));
}
}
match entry {
Occupied(mut occ) if needs_update => {
(*occ.get_mut()) = sk_state.clone();
}
Vacant(vac) => {
vac.insert(sk_state.clone());
}
_ => {}
}
}
Ok(AvailablityDeltas(deltas))
}
}

View File

@@ -17,8 +17,6 @@ mod pageserver_client;
mod peer_client;
pub mod persistence;
mod reconciler;
mod safekeeper;
mod safekeeper_client;
mod scheduler;
mod schema;
pub mod service;

View File

@@ -80,11 +80,6 @@ pub(crate) struct StorageControllerMetricGroup {
pub(crate) storage_controller_pageserver_request_error:
measured::CounterVec<PageserverRequestLabelGroupSet>,
/// Count of HTTP requests to the safekeeper that resulted in an error,
/// broken down by the safekeeper node id, request name and method
pub(crate) storage_controller_safekeeper_request_error:
measured::CounterVec<PageserverRequestLabelGroupSet>,
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
@@ -92,13 +87,6 @@ pub(crate) struct StorageControllerMetricGroup {
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
/// Latency of HTTP requests to the safekeeper, broken down by safekeeper
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_safekeeper_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
/// broken down by the pageserver node id, request name and method
pub(crate) storage_controller_passthrough_request_error:

View File

@@ -1185,6 +1185,23 @@ impl Persistence {
Ok(safekeepers)
}
pub(crate) async fn safekeeper_get(
&self,
id: i64,
) -> Result<SafekeeperPersistence, DatabaseError> {
use crate::schema::safekeepers::dsl::{id as id_column, safekeepers};
self.with_conn(move |conn| {
Box::pin(async move {
Ok(safekeepers
.filter(id_column.eq(&id))
.select(SafekeeperPersistence::as_select())
.get_result(conn)
.await?)
})
})
.await
}
pub(crate) async fn safekeeper_upsert(
&self,
record: SafekeeperUpsert,
@@ -1537,21 +1554,6 @@ pub(crate) struct SafekeeperPersistence {
}
impl SafekeeperPersistence {
pub(crate) fn from_upsert(
upsert: SafekeeperUpsert,
scheduling_policy: SkSchedulingPolicy,
) -> Self {
crate::persistence::SafekeeperPersistence {
id: upsert.id,
region_id: upsert.region_id,
version: upsert.version,
host: upsert.host,
port: upsert.port,
http_port: upsert.http_port,
availability_zone_id: upsert.availability_zone_id,
scheduling_policy: String::from(scheduling_policy),
}
}
pub(crate) fn as_describe_response(&self) -> Result<SafekeeperDescribeResponse, DatabaseError> {
let scheduling_policy =
SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| {

View File

@@ -1,7 +1,7 @@
use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence;
use crate::{compute_hook, service};
use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy};
use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest,
};
@@ -162,22 +162,6 @@ impl ReconcilerConfig {
}
}
impl From<&MigrationConfig> for ReconcilerConfig {
fn from(value: &MigrationConfig) -> Self {
let mut builder = ReconcilerConfigBuilder::new();
if let Some(timeout) = value.secondary_warmup_timeout {
builder = builder.secondary_warmup_timeout(timeout)
}
if let Some(timeout) = value.secondary_download_request_timeout {
builder = builder.secondary_download_request_timeout(timeout)
}
builder.build()
}
}
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
pub(crate) struct ReconcileUnits {
_sem_units: tokio::sync::OwnedSemaphorePermit,

View File

@@ -1,139 +0,0 @@
use std::{str::FromStr, time::Duration};
use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
use reqwest::StatusCode;
use safekeeper_client::mgmt_api;
use tokio_util::sync::CancellationToken;
use utils::{backoff, id::NodeId, logging::SecretString};
use crate::{
heartbeater::SafekeeperState,
persistence::{DatabaseError, SafekeeperPersistence},
safekeeper_client::SafekeeperClient,
};
#[derive(Clone)]
pub struct Safekeeper {
pub(crate) skp: SafekeeperPersistence,
cancel: CancellationToken,
listen_http_addr: String,
listen_http_port: u16,
id: NodeId,
availability: SafekeeperState,
}
impl Safekeeper {
pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self {
Self {
cancel,
listen_http_addr: skp.host.clone(),
listen_http_port: skp.http_port as u16,
id: NodeId(skp.id as u64),
skp,
availability: SafekeeperState::Offline,
}
}
pub(crate) fn base_url(&self) -> String {
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
}
pub(crate) fn get_id(&self) -> NodeId {
self.id
}
pub(crate) fn describe_response(&self) -> Result<SafekeeperDescribeResponse, DatabaseError> {
self.skp.as_describe_response()
}
pub(crate) fn set_availability(&mut self, availability: SafekeeperState) {
self.availability = availability;
}
/// Perform an operation (which is given a [`SafekeeperClient`]) with retries
pub(crate) async fn with_client_retries<T, O, F>(
&self,
mut op: O,
jwt: &Option<SecretString>,
warn_threshold: u32,
max_retries: u32,
timeout: Duration,
cancel: &CancellationToken,
) -> mgmt_api::Result<T>
where
O: FnMut(SafekeeperClient) -> F,
F: std::future::Future<Output = mgmt_api::Result<T>>,
{
fn is_fatal(e: &mgmt_api::Error) -> bool {
use mgmt_api::Error::*;
match e {
ReceiveBody(_) | ReceiveErrorBody(_) => false,
ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
| ApiError(StatusCode::GATEWAY_TIMEOUT, _)
| ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
ApiError(_, _) => true,
Cancelled => true,
}
}
backoff::retry(
|| {
let http_client = reqwest::ClientBuilder::new()
.timeout(timeout)
.build()
.expect("Failed to construct HTTP client");
let client = SafekeeperClient::from_client(
self.get_id(),
http_client,
self.base_url(),
jwt.clone(),
);
let node_cancel_fut = self.cancel.cancelled();
let op_fut = op(client);
async {
tokio::select! {
r = op_fut=> {r},
_ = node_cancel_fut => {
Err(mgmt_api::Error::Cancelled)
}}
}
},
is_fatal,
warn_threshold,
max_retries,
&format!(
"Call to node {} ({}:{}) management API",
self.id, self.listen_http_addr, self.listen_http_port
),
cancel,
)
.await
.unwrap_or(Err(mgmt_api::Error::Cancelled))
}
pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) {
let crate::persistence::SafekeeperUpsert {
active: _,
availability_zone_id: _,
host,
http_port,
id,
port: _,
region_id: _,
version: _,
} = record.clone();
if id != self.id.0 as i64 {
// The way the function is called ensures this. If we regress on that, it's a bug.
panic!(
"id can't be changed via update_from_record function: {id} != {}",
self.id.0
);
}
self.skp = crate::persistence::SafekeeperPersistence::from_upsert(
record,
SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(),
);
self.listen_http_port = http_port as u16;
self.listen_http_addr = host;
}
}

View File

@@ -1,105 +0,0 @@
use crate::metrics::PageserverRequestLabelGroup;
use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus};
use safekeeper_client::mgmt_api::{Client, Result};
use utils::{
id::{NodeId, TenantId, TimelineId},
logging::SecretString,
};
/// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage
/// controller to collect metrics in a non-intrusive manner.
///
/// Analogous to [`crate::pageserver_client::PageserverClient`].
#[derive(Debug, Clone)]
pub(crate) struct SafekeeperClient {
inner: Client,
node_id_label: String,
}
macro_rules! measured_request {
($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
let labels = PageserverRequestLabelGroup {
pageserver_id: $node_id,
path: $name,
method: $method,
};
let latency = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_safekeeper_request_latency;
let _timer_guard = latency.start_timer(labels.clone());
let res = $invoke;
if res.is_err() {
let error_counters = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pageserver_request_error;
error_counters.inc(labels)
}
res
}};
}
impl SafekeeperClient {
#[allow(dead_code)]
pub(crate) fn new(
node_id: NodeId,
mgmt_api_endpoint: String,
jwt: Option<SecretString>,
) -> Self {
Self {
inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
node_id_label: node_id.0.to_string(),
}
}
pub(crate) fn from_client(
node_id: NodeId,
raw_client: reqwest::Client,
mgmt_api_endpoint: String,
jwt: Option<SecretString>,
) -> Self {
Self {
inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
node_id_label: node_id.0.to_string(),
}
}
#[allow(dead_code)]
pub(crate) async fn create_timeline(
&self,
req: &TimelineCreateRequest,
) -> Result<TimelineStatus> {
measured_request!(
"create_timeline",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.create_timeline(req).await
)
}
#[allow(dead_code)]
pub(crate) async fn delete_timeline(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<TimelineStatus> {
measured_request!(
"delete_timeline",
crate::metrics::Method::Delete,
&self.node_id_label,
self.inner.delete_timeline(tenant_id, timeline_id).await
)
}
pub(crate) async fn get_utilization(&self) -> Result<SafekeeperUtilization> {
measured_request!(
"utilization",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.utilization().await
)
}
}

View File

@@ -2,7 +2,6 @@ pub mod chaos_injector;
mod context_iterator;
use hyper::Uri;
use safekeeper_api::models::SafekeeperUtilization;
use std::{
borrow::Cow,
cmp::Ordering,
@@ -21,7 +20,6 @@ use crate::{
},
compute_hook::{self, NotifyError},
drain_utils::{self, TenantShardDrain, TenantShardIterator},
heartbeater::SafekeeperState,
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
leadership::Leadership,
metrics,
@@ -31,7 +29,6 @@ use crate::{
ShardGenerationState, TenantFilter,
},
reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
safekeeper::Safekeeper,
scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
tenant_shard::{
MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus,
@@ -209,8 +206,6 @@ struct ServiceState {
nodes: Arc<HashMap<NodeId, Node>>,
safekeepers: Arc<HashMap<NodeId, Safekeeper>>,
scheduler: Scheduler,
/// Ongoing background operation on the cluster if any is running.
@@ -277,7 +272,6 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
impl ServiceState {
fn new(
nodes: HashMap<NodeId, Node>,
safekeepers: HashMap<NodeId, Safekeeper>,
tenants: BTreeMap<TenantShardId, TenantShard>,
scheduler: Scheduler,
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
@@ -289,7 +283,6 @@ impl ServiceState {
leadership_status: initial_leadership_status,
tenants,
nodes: Arc::new(nodes),
safekeepers: Arc::new(safekeepers),
scheduler,
ongoing_operation: None,
delayed_reconcile_rx,
@@ -306,23 +299,6 @@ impl ServiceState {
(&mut self.nodes, &mut self.tenants, &mut self.scheduler)
}
#[allow(clippy::type_complexity)]
fn parts_mut_sk(
&mut self,
) -> (
&mut Arc<HashMap<NodeId, Node>>,
&mut Arc<HashMap<NodeId, Safekeeper>>,
&mut BTreeMap<TenantShardId, TenantShard>,
&mut Scheduler,
) {
(
&mut self.nodes,
&mut self.safekeepers,
&mut self.tenants,
&mut self.scheduler,
)
}
fn get_leadership_status(&self) -> LeadershipStatus {
self.leadership_status
}
@@ -421,8 +397,7 @@ pub struct Service {
compute_hook: Arc<ComputeHook>,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
heartbeater_ps: Heartbeater<Node, PageserverState>,
heartbeater_sk: Heartbeater<Safekeeper, SafekeeperState>,
heartbeater: Heartbeater,
// Channel for background cleanup from failed operations that require cleanup, such as shard split
abort_tx: tokio::sync::mpsc::UnboundedSender<TenantShardSplitAbort>,
@@ -632,8 +607,7 @@ impl Service {
let locked = self.inner.read().unwrap();
locked.nodes.clone()
};
let (mut nodes_online, mut sks_online) =
self.initial_heartbeat_round(all_nodes.keys()).await;
let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
// List of tenants for which we will attempt to notify compute of their location at startup
let mut compute_notifications = Vec::new();
@@ -642,7 +616,7 @@ impl Service {
tracing::info!("Populating tenant shards' states from initial pageserver scan...");
let shard_count = {
let mut locked = self.inner.write().unwrap();
let (nodes, safekeepers, tenants, scheduler) = locked.parts_mut_sk();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Mark nodes online if they responded to us: nodes are offline by default after a restart.
let mut new_nodes = (**nodes).clone();
@@ -654,17 +628,6 @@ impl Service {
}
*nodes = Arc::new(new_nodes);
let mut new_sks = (**safekeepers).clone();
for (node_id, node) in new_sks.iter_mut() {
if let Some((utilization, last_seen_at)) = sks_online.remove(node_id) {
node.set_availability(SafekeeperState::Available {
utilization,
last_seen_at,
});
}
}
*safekeepers = Arc::new(new_sks);
for (tenant_shard_id, observed_state) in observed.0 {
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
for node_id in observed_state.locations.keys() {
@@ -773,10 +736,7 @@ impl Service {
async fn initial_heartbeat_round<'a>(
&self,
node_ids: impl Iterator<Item = &'a NodeId>,
) -> (
HashMap<NodeId, PageserverUtilization>,
HashMap<NodeId, (SafekeeperUtilization, Instant)>,
) {
) -> HashMap<NodeId, PageserverUtilization> {
assert!(!self.startup_complete.is_ready());
let all_nodes = {
@@ -796,20 +756,14 @@ impl Service {
}
}
let all_sks = {
let locked = self.inner.read().unwrap();
locked.safekeepers.clone()
};
tracing::info!("Sending initial heartbeats...");
let res_ps = self
.heartbeater_ps
let res = self
.heartbeater
.heartbeat(Arc::new(nodes_to_heartbeat))
.await;
let res_sk = self.heartbeater_sk.heartbeat(all_sks).await;
let mut online_nodes = HashMap::new();
if let Ok(deltas) = res_ps {
if let Ok(deltas) = res {
for (node_id, status) in deltas.0 {
match status {
PageserverState::Available { utilization, .. } => {
@@ -823,22 +777,7 @@ impl Service {
}
}
let mut online_sks = HashMap::new();
if let Ok(deltas) = res_sk {
for (node_id, status) in deltas.0 {
match status {
SafekeeperState::Available {
utilization,
last_seen_at,
} => {
online_sks.insert(node_id, (utilization, last_seen_at));
}
SafekeeperState::Offline => {}
}
}
}
(online_nodes, online_sks)
online_nodes
}
/// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline.
@@ -1045,14 +984,8 @@ impl Service {
locked.nodes.clone()
};
let safekeepers = {
let locked = self.inner.read().unwrap();
locked.safekeepers.clone()
};
let res_ps = self.heartbeater_ps.heartbeat(nodes).await;
let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await;
if let Ok(deltas) = res_ps {
let res = self.heartbeater.heartbeat(nodes).await;
if let Ok(deltas) = res {
let mut to_handle = Vec::default();
for (node_id, state) in deltas.0 {
@@ -1153,18 +1086,6 @@ impl Service {
}
}
}
if let Ok(deltas) = res_sk {
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
for (id, state) in deltas.0 {
let Some(sk) = safekeepers.get_mut(&id) else {
tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}");
continue;
};
sk.set_availability(state);
}
locked.safekeepers = Arc::new(safekeepers);
}
}
}
@@ -1390,17 +1311,6 @@ impl Service {
.storage_controller_pageserver_nodes
.set(nodes.len() as i64);
tracing::info!("Loading safekeepers from database...");
let safekeepers = persistence
.list_safekeepers()
.await?
.into_iter()
.map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new()))
.collect::<Vec<_>>();
let safekeepers: HashMap<NodeId, Safekeeper> =
safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
tracing::info!("Loading shards from database...");
let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
tracing::info!(
@@ -1527,14 +1437,7 @@ impl Service {
let cancel = CancellationToken::new();
let reconcilers_cancel = cancel.child_token();
let heartbeater_ps = Heartbeater::new(
config.jwt_token.clone(),
config.max_offline_interval,
config.max_warming_up_interval,
cancel.clone(),
);
let heartbeater_sk = Heartbeater::new(
let heartbeater = Heartbeater::new(
config.jwt_token.clone(),
config.max_offline_interval,
config.max_warming_up_interval,
@@ -1550,7 +1453,6 @@ impl Service {
let this = Arc::new(Self {
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
nodes,
safekeepers,
tenants,
scheduler,
delayed_reconcile_rx,
@@ -1560,8 +1462,7 @@ impl Service {
persistence,
compute_hook: Arc::new(ComputeHook::new(config.clone())),
result_tx,
heartbeater_ps,
heartbeater_sk,
heartbeater,
reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new(
config.reconciler_concurrency,
)),
@@ -5213,12 +5114,7 @@ impl Service {
shard.sequence = shard.sequence.next();
}
let reconciler_config = match migrate_req.migration_config {
Some(cfg) => (&cfg).into(),
None => ReconcilerConfig::default(),
};
self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config)
self.maybe_reconcile_shard(shard, nodes)
};
if let Some(waiter) = waiter {
@@ -7765,54 +7661,29 @@ impl Service {
pub(crate) async fn safekeepers_list(
&self,
) -> Result<Vec<SafekeeperDescribeResponse>, DatabaseError> {
let locked = self.inner.read().unwrap();
let mut list = locked
.safekeepers
.iter()
.map(|sk| sk.1.describe_response())
.collect::<Result<Vec<_>, _>>()?;
list.sort_by_key(|v| v.id);
Ok(list)
self.persistence
.list_safekeepers()
.await?
.into_iter()
.map(|v| v.as_describe_response())
.collect::<Result<Vec<_>, _>>()
}
pub(crate) async fn get_safekeeper(
&self,
id: i64,
) -> Result<SafekeeperDescribeResponse, DatabaseError> {
let locked = self.inner.read().unwrap();
let sk = locked
.safekeepers
.get(&NodeId(id as u64))
.ok_or(diesel::result::Error::NotFound)?;
sk.describe_response()
self.persistence
.safekeeper_get(id)
.await
.and_then(|v| v.as_describe_response())
}
pub(crate) async fn upsert_safekeeper(
&self,
record: crate::persistence::SafekeeperUpsert,
) -> Result<(), DatabaseError> {
let node_id = NodeId(record.id as u64);
self.persistence.safekeeper_upsert(record.clone()).await?;
{
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
match safekeepers.entry(node_id) {
std::collections::hash_map::Entry::Occupied(mut entry) => {
entry.get_mut().update_from_record(record);
}
std::collections::hash_map::Entry::Vacant(entry) => {
entry.insert(Safekeeper::from_persistence(
crate::persistence::SafekeeperPersistence::from_upsert(
record,
SkSchedulingPolicy::Pause,
),
CancellationToken::new(),
));
}
}
locked.safekeepers = Arc::new(safekeepers);
}
Ok(())
self.persistence.safekeeper_upsert(record).await
}
pub(crate) async fn set_safekeeper_scheduling_policy(
@@ -7822,20 +7693,7 @@ impl Service {
) -> Result<(), DatabaseError> {
self.persistence
.set_safekeeper_scheduling_policy(id, scheduling_policy)
.await?;
let node_id = NodeId(id as u64);
// After the change has been persisted successfully, update the in-memory state
{
let mut locked = self.inner.write().unwrap();
let mut safekeepers = (*locked.safekeepers).clone();
let sk = safekeepers
.get_mut(&node_id)
.ok_or(DatabaseError::Logical("Not found".to_string()))?;
sk.skp.scheduling_policy = String::from(scheduling_policy);
locked.safekeepers = Arc::new(safekeepers);
}
Ok(())
.await
}
pub(crate) async fn update_shards_preferred_azs(

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
import abc
import asyncio
import concurrent.futures
import dataclasses
import filecmp
import json
import os
@@ -1676,12 +1675,6 @@ class StorageControllerLeadershipStatus(StrEnum):
CANDIDATE = "candidate"
@dataclass
class StorageControllerMigrationConfig:
secondary_warmup_timeout: str | None
secondary_download_request_timeout: str | None
class NeonStorageController(MetricsGetter, LogUtils):
def __init__(self, env: NeonEnv, port: int, auth_enabled: bool):
self.env = env
@@ -2075,20 +2068,11 @@ class NeonStorageController(MetricsGetter, LogUtils):
shards: list[TenantShardId] = body["new_shards"]
return shards
def tenant_shard_migrate(
self,
tenant_shard_id: TenantShardId,
dest_ps_id: int,
config: StorageControllerMigrationConfig | None = None,
):
payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}
if config is not None:
payload["migration_config"] = dataclasses.asdict(config)
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
self.request(
"PUT",
f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
json=payload,
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
headers=self.headers(TokenScope.ADMIN),
)
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
@@ -4988,13 +4972,8 @@ def check_restored_datadir_content(
restored_files = list_files_to_compare(restored_dir_path)
# pg_notify files are always ignored
pgdata_files = [f for f in pgdata_files if not f.startswith("pg_notify")]
restored_files = [f for f in restored_files if not f.startswith("pg_notify")]
# pg_xact and pg_multixact files are optional in basebackup: depending on our configuration they
# may be omitted and loaded on demand.
if pgdata_files != restored_files:
# filter pg_xact and multixact files which are downloaded on demand
pgdata_files = [
f
for f in pgdata_files

View File

@@ -141,7 +141,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"compaction_threshold": 13,
"compaction_upper_limit": 100,
"compaction_l0_first": False,
"compaction_l0_semaphore": False,
"l0_flush_delay_threshold": 25,
"l0_flush_stall_threshold": 42,
"l0_flush_wait_upload": False,

View File

@@ -1,62 +0,0 @@
from __future__ import annotations
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import wait_until
def test_compute_reconfigure(neon_simple_env: NeonEnv):
"""
Test that we can change postgresql.conf settings even if
skip_pg_catalog_updates=True is set.
"""
env = neon_simple_env
TEST_LOG_LINE_PREFIX = "%m [%p] [test_compute_reconfigure]: "
endpoint = env.endpoints.create_start("main")
# Check that the log line prefix is not set
# or different from TEST_LOG_LINE_PREFIX
with endpoint.cursor() as cursor:
cursor.execute("SHOW log_line_prefix;")
row = cursor.fetchone()
assert row is not None
assert row[0] != TEST_LOG_LINE_PREFIX
endpoint.respec_deep(
**{
"skip_pg_catalog_updates": True,
"cluster": {
"settings": [
{
"name": "log_line_prefix",
"vartype": "string",
"value": TEST_LOG_LINE_PREFIX,
}
]
},
}
)
endpoint.reconfigure()
# Check that in logs we see that it was actually reconfigured,
# not restarted or something else.
endpoint.log_contains("INFO request{method=POST uri=/configure")
# In /configure we only send SIGHUP at the end, so in theory
# it doesn't necessarily mean that Postgres already reloaded
# the new config; and it may race in some envs.
# So we wait until we see the log line that the config was changed.
def check_logs():
endpoint.log_contains(
f'[test_compute_reconfigure]: LOG: parameter "log_line_prefix" changed to "{TEST_LOG_LINE_PREFIX}"'
)
wait_until(check_logs)
# Check that the log line prefix is set
with endpoint.cursor() as cursor:
cursor.execute("SHOW log_line_prefix;")
row = cursor.fetchone()
assert row is not None
assert row[0] == TEST_LOG_LINE_PREFIX

View File

@@ -231,14 +231,14 @@ def test_pgdata_import_smoke(
shard_zero_http = shard_zero_ps.http_client()
shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id)
initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"])
min_readable_lsn = Lsn(shard_zero_timeline_info["min_readable_lsn"])
latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"])
last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"])
disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"])
_remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"])
remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"])
# assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None`
assert remote_consistent_lsn_visible == disk_consistent_lsn
assert initdb_lsn == min_readable_lsn
assert initdb_lsn == latest_gc_cutoff_lsn
assert disk_consistent_lsn == initdb_lsn + 8
assert last_record_lsn == disk_consistent_lsn
# TODO: assert these values are the same everywhere

View File

@@ -10,18 +10,14 @@ from typing import TYPE_CHECKING
import pytest
from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
NeonPageserver,
StorageControllerMigrationConfig,
)
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import (
assert_prefix_empty,
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.workload import Workload
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -893,93 +889,3 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
assert progress_3["heatmap_mtime"] is not None
assert progress_3["layers_total"] == progress_3["layers_downloaded"]
assert progress_3["bytes_total"] == progress_3["bytes_downloaded"]
@skip_in_debug_build("only run with release build")
@run_only_on_default_postgres("PG version is not interesting here")
def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_configs()
env.start()
assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}')
env.storage_controller.reconcile_until_idle()
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
# Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis)
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(128, upload=True)
workload.write_rows(128, upload=True)
workload.write_rows(128, upload=True)
workload.write_rows(128, upload=True)
workload.stop()
# Expect lots of layers
assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10
# Simulate large data by making layer downloads artifically slow
for ps in env.pageservers:
ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
# Upload a heatmap, so that secondaries have something to download
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id)
# This has no chance to succeed: we have lots of layers and each one takes at least 1000ms.
# However, it pulls the heatmap, which will be important later.
http_client = env.storage_controller.pageserver_api()
(status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000)
assert status == 202
assert progress["heatmap_mtime"] is not None
assert progress["layers_downloaded"] > 0
assert progress["bytes_downloaded"] > 0
assert progress["layers_total"] > progress["layers_downloaded"]
assert progress["bytes_total"] > progress["bytes_downloaded"]
env.storage_controller.allowed_errors.extend(
[
".*Timed out.*downloading layers.*",
]
)
# Use a custom configuration that gives up earlier than usual.
# We can't hydrate everything anyway because of the failpoints.
config = StorageControllerMigrationConfig(
secondary_warmup_timeout="5s", secondary_download_request_timeout="2s"
)
env.storage_controller.tenant_shard_migrate(
TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config
)
env.storage_controller.reconcile_until_idle()
assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id
ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id)
assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0
# The new layer map should contain all the layers in the pre-migration one
# and a new in memory layer
assert len(heatmap_before_migration["timelines"][0]["layers"]) + 1 == len(
heatmap_after_migration["timelines"][0]["layers"]
)
log.info(
f'Heatmap size after cold migration is {len(heatmap_after_migration["timelines"][0]["layers"])}'
)
# TODO: Once we have an endpoint for rescuing the cold location, exercise it here.

View File

@@ -261,7 +261,7 @@ def test_isolation(
pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)
# This fails with a mismatch on `pg_multixact/offsets/0000`
post_checks(env, test_output_dir, DBNAME, endpoint)
# post_checks(env, test_output_dir, DBNAME, endpoint)
# Run extra Neon-specific pg_regress-based tests. The tests and their

View File

@@ -287,7 +287,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
offset=offset,
)
# Do some update so we can increment gc_cutoff
# Do some update so we can increment latest_gc_cutoff
generate_updates_on_main(env, ep_main, i, end=100)
# Wait for the existing lease to expire.

View File

@@ -1821,7 +1821,7 @@ def test_sharding_gc(
# TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
ps.allowed_errors.extend(
[
".*could not find data for key.*",
".*could not find data for key 020000000000000000000000000000000000.*",
".*could not ingest record.*",
]
)

View File

@@ -318,7 +318,7 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
# TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
ps.allowed_errors.extend(
[
".*could not find data for key.*",
".*could not find data for key 020000000000000000000000000000000000.*",
".*could not ingest record.*",
]
)

View File

@@ -566,14 +566,10 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix)
# This test is flaky, probably because PUTs of local fs storage are not atomic.
# Let's keep both remote storage kinds for a while to see if this is the case.
# https://github.com/neondatabase/neon/issues/10761
@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS])
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant

16
vendor/revisions.json vendored
View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.3",
"4d3a722312b496ff7378156caa6d41c2e70c30e4"
"17.2",
"4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d"
],
"v16": [
"16.7",
"999cf81b101ead40e597d5cd729458d8200f4537"
"16.6",
"13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792"
],
"v15": [
"15.11",
"80ed91ce255c765d25be0bb4a02c942fe6311fbf"
"15.10",
"355a7c69d3f907f3612eb406cc7b9c2f55d59b59"
],
"v14": [
"14.16",
"62a86dfc91e0c35a72f2ea5e99e6969b830c0c26"
"14.15",
"c0aedfd3cac447510a2db843b561f0c52901b679"
]
}