Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-16 01:42:55 +00:00)
Merge pull request #10820 from neondatabase/rc/release/2025-02-14
Storage release 2025-02-14
.github/workflows/_build-and-test-locally.yml (vendored, 22 lines changed)
@@ -20,9 +20,14 @@ on:
required: true
type: string
test-cfg:
description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on'
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string
sanitizers:
description: 'enabled or disabled'
required: false
default: 'disabled'
type: string

defaults:
run:

@@ -48,8 +53,6 @@ jobs:
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
env:
BUILD_TYPE: ${{ inputs.build-type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
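Note on the matrix expression in the hunk above: GitHub expands format('{{"include":{0}}}', inputs.test-cfg) and feeds the result to fromJSON(), so the sanitizers state no longer rides inside test-cfg but arrives through the new sanitizers input. An illustration of that expansion, reproduced with jq outside of Actions (the test-cfg value is the one used later in this release):

    # Assumed input: the new-style test-cfg used elsewhere in this release
    test_cfg='[{"pg_version":"v17"}]'
    # format('{{"include":{0}}}', ...) unescapes {{ to { and substitutes {0}:
    printf '{"include":%s}' "$test_cfg" | jq -c .
    # => {"include":[{"pg_version":"v17"}]}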
@@ -89,7 +92,7 @@ jobs:
- name: Set env variables
env:
ARCH: ${{ inputs.arch }}
SANITIZERS: ${{ matrix.sanitizers }}
SANITIZERS: ${{ inputs.sanitizers }}
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then

@@ -167,7 +170,7 @@ jobs:
- name: Run cargo build
env:
WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }}
WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }}
run: |
export ASAN_OPTIONS=detect_leaks=0
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}

@@ -177,7 +180,7 @@ jobs:
- name: Install rust binaries
env:
ARCH: ${{ inputs.arch }}
SANITIZERS: ${{ matrix.sanitizers }}
SANITIZERS: ${{ inputs.sanitizers }}
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/

@@ -225,7 +228,7 @@ jobs:
role-duration-seconds: 18000 # 5 hours

- name: Run rust tests
if: ${{ matrix.sanitizers != 'enabled' }}
if: ${{ inputs.sanitizers != 'enabled' }}
env:
NEXTEST_RETRIES: 3
run: |

@@ -287,6 +290,7 @@ jobs:
DATABASE_URL: postgresql://localhost:1235/storage_controller
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
run: |
export ASAN_OPTIONS=detect_leaks=0
/tmp/neon/bin/neon_local init
/tmp/neon/bin/neon_local storage_controller start

@@ -333,7 +337,7 @@ jobs:
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }}
timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }}
with:
build_type: ${{ inputs.build-type }}
test_selection: regress

@@ -351,7 +355,7 @@ jobs:
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
SANITIZERS: ${{ matrix.sanitizers }}
SANITIZERS: ${{ inputs.sanitizers }}

# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

.github/workflows/_push-to-acr.yml (vendored, 56 lines changed)
@@ -1,56 +0,0 @@
name: Push images to ACR
on:
workflow_call:
inputs:
client_id:
description: Client ID of Azure managed identity or Entra app
required: true
type: string
image_tag:
description: Tag for the container image
required: true
type: string
images:
description: Images to push
required: true
type: string
registry_name:
description: Name of the container registry
required: true
type: string
subscription_id:
description: Azure subscription ID
required: true
type: string
tenant_id:
description: Azure tenant ID
required: true
type: string

jobs:
push-to-acr:
runs-on: ubuntu-22.04
permissions:
contents: read # This is required for actions/checkout
id-token: write # This is required for Azure Login to work.

steps:
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ inputs.client_id }}
subscription-id: ${{ inputs.subscription_id }}
tenant-id: ${{ inputs.tenant_id }}

- name: Login to ACR
run: |
az acr login --name=${{ inputs.registry_name }}

- name: Copy docker images to ACR ${{ inputs.registry_name }}
run: |
images='${{ inputs.images }}'
for image in ${images}; do
docker buildx imagetools create \
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
neondatabase/${image}:${{ inputs.image_tag }}
done

.github/workflows/_push-to-container-registry.yml (vendored, new file, 101 lines)
@@ -0,0 +1,101 @@
name: Push images to Container Registry
on:
workflow_call:
inputs:
# Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
image-map:
description: JSON map of images, mapping from a source image to an array of target images that should be pushed.
required: true
type: string
aws-region:
description: AWS region to log in to. Required when pushing to ECR.
required: false
type: string
aws-account-ids:
description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR.
required: false
type: string
azure-client-id:
description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR.
required: false
type: string
azure-subscription-id:
description: Azure subscription ID. Required when pushing to ACR.
required: false
type: string
azure-tenant-id:
description: Azure tenant ID. Required when pushing to ACR.
required: false
type: string
acr-registry-name:
description: ACR registry name. Required when pushing to ACR.
required: false
type: string
secrets:
docker-hub-username:
description: Docker Hub username. Required when pushing to Docker Hub.
required: false
docker-hub-password:
description: Docker Hub password. Required when pushing to Docker Hub.
required: false
aws-role-to-assume:
description: AWS role to assume. Required when pushing to ECR.
required: false

permissions: {}

defaults:
run:
shell: bash -euo pipefail {0}

jobs:
push-to-container-registry:
runs-on: ubuntu-22.04
permissions:
id-token: write # Required for aws/azure login
steps:
- uses: actions/checkout@v4
with:
sparse-checkout: scripts/push_with_image_map.py
sparse-checkout-cone-mode: false

- name: Print image-map
run: echo '${{ inputs.image-map }}' | jq

- name: Configure AWS credentials
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: "${{ inputs.aws-region }}"
role-to-assume: "${{ secrets.aws-role-to-assume }}"
role-duration-seconds: 3600

- name: Login to ECR
if: contains(inputs.image-map, 'amazonaws.com/')
uses: aws-actions/amazon-ecr-login@v2
with:
registries: "${{ inputs.aws-account-ids }}"

- name: Configure Azure credentials
if: contains(inputs.image-map, 'azurecr.io/')
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ inputs.azure-client-id }}
subscription-id: ${{ inputs.azure-subscription-id }}
tenant-id: ${{ inputs.azure-tenant-id }}

- name: Login to ACR
if: contains(inputs.image-map, 'azurecr.io/')
run: |
az acr login --name=${{ inputs.acr-registry-name }}

- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.docker-hub-username }}
password: ${{ secrets.docker-hub-password }}

- name: Copy docker images to target registries
run: python scripts/push_with_image_map.py
env:
IMAGE_MAP: ${{ inputs.image-map }}
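The final step of the new workflow hands the actual copying to scripts/push_with_image_map.py, which is not part of this diff. As a rough sketch only (not the real script), the mapping step could be expressed in shell with jq and docker buildx, assuming IMAGE_MAP holds the JSON shown in the image-map example above:

    # Sketch, not the actual scripts/push_with_image_map.py:
    # iterate the {"source": ["target", ...]} map and copy each manifest.
    echo "${IMAGE_MAP}" | jq -r 'to_entries[] | .key as $src | .value[] | "\($src) \(.)"' |
      while read -r src dst; do
        docker buildx imagetools create -t "${dst}" "${src}"
      done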

.github/workflows/build_and_test.yml (vendored, 269 lines changed)
@@ -263,8 +263,9 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

benchmarks:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs
if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled())
needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write

@@ -497,7 +498,7 @@ jobs:
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
needs: [ check-permissions, promote-images-dev, tag ]
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit

@@ -571,21 +572,6 @@ jobs:
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600

- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2

- name: Push multi-arch image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon:${{ needs.tag.outputs.build-tag }}

compute-node-image-arch:
needs: [ check-permissions, build-build-tools-image, tag ]
permissions:

@@ -632,16 +618,6 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600

- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2

- uses: docker/login-action@v3
with:
registry: cache.neon.build

@@ -729,21 +705,6 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600

- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2

- name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]

@@ -876,133 +837,109 @@ jobs:
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down

promote-images-dev:
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
generate-image-maps:
needs: [ tag ]
runs-on: ubuntu-22.04

permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read

env:
VERSIONS: v14 v15 v16 v17

outputs:
neon-dev: ${{ steps.generate.outputs.neon-dev }}
neon-prod: ${{ steps.generate.outputs.neon-prod }}
compute-dev: ${{ steps.generate.outputs.compute-dev }}
compute-prod: ${{ steps.generate.outputs.compute-prod }}
steps:
- uses: docker/login-action@v3
- uses: actions/checkout@v4
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
sparse-checkout: scripts/generate_image_maps.py
sparse-checkout-cone-mode: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600
- name: Generate Image Maps
id: generate
run: python scripts/generate_image_maps.py
env:
BUILD_TAG: "${{ needs.tag.outputs.build-tag }}"
BRANCH: "${{ github.ref_name }}"
DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"

- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2

- name: Copy vm-compute-node images to ECR
run: |
for version in ${VERSIONS}; do
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done

promote-images-prod:
needs: [ check-permissions, tag, test-images, promote-images-dev ]
runs-on: ubuntu-22.04
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'

permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: read

env:
VERSIONS: v14 v15 v16 v17

steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 3600

- name: Login to Amazon Dev ECR
uses: aws-actions/amazon-ecr-login@v2

- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

- name: Add latest tag to images
if: github.ref_name == 'main'
run: |
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}

for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}

docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

- name: Configure AWS-prod credentials
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
mask-aws-account-id: true
role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}

- name: Login to prod ECR
uses: docker/login-action@v3
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
with:
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com

- name: Copy all images to prod ECR
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done

push-to-acr-dev:
if: github.ref_name == 'main'
needs: [ tag, promote-images-dev ]
uses: ./.github/workflows/_push-to-acr.yml
push-neon-image-dev:
needs: [ generate-image-maps, neon-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

push-to-acr-prod:
push-compute-image-dev:
needs: [ generate-image-maps, vm-compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}'
aws-region: eu-central-1
aws-account-ids: "369495373322"
azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

push-neon-image-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ tag, promote-images-prod ]
uses: ./.github/workflows/_push-to-acr.yml
needs: [ generate-image-maps, neon-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

push-compute-image-prod:
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
needs: [ generate-image-maps, vm-compute-node-image, test-images ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}'
aws-region: eu-central-1
aws-account-ids: "093970136003"
azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
secrets:
aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}"
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

# This is a bit of a special case so we're not using a generated image map.
add-latest-tag-to-neon-extensions-test-image:
if: github.ref_name == 'main'
needs: [ tag, compute-node-image ]
uses: ./.github/workflows/_push-to-container-registry.yml
with:
image-map: |
{
"docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"],
"docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"]
}
secrets:
docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]

@@ -1084,7 +1021,7 @@ jobs:
exit 1

deploy:
needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
permissions:

@@ -1337,7 +1274,7 @@ jobs:
done

pin-build-tools-image:
needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ]
needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:

@@ -1362,7 +1299,8 @@ jobs:
- check-codestyle-rust
- check-dependencies-rust
- files-changed
- promote-images-dev
- push-compute-image-dev
- push-neon-image-dev
- test-images
- trigger-custom-extensions-build-and-wait
runs-on: ubuntu-22.04

@@ -1379,6 +1317,7 @@ jobs:
|| needs.check-codestyle-python.result == 'skipped'
|| needs.check-codestyle-rust.result == 'skipped'
|| needs.files-changed.result == 'skipped'
|| needs.promote-images-dev.result == 'skipped'
|| needs.push-compute-image-dev.result == 'skipped'
|| needs.push-neon-image-dev.result == 'skipped'
|| needs.test-images.result == 'skipped'
|| needs.trigger-custom-extensions-build-and-wait.result == 'skipped'

@@ -74,7 +74,8 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]'
test-cfg: '[{"pg_version":"v17"}]'
sanitizers: enabled
secrets: inherit

.github/workflows/trigger-e2e-tests.yml (vendored, 38 lines changed)
@@ -15,7 +15,14 @@ env:
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}

cancel-previous-e2e-tests:
needs: [ check-permissions ]
if: github.event_name == 'pull_request'
runs-on: ubuntu-22.04

@@ -29,6 +36,7 @@ jobs:
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

tag:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
outputs:
build-tag: ${{ steps.build-tag.outputs.tag }}

@@ -68,7 +76,7 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Wait for `promote-images-dev` job to finish
- name: Wait for `push-{neon,compute}-image-dev` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
timeout-minutes: 60
run: |

@@ -79,20 +87,20 @@ jobs:
# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..."
exit 1
;;
*)
echo "The 'promote-images-dev' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json
if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then
break
fi
jq -c '.[]' jobs.json | while read -r job; do
case $(echo $job | jq .conclusion) in
failure | cancelled | skipped)
echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..."
exit 1
;;
esac
done
echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..."
sleep 60
done

- name: Set e2e-platforms
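For illustration of the rewritten wait loop above (made-up data; job names and URLs are placeholders): the gh run ... --jq call writes a jobs.json describing the two dev push jobs, and the loop only breaks once both report a "success" conclusion.

    # Hypothetical jobs.json content:
    printf '%s\n' '[{"name":"push-neon-image-dev / push-to-container-registry","conclusion":"success","url":"https://example.invalid/run/1"},{"name":"push-compute-image-dev / push-to-container-registry","conclusion":null,"url":"https://example.invalid/run/1"}]' > jobs.json
    jq '[.[] | select(.conclusion == "success")] | length' jobs.json
    # => 1, so this iteration sleeps 60s and polls again; a failed/cancelled/skipped job exits instead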

Cargo.lock (generated, 65 lines changed)
@@ -786,7 +786,7 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
dependencies = [
"async-trait",
"base64 0.22.1",

@@ -815,7 +815,7 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
dependencies = [
"async-lock",
"async-trait",

@@ -834,7 +834,7 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
dependencies = [
"RustyXML",
"async-lock",

@@ -852,7 +852,7 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
dependencies = [
"RustyXML",
"azure_core",

@@ -872,7 +872,7 @@ dependencies = [
[[package]]
name = "azure_svc_blobstorage"
version = "0.21.0"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
dependencies = [
"azure_core",
"bytes",

@@ -1293,6 +1293,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"jsonwebtoken",
"regex",
"remote_storage",
"serde",

@@ -1320,6 +1321,7 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"notify",

@@ -1433,6 +1435,7 @@ dependencies = [
"comfy-table",
"compute_api",
"futures",
"http-utils",
"humantime",
"humantime-serde",
"hyper 0.14.30",

@@ -2757,6 +2760,38 @@ dependencies = [
"url",
]

[[package]]
name = "http-utils"
version = "0.1.0"
dependencies = [
"anyhow",
"backtrace",
"bytes",
"fail",
"flate2",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"metrics",
"once_cell",
"pprof",
"regex",
"routerify",
"serde",
"serde_json",
"serde_path_to_error",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"url",
"utils",
"uuid",
"workspace_hack",
]

[[package]]
name = "httparse"
version = "1.8.0"

@@ -4111,6 +4146,7 @@ dependencies = [
"futures",
"hex",
"hex-literal",
"http-utils",
"humantime",
"humantime-serde",
"hyper 0.14.30",

@@ -4211,6 +4247,7 @@ dependencies = [
"anyhow",
"bytes",
"futures",
"http-utils",
"pageserver_api",
"postgres",
"reqwest",

@@ -4917,6 +4954,7 @@ dependencies = [
"hostname",
"http 1.1.0",
"http-body-util",
"http-utils",
"humantime",
"humantime-serde",
"hyper 0.14.30",

@@ -5764,6 +5802,7 @@ dependencies = [
"futures",
"hex",
"http 1.1.0",
"http-utils",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",

@@ -5828,6 +5867,7 @@ dependencies = [
name = "safekeeper_client"
version = "0.1.0"
dependencies = [
"http-utils",
"reqwest",
"safekeeper_api",
"serde",

@@ -6410,6 +6450,7 @@ dependencies = [
"fail",
"futures",
"hex",
"http-utils",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",

@@ -6421,10 +6462,13 @@ dependencies = [
"pageserver_client",
"postgres_connection",
"rand 0.8.5",
"regex",
"reqwest",
"routerify",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"safekeeper_api",
"safekeeper_client",
"scoped-futures",
"scopeguard",
"serde",

@@ -7574,48 +7618,38 @@ dependencies = [
"criterion",
"diatomic-waker",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"hyper 0.14.30",
"inferno 0.12.0",
"itertools 0.10.5",
"jemalloc_pprof",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
"once_cell",
"pin-project-lite",
"postgres_connection",
"pprof",
"pq_proto",
"rand 0.8.5",
"regex",
"routerify",
"scopeguard",
"sentry",
"serde",
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"uuid",
"walkdir",
]

@@ -8210,6 +8244,7 @@ dependencies = [
"tracing-core",
"tracing-log",
"url",
"uuid",
"zerocopy",
"zeroize",
"zstd",

@@ -18,6 +18,7 @@ members = [
"storage_scrubber",
"workspace_hack",
"libs/compute_api",
"libs/http-utils",
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",

@@ -229,6 +230,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }

Dockerfile (41 lines changed)
@@ -10,6 +10,28 @@ ARG STABLE_PG_VERSION=16
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1

# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}

# Build Postgres
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
WORKDIR /home/nonroot

@@ -28,6 +50,14 @@ RUN set -e \
&& rm -rf pg_install/build \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .

# Prepare cargo-chef recipe
FROM $REPOSITORY/$IMAGE:$TAG AS plan
WORKDIR /home/nonroot

COPY --chown=nonroot . .

RUN cargo chef prepare --recipe-path recipe.json

# Build neon binaries
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot

@@ -41,9 +71,15 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
COPY --from=plan /home/nonroot/recipe.json recipe.json

ARG ADDITIONAL_RUSTFLAGS=""

RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json

COPY --chown=nonroot . .

ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \

@@ -59,7 +95,7 @@ RUN set -e \
# Build final image
#
FROM debian:${DEBIAN_FLAVOR}
FROM $BASE_IMAGE_SHA
ARG DEFAULT_PG_VERSION
WORKDIR /data

@@ -112,4 +148,3 @@ EXPOSE 6400
EXPOSE 9898

CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
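The BASE_IMAGE_SHA chain added above leans on shell-style ${var/search/replace} substitution in ARG defaults: a known debian flavor is swapped for its pinned index digest, anything else passes through unpinned. A plain-bash illustration of the same substitution (the bookworm digest is taken from the ARG above; the trixie value is just a made-up non-matching flavor):

    BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
    BASE_IMAGE_SHA=debian:bookworm-slim
    BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
    echo "$BASE_IMAGE_SHA"    # debian@sha256:40b10734... (pinned by index digest)

    BASE_IMAGE_SHA=debian:trixie-slim
    BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
    echo "$BASE_IMAGE_SHA"    # debian:trixie-slim, unchanged, so the build proceeds unpinned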
@@ -1,6 +1,29 @@
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

FROM debian:bookworm-slim AS pgcopydb_builder
# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1

# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}

FROM $BASE_IMAGE_SHA AS pgcopydb_builder
ARG DEBIAN_VERSION

# Use strict mode for bash to catch errors early

@@ -9,7 +32,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch

@@ -58,7 +81,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
fi

FROM debian:${DEBIAN_VERSION}-slim AS build_tools
FROM $BASE_IMAGE_SHA AS build_tools
ARG DEBIAN_VERSION

# Add nonroot user

@@ -75,7 +98,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

# System deps

@@ -138,7 +161,8 @@ RUN curl -fsSL \
--output sql_exporter.tar.gz \
&& mkdir /tmp/sql_exporter \
&& tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \
&& rm sql_exporter.tar.gz

# protobuf-compiler (protoc)
ENV PROTOC_VERSION=25.1

@@ -276,6 +300,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33
ARG CARGO_DENY_VERSION=0.16.2
ARG CARGO_HACK_VERSION=0.6.33
ARG CARGO_NEXTEST_VERSION=0.9.85
ARG CARGO_CHEF_VERSION=0.1.71
ARG CARGO_DIESEL_CLI_VERSION=2.2.6
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \

@@ -290,6 +315,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \
--features postgres-bundled --no-default-features && \
rm -rf /home/nonroot/.cargo/registry && \

@@ -83,7 +83,28 @@ ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
ARG ALPINE_CURL_VERSION=8.11.1

# Here are the INDEX DIGESTS for the images we use.
# You can get them following next steps for now:
# 1. Get an authentication token from DockerHub:
# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token)
# 2. Using that token, query index for the given tag:
# curl -s -H "Authorization: Bearer $TOKEN" \
# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \
# -I | grep -i docker-content-digest
# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks
# and updates on regular bases and in automated way.
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1

# Here we use ${var/search/replace} syntax, to check
# if base image is one of the images, we pin image index for.
# If var will match one the known images, we will replace it with the known sha.
# If no match, than value will be unaffected, and will process with no-pinned image.
ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}

# By default, build all PostgreSQL extensions. For quick local testing when you don't
# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal

@@ -94,7 +115,7 @@ ARG EXTENSIONS=all
# Layer "build-deps"
#
#########################################################################################
FROM debian:$DEBIAN_FLAVOR AS build-deps
FROM $BASE_IMAGE_SHA AS build-deps
ARG DEBIAN_VERSION

# Use strict mode for bash to catch errors early

@@ -103,7 +124,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

RUN case $DEBIAN_VERSION in \

@@ -127,7 +148,7 @@ RUN case $DEBIAN_VERSION in \
apt install --no-install-recommends --no-install-suggests -y \
ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \
$VERSION_INSTALLS \
&& apt clean && rm -rf /var/lib/apt/lists/*

@@ -139,11 +160,11 @@ RUN case $DEBIAN_VERSION in \
#########################################################################################
FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION} postgres
COPY vendor/postgres-${PG_VERSION:?} postgres
RUN cd postgres && \
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
--with-icu --with-libxml --with-libxslt --with-lz4" && \
if [ "${PG_VERSION}" != "v14" ]; then \
if [ "${PG_VERSION:?}" != "v14" ]; then \
# zstd is available only from PG15
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
fi && \
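A recurring change in the hunks below is ${PG_VERSION} becoming ${PG_VERSION:?}. In bash (the SHELL these RUN steps use), the :? form turns an unset or empty variable into a hard error instead of silently expanding to an empty string; a minimal illustration:

    unset PG_VERSION
    echo "vendor/postgres-${PG_VERSION}"     # prints "vendor/postgres-" and keeps going
    echo "vendor/postgres-${PG_VERSION:?}"   # aborts with an error like: PG_VERSION: parameter null or not set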
@@ -237,7 +258,7 @@ RUN case "${DEBIAN_VERSION}" in \
|
||||
|
||||
# Postgis 3.5.0 supports v17
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
export POSTGIS_VERSION=3.5.0 \
|
||||
export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \
|
||||
@@ -312,7 +333,7 @@ FROM build-deps AS pgrouting-src
|
||||
ARG DEBIAN_VERSION
|
||||
ARG PG_VERSION
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
export PGROUTING_VERSION=3.6.2 \
|
||||
export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \
|
||||
@@ -358,7 +379,7 @@ COPY compute/patches/plv8-3.1.10.patch .
|
||||
#
|
||||
# Use new version only for v17
|
||||
# because since v3.2, plv8 doesn't include plcoffee and plls extensions
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
export PLV8_TAG=v3.2.3 \
|
||||
;; \
|
||||
@@ -372,7 +393,7 @@ RUN case "${PG_VERSION}" in \
|
||||
git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
|
||||
tar -czf plv8.tar.gz --exclude .git plv8-src && \
|
||||
cd plv8-src && \
|
||||
if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
|
||||
if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
|
||||
|
||||
FROM pg-build AS plv8-build
|
||||
ARG PG_VERSION
|
||||
@@ -392,7 +413,7 @@ RUN \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
# don't break computes with installed old version of plv8
|
||||
cd /usr/local/pgsql/lib/ && \
|
||||
case "${PG_VERSION}" in \
|
||||
case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
ln -s plv8-3.2.3.so plv8-3.1.8.so && \
|
||||
ln -s plv8-3.2.3.so plv8-3.1.5.so && \
|
||||
@@ -729,7 +750,7 @@ FROM build-deps AS timescaledb-src
|
||||
ARG PG_VERSION
|
||||
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v14" | "v15") \
|
||||
export TIMESCALEDB_VERSION=2.10.1 \
|
||||
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
|
||||
@@ -767,7 +788,7 @@ ARG PG_VERSION
|
||||
|
||||
# version-specific, has separate releases for each version
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v14") \
|
||||
export PG_HINT_PLAN_VERSION=14_1_4_1 \
|
||||
export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
|
||||
@@ -843,7 +864,7 @@ ARG PG_VERSION
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
export RDKIT_VERSION=Release_2024_09_1 \
|
||||
export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \
|
||||
@@ -970,7 +991,7 @@ ARG PG_VERSION
|
||||
#
|
||||
# last release v0.40.0 - Jul 22, 2024
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
export SEMVER_VERSION=0.40.0 \
|
||||
export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \
|
||||
@@ -1006,7 +1027,7 @@ ARG PG_VERSION
|
||||
# This is our extension, support stopped in favor of pgvector
|
||||
# TODO: deprecate it
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v14" | "v15") \
|
||||
export PG_EMBEDDING_VERSION=0.3.5 \
|
||||
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
|
||||
@@ -1039,7 +1060,7 @@ ARG PG_VERSION
|
||||
# This is an experimental extension, never got to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
RUN case "${PG_VERSION:?}" in "v17") \
|
||||
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
@@ -1091,7 +1112,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
FROM pg-build-nonroot-with-cargo AS rust-extensions-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
'v17') \
|
||||
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
|
||||
esac && \
|
||||
@@ -1270,7 +1291,7 @@ FROM build-deps AS pgx_ulid-src
|
||||
ARG PG_VERSION
|
||||
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v14" | "v15" | "v16") \
|
||||
;; \
|
||||
*) \
|
||||
@@ -1302,7 +1323,7 @@ FROM build-deps AS pgx_ulid-pgrx12-src
|
||||
ARG PG_VERSION
|
||||
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION}" in \
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
;; \
|
||||
*) \
|
||||
@@ -1430,8 +1451,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
FROM build-deps AS pg_mooncake-src
|
||||
ARG PG_VERSION
|
||||
WORKDIR /ext-src
|
||||
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \
|
||||
echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \
|
||||
echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
|
||||
echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \
|
||||
chmod a+x neon-test.sh
|
||||
@@ -1578,7 +1599,15 @@ ENV BUILD_TAG=$BUILD_TAG
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
|
||||
RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
|
||||
--mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
|
||||
--mount=type=cache,uid=1000,target=/home/nonroot/target \
|
||||
mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \
|
||||
mkdir target-bin && \
|
||||
cp target/release-line-debug-size-lto/compute_ctl \
|
||||
target/release-line-debug-size-lto/fast_import \
|
||||
target/release-line-debug-size-lto/local_proxy \
|
||||
target-bin
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1586,7 +1615,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM debian:$DEBIAN_FLAVOR AS pgbouncer
|
||||
FROM $BASE_IMAGE_SHA AS pgbouncer
|
||||
RUN set -e \
|
||||
&& echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
|
||||
&& apt update \
|
||||
@@ -1607,7 +1636,7 @@ RUN set -e \
|
||||
&& git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
|
||||
&& cd pgbouncer \
|
||||
&& ./autogen.sh \
|
||||
&& LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
|
||||
&& ./configure --prefix=/usr/local/pgbouncer --without-openssl \
|
||||
&& make -j $(nproc) dist_man_MANS= \
|
||||
&& make install dist_man_MANS=
|
||||
|
||||
@@ -1616,13 +1645,12 @@ RUN set -e \
|
||||
# Layer "exporters"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters
|
||||
FROM build-deps AS exporters
|
||||
ARG TARGETARCH
|
||||
# Keep the sql_exporter version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py
|
||||
# See comment at the top of the file regarding `echo`, `-e` and `\n`
|
||||
RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \
|
||||
if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then\
|
||||
postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
|
||||
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
|
||||
sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
|
||||
@@ -1641,6 +1669,29 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30
|
||||
&& echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
|
||||
&& echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "awscli"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS awscli
|
||||
ARG TARGETARCH
|
||||
RUN set -ex; \
|
||||
if [ "${TARGETARCH}" = "amd64" ]; then \
|
||||
TARGETARCH_ALT="x86_64"; \
|
||||
CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \
|
||||
elif [ "${TARGETARCH}" = "arm64" ]; then \
|
||||
TARGETARCH_ALT="aarch64"; \
|
||||
CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \
|
||||
else \
|
||||
echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
|
||||
fi; \
|
||||
curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
|
||||
echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \
|
||||
unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \
|
||||
/tmp/awscliv2/aws/install; \
|
||||
rm -rf /tmp/awscliv2.zip /tmp/awscliv2
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Clean up postgres folder before inclusion
|
||||
@@ -1673,7 +1724,7 @@ USER nonroot
|
||||
|
||||
COPY --chown=nonroot compute compute
|
||||
|
||||
RUN make PG_VERSION="${PG_VERSION}" -C compute
|
||||
RUN make PG_VERSION="${PG_VERSION:?}" -C compute
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1699,15 +1750,15 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/
|
||||
COPY --from=hypopg-src /ext-src/ /ext-src/
|
||||
COPY --from=pg_hashids-src /ext-src/ /ext-src/
|
||||
COPY --from=rum-src /ext-src/ /ext-src/
|
||||
#COPY --from=pgtap-src /ext-src/ /ext-src/
|
||||
COPY --from=pgtap-src /ext-src/ /ext-src/
|
||||
COPY --from=ip4r-src /ext-src/ /ext-src/
|
||||
COPY --from=prefix-src /ext-src/ /ext-src/
|
||||
COPY --from=hll-src /ext-src/ /ext-src/
|
||||
COPY --from=plpgsql_check-src /ext-src/ /ext-src/
|
||||
#COPY --from=timescaledb-src /ext-src/ /ext-src/
|
||||
COPY --from=pg_hint_plan-src /ext-src/ /ext-src/
|
||||
COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src
|
||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
|
||||
COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src
|
||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch
|
||||
COPY --from=pg_cron-src /ext-src/ /ext-src/
|
||||
#COPY --from=pgx_ulid-src /ext-src/ /ext-src/
|
||||
#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/
|
||||
@@ -1736,51 +1787,12 @@ ENV PGDATABASE=postgres
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:$DEBIAN_FLAVOR
|
||||
FROM $BASE_IMAGE_SHA
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Use strict mode for bash to catch errors early
|
||||
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
|
||||
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
|
||||
|
||||
# pgbouncer and its config
|
||||
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
|
||||
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
|
||||
|
||||
# local_proxy and its config
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy
|
||||
RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
|
||||
|
||||
# Metrics exporter binaries and configuration files
|
||||
COPY --from=exporters ./postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter
|
||||
COPY --from=exporters ./sql_exporter /bin/sql_exporter
|
||||
|
||||
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
|
||||
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
|
||||
# Create remote extension download directory
|
||||
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# liblz4-1 for lz4
|
||||
@@ -1790,10 +1802,9 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca
|
||||
# libzstd1 for zstd
|
||||
# libboost* for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
|
||||
# libevent for pgbouncer
|
||||
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc
|
||||
|
||||
RUN apt update && \
|
||||
case $DEBIAN_VERSION in \
|
||||
# Version-specific installs for Bullseye (PG14-PG16):
|
||||
@@ -1828,33 +1839,57 @@ RUN apt update && \
|
||||
libxslt1.1 \
|
||||
libzstd1 \
|
||||
libcurl4 \
|
||||
libevent-2.1-7 \
|
||||
locales \
|
||||
procps \
|
||||
ca-certificates \
|
||||
curl \
|
||||
unzip \
|
||||
$VERSION_INSTALLS && \
|
||||
apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
# aws cli is used by fast_import (curl and unzip above are currently only used for this installation step)
|
||||
ARG TARGETARCH
|
||||
RUN set -ex; \
|
||||
if [ "${TARGETARCH}" = "amd64" ]; then \
|
||||
TARGETARCH_ALT="x86_64"; \
|
||||
CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \
|
||||
elif [ "${TARGETARCH}" = "arm64" ]; then \
|
||||
TARGETARCH_ALT="aarch64"; \
|
||||
CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \
|
||||
else \
|
||||
echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
|
||||
fi; \
|
||||
curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
|
||||
echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \
|
||||
unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \
|
||||
/tmp/awscliv2/aws/install; \
|
||||
rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \
|
||||
true
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache && \
|
||||
# Create remote extension download directory
|
||||
mkdir /usr/local/download_extensions && \
|
||||
chown -R postgres:postgres /usr/local/download_extensions
|
||||
|
||||
# aws cli is used by fast_import
|
||||
COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli
|
||||
|
||||
# pgbouncer and its config
|
||||
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
|
||||
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import
|
||||
|
||||
# local_proxy and its config
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy
|
||||
RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
|
||||
|
||||
# Metrics exporter binaries and configuration files
|
||||
COPY --from=exporters ./postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter
|
||||
COPY --from=exporters ./sql_exporter /bin/sql_exporter
|
||||
|
||||
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
|
||||
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
|
||||
|
||||
# Make the libraries we built available
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
ENV LANG=en_US.utf8
|
||||
USER postgres
|
||||
|
||||
@@ -6,16 +6,16 @@ index da723b8..5328114 100644
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
|
||||
-- No.A-1-2-3
|
||||
DROP EXTENSION pg_hint_plan;
|
||||
-- No.A-1-1-4
|
||||
CREATE SCHEMA other_schema;
|
||||
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
|
||||
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
@@ -35,7 +35,7 @@ index d372459..6282afe 100644
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
CREATE EXTENSION file_fdw;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
|
||||
@@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
|
||||
-- No.A-1-2-3
|
||||
DROP EXTENSION pg_hint_plan;
|
||||
-- No.A-1-1-4
|
||||
CREATE SCHEMA other_schema;
|
||||
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
|
||||
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
|
||||
CREATE EXTENSION pg_hint_plan;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
@@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
CREATE EXTENSION file_fdw;
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
|
||||
+LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
|
||||
@@ -47,7 +47,9 @@ files:
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
|
||||
# regardless of hostname (ALL)
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
|
||||
#
|
||||
# Also allow it to shut down the VM. The fast_import job does that when it's finished.
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -72,8 +74,8 @@ build: |
|
||||
# At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2,
|
||||
# and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset
|
||||
# for debian version migration.
|
||||
#
|
||||
FROM debian:bookworm-slim as libcgroup-builder
|
||||
ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7
|
||||
FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder
|
||||
ENV LIBCGROUP_VERSION=v2.0.3
|
||||
|
||||
RUN set -exu \
|
||||
|
||||
@@ -68,7 +68,8 @@ build: |
|
||||
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
|
||||
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
|
||||
# requires cgroup v2, so we'll build cgroup-tools ourselves.
|
||||
FROM debian:bullseye-slim as libcgroup-builder
|
||||
ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1
|
||||
FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder
|
||||
ENV LIBCGROUP_VERSION=v2.0.3
|
||||
|
||||
RUN set -exu \
|
||||
|
||||
@@ -24,6 +24,7 @@ fail.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
|
||||
@@ -41,19 +41,21 @@ use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::SystemTime;
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Parser;
|
||||
use compute_tools::disk_quota::set_disk_quota;
|
||||
use compute_tools::http::server::Server;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info, warn};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
use compute_tools::compute::{
|
||||
@@ -61,7 +63,6 @@ use compute_tools::compute::{
|
||||
};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version_string;
|
||||
use compute_tools::http::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
@@ -85,6 +86,19 @@ fn parse_remote_ext_config(arg: &str) -> Result<String> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a compute ID if one is not supplied. This exists to keep forward
|
||||
/// compatibility tests working, but will be removed in a future iteration.
|
||||
fn generate_compute_id() -> String {
|
||||
let now = SystemTime::now();
|
||||
|
||||
format!(
|
||||
"compute-{}",
|
||||
now.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs()
|
||||
)
|
||||
}
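A minimal, hypothetical sketch of this fallback in isolation, assuming clap 4's derive API (the struct and binary names below are illustrative, not part of this change): the computed default is evaluated at parse time, so an omitted --compute-id yields a compute-<unix-seconds> value.

use clap::Parser;
use std::time::SystemTime;

fn generate_compute_id() -> String {
    format!(
        "compute-{}",
        SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_secs()
    )
}

#[derive(Parser)]
struct DemoCli {
    /// Falls back to a timestamp-based ID when the flag is omitted.
    #[arg(long, default_value = generate_compute_id())]
    compute_id: String,
}

fn main() {
    // No --compute-id supplied, so the generated default is used.
    let cli = DemoCli::parse_from(["demo"]);
    assert!(cli.compute_id.starts_with("compute-"));
}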
#[derive(Parser)]
|
||||
#[command(rename_all = "kebab-case")]
|
||||
struct Cli {
|
||||
@@ -94,8 +108,20 @@ struct Cli {
|
||||
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
|
||||
pub remote_ext_config: Option<String>,
|
||||
|
||||
#[arg(long, default_value_t = 3080)]
|
||||
pub http_port: u16,
|
||||
/// The port to bind the external listening HTTP server to. Clients running
|
||||
/// outside the compute will talk to the compute through this port. Keep
|
||||
/// the previous name for this argument around for a smoother release
|
||||
/// with the control plane.
|
||||
///
|
||||
/// TODO: Remove the alias after the control plane release which teaches the
|
||||
/// control plane about the renamed argument.
|
||||
#[arg(long, alias = "http-port", default_value_t = 3080)]
|
||||
pub external_http_port: u16,
|
||||
|
||||
/// The port to bind the internal listening HTTP server to. Clients like
|
||||
/// the neon extension (for installing remote extensions) and local_proxy.
|
||||
#[arg(long)]
|
||||
pub internal_http_port: Option<u16>,
|
||||
|
||||
#[arg(short = 'D', long, value_name = "DATADIR")]
|
||||
pub pgdata: String,
|
||||
@@ -130,17 +156,26 @@ struct Cli {
|
||||
#[arg(short = 'S', long, group = "spec-path")]
|
||||
pub spec_path: Option<OsString>,
|
||||
|
||||
#[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])]
|
||||
pub compute_id: Option<String>,
|
||||
#[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())]
|
||||
pub compute_id: String,
|
||||
|
||||
#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")]
|
||||
#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
|
||||
pub control_plane_uri: Option<String>,
|
||||
}
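A hedged sketch of how the rename plus alias behaves, assuming clap 4 derive; PortsDemo is an illustrative stand-in for the real Cli struct. The old --http-port spelling keeps parsing into the renamed field, and the internal port falls back to external + 1, which is how ComputeNode fills it in later in this change.

use clap::Parser;

#[derive(Parser)]
struct PortsDemo {
    #[arg(long, alias = "http-port", default_value_t = 3080)]
    external_http_port: u16,

    #[arg(long)]
    internal_http_port: Option<u16>,
}

fn main() {
    let old_style = PortsDemo::parse_from(["demo", "--http-port", "4000"]);
    let new_style = PortsDemo::parse_from(["demo", "--external-http-port", "4000"]);
    assert_eq!(old_style.external_http_port, new_style.external_http_port);

    // When --internal-http-port is omitted, the compute later falls back to
    // external + 1, matching how ComputeNode fills internal_http_port below.
    let internal = old_style
        .internal_http_port
        .unwrap_or(old_style.external_http_port + 1);
    assert_eq!(internal, 4001);
}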
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let build_tag = init()?;
|
||||
// For historical reasons, the main thread that processes the spec and launches postgres
|
||||
// is synchronous, but we always have this tokio runtime available and we "enter" it so
|
||||
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
|
||||
// from all parts of compute_ctl.
|
||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let _rt_guard = runtime.enter();
|
||||
|
||||
let build_tag = runtime.block_on(init())?;
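A minimal sketch of the pattern described in the comment above, assuming the multi-thread runtime built here: the synchronous main thread enters the runtime once, after which plain functions can reach async code through Handle::current().

fn main() -> anyhow::Result<()> {
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()?;
    let _rt_guard = runtime.enter();

    // Plain, non-async code can now block on async work through the handle.
    let value = tokio::runtime::Handle::current().block_on(async { 40 + 2 });
    assert_eq!(value, 42);

    // tokio::spawn is also available without passing the runtime around explicitly.
    let task = tokio::spawn(async { "background work" });
    tokio::runtime::Handle::current().block_on(task)?;
    Ok(())
}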
let scenario = failpoint_support::init();
|
||||
|
||||
@@ -172,8 +207,8 @@ fn main() -> Result<()> {
|
||||
deinit_and_exit(wait_pg_result);
|
||||
}
|
||||
|
||||
fn init() -> Result<String> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
async fn init() -> Result<String> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
@@ -246,6 +281,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
info!("got spec from cli argument {}", spec_json);
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_str(spec_json)?),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
live_config_allowed: false,
|
||||
});
|
||||
}
|
||||
@@ -255,26 +291,19 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
let file = File::open(Path::new(spec_path))?;
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_reader(file)?),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
live_config_allowed: true,
|
||||
});
|
||||
}
|
||||
|
||||
if cli.compute_id.is_none() {
|
||||
panic!(
|
||||
"compute spec should be provided by one of the following ways: \
|
||||
--spec OR --spec-path OR --control-plane-uri and --compute-id"
|
||||
);
|
||||
};
|
||||
if cli.control_plane_uri.is_none() {
|
||||
panic!("must specify both --control-plane-uri and --compute-id or none");
|
||||
panic!("must specify --control-plane-uri");
|
||||
};
|
||||
|
||||
match get_spec_from_control_plane(
|
||||
cli.control_plane_uri.as_ref().unwrap(),
|
||||
cli.compute_id.as_ref().unwrap(),
|
||||
) {
|
||||
Ok(spec) => Ok(CliSpecParams {
|
||||
spec,
|
||||
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
|
||||
Ok(resp) => Ok(CliSpecParams {
|
||||
spec: resp.0,
|
||||
compute_ctl_config: resp.1,
|
||||
live_config_allowed: true,
|
||||
}),
|
||||
Err(e) => {
|
||||
@@ -291,6 +320,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
struct CliSpecParams {
|
||||
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
|
||||
spec: Option<ComputeSpec>,
|
||||
#[allow(dead_code)]
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
live_config_allowed: bool,
|
||||
}
|
||||
|
||||
@@ -300,6 +331,7 @@ fn wait_spec(
|
||||
CliSpecParams {
|
||||
spec,
|
||||
live_config_allowed,
|
||||
compute_ctl_config: _,
|
||||
}: CliSpecParams,
|
||||
) -> Result<Arc<ComputeNode>> {
|
||||
let mut new_state = ComputeState::new();
|
||||
@@ -319,13 +351,15 @@ fn wait_spec(
|
||||
let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
|
||||
.context("cannot build tokio postgres config from connstr")?;
|
||||
let compute_node = ComputeNode {
|
||||
compute_id: cli.compute_id.clone(),
|
||||
connstr,
|
||||
conn_conf,
|
||||
tokio_conn_conf,
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
http_port: cli.http_port,
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1),
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
@@ -343,10 +377,13 @@ fn wait_spec(
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Launch http service first, so that we can serve control-plane requests
|
||||
// while configuration is still in progress.
|
||||
let _http_handle =
|
||||
launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
// Launch the external HTTP server first, so that we can serve control plane
|
||||
// requests while configuration is still in progress.
|
||||
Server::External(cli.external_http_port).launch(&compute);
|
||||
|
||||
// The internal HTTP server could be launched later, but there isn't much
|
||||
// sense in waiting.
|
||||
Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute);
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
@@ -484,21 +521,6 @@ fn start_postgres(
|
||||
use std::env;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
// Note: it seems like you can make a runtime in an inner scope and
|
||||
// if you start a task in it it won't be dropped. However, make it
|
||||
// in the outermost scope just to be safe.
|
||||
let rt = if env::var_os("AUTOSCALING").is_some() {
|
||||
Some(
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.worker_threads(4)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create tokio runtime for monitor")
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
let token = CancellationToken::new();
|
||||
|
||||
@@ -509,16 +531,19 @@ fn start_postgres(
|
||||
Some(cli.filecache_connstr.clone())
|
||||
};
|
||||
|
||||
let vm_monitor = rt.as_ref().map(|rt| {
|
||||
rt.spawn(vm_monitor::start(
|
||||
let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
|
||||
let vm_monitor = tokio::spawn(vm_monitor::start(
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: Some(cli.cgroup.clone()),
|
||||
pgconnstr,
|
||||
addr: cli.vm_monitor_addr.clone(),
|
||||
})),
|
||||
token.clone(),
|
||||
))
|
||||
});
|
||||
));
|
||||
Some(vm_monitor)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -528,8 +553,6 @@ fn start_postgres(
|
||||
delay_exit,
|
||||
compute,
|
||||
#[cfg(target_os = "linux")]
|
||||
rt,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor,
|
||||
@@ -537,15 +560,13 @@ fn start_postgres(
|
||||
))
|
||||
}
|
||||
|
||||
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
|
||||
type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
|
||||
|
||||
struct StartPostgresResult {
|
||||
delay_exit: bool,
|
||||
// passed through from WaitSpecResult
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
rt: Option<tokio::runtime::Runtime>,
|
||||
#[cfg(target_os = "linux")]
|
||||
token: tokio_util::sync::CancellationToken,
|
||||
#[cfg(target_os = "linux")]
|
||||
@@ -564,10 +585,10 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
PG_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited, so we can join the logs thread.
|
||||
let _ = logs_handle
|
||||
.join()
|
||||
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
|
||||
// Process has exited. Wait for the log collecting task to finish.
|
||||
let _ = tokio::runtime::Handle::current()
|
||||
.block_on(logs_handle)
|
||||
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
|
||||
|
||||
info!("Postgres exited with code {}, shutting down", ecode);
|
||||
exit_code = ecode.code()
|
||||
@@ -588,8 +609,6 @@ fn cleanup_after_postgres_exit(
|
||||
vm_monitor,
|
||||
#[cfg(target_os = "linux")]
|
||||
token,
|
||||
#[cfg(target_os = "linux")]
|
||||
rt,
|
||||
}: StartPostgresResult,
|
||||
) -> Result<bool> {
|
||||
// Terminate the vm_monitor so it releases the file watcher on
|
||||
@@ -602,10 +621,6 @@ fn cleanup_after_postgres_exit(
|
||||
token.cancel();
|
||||
// Kills the actual task running the monitor
|
||||
handle.abort();
|
||||
|
||||
// If handle is some, rt must have been used to produce it, and
|
||||
// hence is also some
|
||||
rt.unwrap().shutdown_timeout(Duration::from_secs(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +60,16 @@ struct Args {
|
||||
pg_lib_dir: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
pg_port: Option<u16>, // port to run postgres on, 5432 is default
|
||||
|
||||
/// Number of CPUs in the system. This is used to configure the number of
/// parallel worker processes for index creation.
|
||||
#[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
|
||||
num_cpus: Option<usize>,
|
||||
|
||||
/// Amount of RAM in the system. This is used to configure shared_buffers
|
||||
/// and maintenance_work_mem.
|
||||
#[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
|
||||
memory_mb: Option<usize>,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
@@ -202,7 +212,16 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.await
|
||||
.context("initdb")?;
|
||||
|
||||
let nproc = num_cpus::get();
|
||||
// If the caller didn't specify CPU / RAM to use for sizing, default to the
// number of CPUs in the system and, somewhat arbitrarily, 256 MB of RAM.
|
||||
let nproc = args.num_cpus.unwrap_or_else(num_cpus::get);
|
||||
let memory_mb = args.memory_mb.unwrap_or(256);
|
||||
|
||||
// Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
|
||||
// maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
|
||||
// available for misc other stuff that PostgreSQL uses memory for.
|
||||
let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
|
||||
let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
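A worked example of the split above (10% of the budget for shared_buffers, 70% for maintenance_work_mem), using the same f32 truncation as the code; the 16 GB figure is only an illustration.

fn sizing(memory_mb: usize) -> (usize, usize) {
    let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
    let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
    (shared_buffers_mb, maintenance_work_mem_mb)
}

fn main() {
    assert_eq!(sizing(256), (25, 179));        // the 256 MB fallback budget
    assert_eq!(sizing(16_384), (1638, 11468)); // e.g. a 16 GB import VM
}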
//
|
||||
// Launch postgres process
|
||||
@@ -212,12 +231,15 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
.arg(&pgdata_dir)
|
||||
.args(["-p", &format!("{pg_port}")])
|
||||
.args(["-c", "wal_level=minimal"])
|
||||
.args(["-c", "shared_buffers=10GB"])
|
||||
.args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
|
||||
.args(["-c", "max_wal_senders=0"])
|
||||
.args(["-c", "fsync=off"])
|
||||
.args(["-c", "full_page_writes=off"])
|
||||
.args(["-c", "synchronous_commit=off"])
|
||||
.args(["-c", "maintenance_work_mem=8388608"])
|
||||
.args([
|
||||
"-c",
|
||||
&format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
|
||||
])
|
||||
.args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
|
||||
.args(["-c", &format!("max_parallel_workers={nproc}")])
|
||||
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
|
||||
|
||||
@@ -140,5 +140,34 @@ pub async fn get_database_schema(
|
||||
warn!("pg_dump stderr: {}", line)
|
||||
}
|
||||
});
|
||||
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct SchemaStream<S> {
|
||||
// We keep a reference to the child process to ensure it stays alive
|
||||
// while the stream is being consumed. When SchemaStream is dropped,
|
||||
// cmd will be dropped, which triggers kill_on_drop and terminates pg_dump
|
||||
cmd: tokio::process::Child,
|
||||
stream: S,
|
||||
}
|
||||
|
||||
impl<S> Stream for SchemaStream<S>
|
||||
where
|
||||
S: Stream<Item = Result<bytes::Bytes, std::io::Error>> + Unpin,
|
||||
{
|
||||
type Item = Result<bytes::Bytes, std::io::Error>;
|
||||
|
||||
fn poll_next(
|
||||
mut self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> std::task::Poll<Option<Self::Item>> {
|
||||
Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx)
|
||||
}
|
||||
}
|
||||
|
||||
let schema_stream = SchemaStream {
|
||||
cmd,
|
||||
stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))),
|
||||
};
|
||||
|
||||
Ok(schema_stream)
|
||||
}
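A hypothetical consumer of the returned stream (the caller side is not shown in this change, so the shape here is an assumption): draining it buffers the full dump, while dropping it early drops the embedded Child, letting kill_on_drop terminate pg_dump.

use futures::StreamExt;

async fn collect_schema<S>(mut schema: S) -> std::io::Result<Vec<u8>>
where
    S: futures::Stream<Item = Result<bytes::Bytes, std::io::Error>> + Unpin,
{
    let mut out = Vec::new();
    while let Some(chunk) = schema.next().await {
        // Propagate pg_dump I/O errors; dropping `schema` here would also drop
        // the child process and terminate pg_dump via kill_on_drop.
        out.extend_from_slice(&chunk?);
    }
    Ok(out)
}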
@@ -9,7 +9,6 @@ use std::str::FromStr;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
@@ -59,6 +58,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0);
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
/// The ID of the compute
|
||||
pub compute_id: String,
|
||||
// Url type maintains proper escaping
|
||||
pub connstr: url::Url,
|
||||
// We connect to Postgres from many different places, so build configs once
|
||||
@@ -81,8 +82,10 @@ pub struct ComputeNode {
|
||||
/// - we push spec and it does configuration
|
||||
/// - but then it is restarted without any spec again
|
||||
pub live_config_allowed: bool,
|
||||
/// The port that the compute's HTTP server listens on
|
||||
pub http_port: u16,
|
||||
/// The port that the compute's external HTTP server listens on
|
||||
pub external_http_port: u16,
|
||||
/// The port that the compute's internal HTTP server listens on
|
||||
pub internal_http_port: u16,
|
||||
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
|
||||
/// To allow the HTTP API server to serve status requests while configuration
/// is in progress, the lock should be held only for short periods of time to do
|
||||
@@ -546,11 +549,7 @@ impl ComputeNode {
|
||||
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
// Run actual work with new tokio runtime
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
|
||||
|
||||
// Record runtime
|
||||
@@ -597,9 +596,9 @@ impl ComputeNode {
|
||||
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited, so we can join the logs thread.
|
||||
let _ = logs_handle
|
||||
.join()
|
||||
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
|
||||
let _ = tokio::runtime::Handle::current()
|
||||
.block_on(logs_handle)
|
||||
.map_err(|e| tracing::error!("log task panicked: {:?}", e));
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
@@ -634,7 +633,7 @@ impl ComputeNode {
|
||||
config::write_postgres_conf(
|
||||
&pgdata_path.join("postgresql.conf"),
|
||||
&pspec.spec,
|
||||
self.http_port,
|
||||
self.internal_http_port,
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
@@ -784,7 +783,7 @@ impl ComputeNode {
|
||||
pub fn start_postgres(
|
||||
&self,
|
||||
storage_auth_token: Option<String>,
|
||||
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
) -> Result<(std::process::Child, tokio::task::JoinHandle<Result<()>>)> {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
@@ -800,7 +799,7 @@ impl ComputeNode {
|
||||
.expect("cannot start postgres process");
|
||||
PG_PID.store(pg.id(), Ordering::SeqCst);
|
||||
|
||||
// Start a thread to collect logs from stderr.
|
||||
// Start a task to collect logs from stderr.
|
||||
let stderr = pg.stderr.take().expect("stderr should be captured");
|
||||
let logs_handle = handle_postgres_logs(stderr);
|
||||
|
||||
@@ -809,20 +808,28 @@ impl ComputeNode {
|
||||
Ok((pg, logs_handle))
|
||||
}
|
||||
|
||||
/// Do post configuration of the already started Postgres. This function spawns a background thread to
|
||||
/// Do post configuration of the already started Postgres. This function spawns a background task to
|
||||
/// configure the database after applying the compute spec. Currently, it upgrades the neon extension
|
||||
/// version. In the future, it may upgrade all 3rd-party extensions.
|
||||
#[instrument(skip_all)]
|
||||
pub fn post_apply_config(&self) -> Result<()> {
|
||||
let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config"));
|
||||
thread::spawn(move || {
|
||||
let func = || {
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config"));
|
||||
tokio::spawn(async move {
|
||||
let res = async {
|
||||
let (mut client, connection) = conf.connect(NoTls).await?;
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
handle_neon_extension_upgrade(&mut client)
|
||||
.await
|
||||
.context("handle_neon_extension_upgrade")?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
if let Err(err) = func() {
|
||||
}
|
||||
.await;
|
||||
if let Err(err) = res {
|
||||
error!("error while post_apply_config: {err:#}");
|
||||
}
|
||||
});
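For reference, a minimal sketch of the tokio_postgres pattern the new async code relies on: connect() yields a client plus a connection future, and the connection must be polled on its own task or the client's queries never complete. The connection string is a placeholder.

use tokio_postgres::NoTls;

async fn demo() -> Result<(), tokio_postgres::Error> {
    let (client, connection) =
        tokio_postgres::connect("host=localhost user=postgres", NoTls).await?;

    // The connection future does the actual socket I/O; it has to run somewhere.
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("connection error: {e}");
        }
    });

    let row = client.query_one("SELECT 1::int4 AS one", &[]).await?;
    let one: i32 = row.get("one");
    assert_eq!(one, 1);
    Ok(())
}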
@@ -919,13 +926,10 @@ impl ComputeNode {
|
||||
conf: Arc<tokio_postgres::Config>,
|
||||
concurrency: usize,
|
||||
) -> Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
info!("Applying config with max {} concurrency", concurrency);
|
||||
debug!("Config: {:?}", spec);
|
||||
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
rt.block_on(async {
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let client = Self::get_maintenance_client(&conf).await?;
|
||||
@@ -1319,14 +1323,18 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
thread::spawn(move || {
|
||||
let conf = conf.as_ref().clone();
|
||||
let mut conf = postgres::config::Config::from(conf);
|
||||
tokio::spawn(async move {
|
||||
let mut conf = conf.as_ref().clone();
|
||||
conf.application_name("compute_ctl:migrations");
|
||||
|
||||
match conf.connect(NoTls) {
|
||||
Ok(mut client) => {
|
||||
if let Err(e) = handle_migrations(&mut client) {
|
||||
match conf.connect(NoTls).await {
|
||||
Ok((mut client, connection)) => {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
if let Err(e) = handle_migrations(&mut client).await {
|
||||
error!("Failed to run migrations: {}", e);
|
||||
}
|
||||
}
|
||||
@@ -1363,16 +1371,11 @@ impl ComputeNode {
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
tokio::spawn(async move {
|
||||
let res = tune_pgbouncer(pgbouncer_settings).await;
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
@@ -1382,41 +1385,42 @@ impl ComputeNode {
|
||||
if let Some(ref local_proxy) = spec.local_proxy_config {
|
||||
info!("configuring local_proxy");
|
||||
|
||||
// Spawn a thread to do the configuration,
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
let _handle = Some(thread::spawn(move || {
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
}
|
||||
}));
|
||||
});
|
||||
}
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
|
||||
config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;
|
||||
|
||||
let max_concurrent_connections = spec.reconfigure_concurrency;
|
||||
if !spec.skip_pg_catalog_updates {
|
||||
let max_concurrent_connections = spec.reconfigure_concurrency;
|
||||
// Temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
||||
// creating new extensions, roles, etc.
|
||||
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
// Temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
||||
// creating new extensions, roles, etc.
|
||||
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
|
||||
self.pg_reload_conf()?;
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
|
||||
conf.application_name("apply_config");
|
||||
let conf = Arc::new(conf);
|
||||
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
|
||||
conf.application_name("apply_config");
|
||||
let conf = Arc::new(conf);
|
||||
let spec = Arc::new(spec.clone());
|
||||
|
||||
let spec = Arc::new(spec.clone());
|
||||
self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
|
||||
}
|
||||
|
||||
self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
@@ -1431,7 +1435,9 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
pub fn start_compute(
|
||||
&self,
|
||||
) -> Result<(std::process::Child, tokio::task::JoinHandle<Result<()>>)> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -1446,16 +1452,11 @@ impl ComputeNode {
|
||||
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
|
||||
let _handle = tokio::spawn(async move {
|
||||
let res = tune_pgbouncer(pgbouncer_settings).await;
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
@@ -1465,10 +1466,10 @@ impl ComputeNode {
|
||||
if let Some(local_proxy) = &pspec.spec.local_proxy_config {
|
||||
info!("configuring local_proxy");
|
||||
|
||||
// Spawn a thread to do the configuration,
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let _handle = tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
}
|
||||
@@ -1487,7 +1488,8 @@ impl ComputeNode {
|
||||
extension_server::create_control_files(remote_extensions, &self.pgbin);
|
||||
|
||||
let library_load_start_time = Utc::now();
|
||||
let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?;
|
||||
|
||||
let library_load_time = Utc::now()
|
||||
.signed_duration_since(library_load_start_time)
|
||||
@@ -1542,7 +1544,7 @@ impl ComputeNode {
|
||||
self.post_apply_config()?;
|
||||
|
||||
let conf = self.get_conn_conf(None);
|
||||
thread::spawn(move || {
|
||||
tokio::task::spawn_blocking(|| {
|
||||
let res = get_installed_extensions(conf);
|
||||
match res {
|
||||
Ok(extensions) => {
|
||||
@@ -1891,7 +1893,6 @@ LIMIT 100",
|
||||
Ok(ext_version)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn prepare_preload_libraries(
|
||||
&self,
|
||||
spec: &ComputeSpec,
|
||||
|
||||
@@ -51,9 +51,12 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let compute = Arc::clone(compute);
|
||||
|
||||
let runtime = tokio::runtime::Handle::current();
|
||||
|
||||
thread::Builder::new()
|
||||
.name("compute-configurator".into())
|
||||
.spawn(move || {
|
||||
let _rt_guard = runtime.enter();
|
||||
configurator_main_loop(&compute);
|
||||
info!("configurator thread is exited");
|
||||
})
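A sketch of the same trick in isolation, assuming the multi-thread runtime entered in main(): the handle is captured before spawning a plain std thread and entered inside it, so a blocking loop like the configurator can still call into tokio when it needs to.

use std::thread;

fn launch_worker() -> thread::JoinHandle<()> {
    // Must be called from inside the runtime context (e.g. after main() entered it).
    let runtime = tokio::runtime::Handle::current();

    thread::Builder::new()
        .name("demo-worker".into())
        .spawn(move || {
            // Enter the captured handle so this plain thread can use tokio too.
            let _rt_guard = runtime.enter();
            let n = tokio::runtime::Handle::current().block_on(async { 7 });
            assert_eq!(n, 7);
        })
        .expect("failed to spawn worker thread")
}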
@@ -4,11 +4,9 @@ use http::{header::CONTENT_TYPE, StatusCode};
|
||||
use serde::Serialize;
|
||||
use tracing::error;
|
||||
|
||||
pub use server::launch_http_server;
|
||||
|
||||
mod extract;
|
||||
mod routes;
|
||||
mod server;
|
||||
pub mod server;
|
||||
|
||||
/// Convenience response builder for JSON responses
|
||||
struct JsonResponse;
|
||||
|
||||
@@ -1,7 +1,21 @@
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use http::StatusCode;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
|
||||
use utils::failpoint_support::apply_failpoint;
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
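Given the alias above, the /failpoints route accepts a JSON array of these configs. A hedged round-trip sketch follows; the struct below is a local mirror just for the demo, and the fail point name is the compute-migration one mentioned later in this change.

use serde::Deserialize;

// Local mirror of FailpointConfig, only for this deserialization demo.
#[derive(Deserialize)]
struct FailpointConfigDemo {
    name: String,
    actions: String,
}

fn main() -> serde_json::Result<()> {
    // Configure the compute-migration fail point to return early; the action
    // string follows the format accepted by fail::cfg.
    let body = r#"[{"name": "compute-migration", "actions": "return"}]"#;
    let req: Vec<FailpointConfigDemo> = serde_json::from_str(body)?;
    assert_eq!(req[0].name, "compute-migration");
    assert_eq!(req[0].actions, "return");
    Ok(())
}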
use crate::http::{extract::Json, JsonResponse};
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::{
|
||||
fmt::Display,
|
||||
net::{IpAddr, Ipv6Addr, SocketAddr},
|
||||
sync::Arc,
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
@@ -26,46 +26,65 @@ use super::routes::{
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
async fn handle_404() -> Response {
|
||||
StatusCode::NOT_FOUND.into_response()
|
||||
}
|
||||
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
/// This middleware function allows compute_ctl to generate its own request ID
|
||||
/// if one isn't supplied. The control plane will always send one as a UUID. The
|
||||
/// neon Postgres extension on the other hand does not send one.
|
||||
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
|
||||
let headers = request.headers_mut();
|
||||
|
||||
if headers.get(X_REQUEST_ID).is_none() {
|
||||
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
|
||||
}
|
||||
|
||||
next.run(request).await
|
||||
/// `compute_ctl` has two servers: internal and external. The internal server
|
||||
/// binds to the loopback interface and handles communication from clients on
|
||||
/// the compute. The external server is what receives communication from the
|
||||
/// control plane, the metrics scraper, etc. We make the distinction because
|
||||
/// certain routes in `compute_ctl` only need to be exposed to local processes
|
||||
/// like Postgres via the neon extension and local_proxy.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum Server {
|
||||
Internal(u16),
|
||||
External(u16),
|
||||
}
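A small self-contained sketch of the client-side view of this split, with the default ports from the CLI change above (3080 external, 3081 internal); the URLs are illustrative.

fn demo_urls(external_port: u16, internal_port: u16) -> (String, String) {
    let status_url = format!("http://localhost:{external_port}/status");
    let ext_url = format!("http://localhost:{internal_port}/extension_server/pg_hint_plan");
    (status_url, ext_url)
}

fn main() {
    let (status, ext) = demo_urls(3080, 3081);
    // Control plane and metrics scrapers keep using the external port ...
    assert_eq!(status, "http://localhost:3080/status");
    // ... while the neon extension now fetches extensions via the internal port,
    // which is why the expected pg_hint_plan test output earlier in this change
    // moves from 3080 to 3081 for extension_server requests.
    assert_eq!(ext, "http://localhost:3081/extension_server/pg_hint_plan");
}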
/// Run the HTTP server and wait on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
let mut app = Router::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route(
|
||||
"/extension_server/{*filename}",
|
||||
post(extension_server::download_extension),
|
||||
)
|
||||
.route("/extensions", post(extensions::install_extension))
|
||||
.route("/grants", post(grants::add_grant))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route("/metrics", get(metrics::get_metrics))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate))
|
||||
.fallback(handle_404)
|
||||
.layer(
|
||||
impl Display for Server {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Server::Internal(_) => f.write_str("internal"),
|
||||
Server::External(_) => f.write_str("external"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Server> for Router<Arc<ComputeNode>> {
|
||||
fn from(server: Server) -> Self {
|
||||
let mut router = Router::<Arc<ComputeNode>>::new();
|
||||
|
||||
router = match server {
|
||||
Server::Internal(_) => {
|
||||
router = router
|
||||
.route(
|
||||
"/extension_server/{*filename}",
|
||||
post(extension_server::download_extension),
|
||||
)
|
||||
.route("/extensions", post(extensions::install_extension))
|
||||
.route("/grants", post(grants::add_grant));
|
||||
|
||||
// Add in any testing support
|
||||
if cfg!(feature = "testing") {
|
||||
use super::routes::failpoints;
|
||||
|
||||
router = router.route("/failpoints", post(failpoints::configure_failpoints));
|
||||
}
|
||||
|
||||
router
|
||||
}
|
||||
Server::External(_) => router
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route("/metrics", get(metrics::get_metrics))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate)),
|
||||
};
|
||||
|
||||
router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
|
||||
ServiceBuilder::new()
|
||||
// Add this middleware since we assume the request ID exists
|
||||
.layer(middleware::from_fn(maybe_add_request_id_header))
|
||||
@@ -105,45 +124,88 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
.with_state(compute);
|
||||
}
|
||||
}
|
||||
|
||||
// Add in any testing support
|
||||
if cfg!(feature = "testing") {
|
||||
use super::routes::failpoints;
|
||||
|
||||
app = app.route("/failpoints", post(failpoints::configure_failpoints))
|
||||
impl Server {
|
||||
async fn handle_404() -> impl IntoResponse {
|
||||
StatusCode::NOT_FOUND
|
||||
}
|
||||
|
||||
// This usually binds to both IPv4 and IPv6 on Linux, see
|
||||
// https://github.com/rust-lang/rust/pull/34440 for more information
|
||||
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
|
||||
let listener = match TcpListener::bind(&addr).await {
|
||||
Ok(listener) => listener,
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed to bind the compute_ctl HTTP server to port {}: {}",
|
||||
port, e
|
||||
);
|
||||
return;
|
||||
async fn handle_405() -> impl IntoResponse {
|
||||
StatusCode::METHOD_NOT_ALLOWED
|
||||
}
|
||||
|
||||
async fn listener(&self) -> Result<TcpListener> {
|
||||
let addr = SocketAddr::new(self.ip(), self.port());
|
||||
let listener = TcpListener::bind(&addr).await?;
|
||||
|
||||
Ok(listener)
|
||||
}
|
||||
|
||||
fn ip(&self) -> IpAddr {
|
||||
match self {
|
||||
// TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
|
||||
// allow binding to localhost
|
||||
Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
|
||||
}
|
||||
};
|
||||
|
||||
if let Ok(local_addr) = listener.local_addr() {
|
||||
info!("compute_ctl HTTP server listening on {}", local_addr);
|
||||
} else {
|
||||
info!("compute_ctl HTTP server listening on port {}", port);
|
||||
}
|
||||
|
||||
if let Err(e) = axum::serve(listener, app).await {
|
||||
error!("compute_ctl HTTP server error: {}", e);
|
||||
fn port(self) -> u16 {
|
||||
match self {
|
||||
Server::Internal(port) => port,
|
||||
Server::External(port) => port,
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve(self, compute: Arc<ComputeNode>) {
|
||||
let listener = self.listener().await.unwrap_or_else(|e| {
|
||||
// If we can't bind, the compute cannot operate correctly
|
||||
panic!(
|
||||
"failed to bind the compute_ctl {} HTTP server to {}: {}",
|
||||
self,
|
||||
SocketAddr::new(self.ip(), self.port()),
|
||||
e
|
||||
);
|
||||
});
|
||||
|
||||
if tracing::enabled!(tracing::Level::INFO) {
|
||||
let local_addr = match listener.local_addr() {
|
||||
Ok(local_addr) => local_addr,
|
||||
Err(_) => SocketAddr::new(self.ip(), self.port()),
|
||||
};
|
||||
|
||||
info!(
|
||||
"compute_ctl {} HTTP server listening at {}",
|
||||
self, local_addr
|
||||
);
|
||||
}
|
||||
|
||||
let router = Router::from(self).with_state(compute);
|
||||
|
||||
if let Err(e) = axum::serve(listener, router).await {
|
||||
error!("compute_ctl {} HTTP server error: {}", self, e);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn launch(self, compute: &Arc<ComputeNode>) {
|
||||
let state = Arc::clone(compute);
|
||||
|
||||
info!("Launching the {} server", self);
|
||||
|
||||
tokio::spawn(self.serve(state));
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate HTTP server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
/// This middleware function allows compute_ctl to generate its own request ID
|
||||
/// if one isn't supplied. The control plane will always send one as a UUID. The
|
||||
/// neon Postgres extension on the other hand does not send one.
|
||||
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
|
||||
let headers = request.headers_mut();
|
||||
if headers.get(X_REQUEST_ID).is_none() {
|
||||
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
|
||||
}
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-server".into())
|
||||
.spawn(move || serve(port, state))?)
|
||||
next.run(request).await
|
||||
}
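A hedged test sketch for the request-ID middleware above, assuming axum 0.7 and tower's ServiceExt::oneshot; the route is a stand-in, and maybe_add_request_id_header is the function defined above.

use axum::body::Body;
use axum::http::{Request, StatusCode};
use axum::{middleware, routing::get, Router};
use tower::ServiceExt; // for `oneshot`

async fn assert_has_request_id(req: Request<Body>) -> StatusCode {
    if req.headers().contains_key("x-request-id") {
        StatusCode::OK
    } else {
        StatusCode::BAD_REQUEST
    }
}

#[tokio::test]
async fn adds_request_id_when_absent() {
    let app = Router::new()
        .route("/", get(assert_has_request_id))
        .layer(middleware::from_fn(maybe_add_request_id_header));

    let response = app
        .oneshot(Request::builder().uri("/").body(Body::empty()).unwrap())
        .await
        .unwrap();

    assert_eq!(response.status(), StatusCode::OK);
}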
@@ -11,7 +11,7 @@ use tracing_subscriber::prelude::*;
|
||||
/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
|
||||
/// `tracing-utils` package description.
|
||||
///
|
||||
pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
// Initialize Logging
|
||||
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
|
||||
@@ -22,7 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
.with_writer(std::io::stderr);
|
||||
|
||||
// Initialize OpenTelemetry
|
||||
let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");
|
||||
let otlp_layer = tracing_utils::init_tracing("compute_ctl").await;
|
||||
|
||||
// Put it all together
|
||||
tracing_subscriber::registry()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::{Context, Result};
|
||||
use fail::fail_point;
|
||||
use postgres::{Client, Transaction};
|
||||
use tokio_postgres::{Client, Transaction};
|
||||
use tracing::{error, info};
|
||||
|
||||
use crate::metrics::DB_MIGRATION_FAILED;
|
||||
@@ -21,10 +21,11 @@ impl<'m> MigrationRunner<'m> {
|
||||
}
|
||||
|
||||
/// Get the current value neon_migration.migration_id
|
||||
fn get_migration_id(&mut self) -> Result<i64> {
|
||||
async fn get_migration_id(&mut self) -> Result<i64> {
|
||||
let row = self
|
||||
.client
|
||||
.query_one("SELECT id FROM neon_migration.migration_id", &[])?;
|
||||
.query_one("SELECT id FROM neon_migration.migration_id", &[])
|
||||
.await?;
|
||||
|
||||
Ok(row.get::<&str, i64>("id"))
|
||||
}
|
||||
@@ -34,7 +35,7 @@ impl<'m> MigrationRunner<'m> {
|
||||
/// This function has a fail point called compute-migration, which can be
|
||||
/// used if you would like to fail the application of a series of migrations
|
||||
/// at some point.
|
||||
fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
|
||||
async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> {
|
||||
// We use this fail point in order to check that failing in the
|
||||
// middle of applying a series of migrations fails in an expected
|
||||
// manner
|
||||
@@ -59,31 +60,38 @@ impl<'m> MigrationRunner<'m> {
|
||||
"UPDATE neon_migration.migration_id SET id = $1",
|
||||
&[&migration_id],
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prepare the migrations the target database for handling migrations
|
||||
fn prepare_database(&mut self) -> Result<()> {
|
||||
async fn prepare_database(&mut self) -> Result<()> {
|
||||
self.client
|
||||
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?;
|
||||
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?;
|
||||
self.client.simple_query(
|
||||
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
|
||||
)?;
|
||||
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
|
||||
.await?;
|
||||
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
|
||||
self.client
|
||||
.simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?;
|
||||
.simple_query(
|
||||
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
|
||||
)
|
||||
.await?;
|
||||
self.client
|
||||
.simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?;
|
||||
.simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")
|
||||
.await?;
|
||||
self.client
|
||||
.simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run an individual migration in a separate transaction block.
|
||||
fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> {
|
||||
async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> {
|
||||
let mut txn = client
|
||||
.transaction()
|
||||
.await
|
||||
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
|
||||
|
||||
if migration.starts_with("-- SKIP") {
|
||||
@@ -92,35 +100,38 @@ impl<'m> MigrationRunner<'m> {
|
||||
// Even though we are skipping the migration, updating the
|
||||
// migration ID should help keep logic easy to understand when
|
||||
// trying to understand the state of a cluster.
|
||||
Self::update_migration_id(&mut txn, migration_id)?;
|
||||
Self::update_migration_id(&mut txn, migration_id).await?;
|
||||
} else {
|
||||
info!("Running migration id={}:\n{}\n", migration_id, migration);
|
||||
|
||||
txn.simple_query(migration)
|
||||
.await
|
||||
.with_context(|| format!("apply migration {migration_id}"))?;
|
||||
|
||||
Self::update_migration_id(&mut txn, migration_id)?;
|
||||
Self::update_migration_id(&mut txn, migration_id).await?;
|
||||
}
|
||||
|
||||
txn.commit()
|
||||
.await
|
||||
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run the configured set of migrations
|
||||
pub fn run_migrations(mut self) -> Result<()> {
|
||||
pub async fn run_migrations(mut self) -> Result<()> {
|
||||
self.prepare_database()
|
||||
.await
|
||||
.context("prepare database to handle migrations")?;
|
||||
|
||||
let mut current_migration = self.get_migration_id()? as usize;
|
||||
let mut current_migration = self.get_migration_id().await? as usize;
|
||||
while current_migration < self.migrations.len() {
|
||||
// The index lags the migration ID by 1, so the current migration
|
||||
// ID is also the next index
|
||||
let migration_id = (current_migration + 1) as i64;
|
||||
let migration = self.migrations[current_migration];
|
||||
|
||||
match Self::run_migration(self.client, migration_id, migration) {
|
||||
match Self::run_migration(self.client, migration_id, migration).await {
|
||||
Ok(_) => {
|
||||
info!("Finished migration id={}", migration_id);
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Child;
|
||||
use std::str::FromStr;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
@@ -16,6 +15,7 @@ use ini::Ini;
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::config::Config;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::timeout;
|
||||
use tokio_postgres;
|
||||
use tokio_postgres::NoTls;
|
||||
@@ -477,23 +477,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
|
||||
/// Spawn a task that will read Postgres logs from `stderr`, join multiline logs
|
||||
/// and send them to the logger. In the future we may also want to add context to
|
||||
/// these logs.
|
||||
pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
|
||||
std::thread::spawn(move || {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to build tokio runtime");
|
||||
|
||||
let res = runtime.block_on(async move {
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)?;
|
||||
handle_postgres_logs_async(stderr).await
|
||||
});
|
||||
if let Err(e) = res {
|
||||
tracing::error!("error while processing postgres logs: {}", e);
|
||||
}
|
||||
pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<Result<()>> {
|
||||
tokio::spawn(async move {
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)?;
|
||||
handle_postgres_logs_async(stderr).await
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use postgres::Client;
|
||||
use reqwest::StatusCode;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
|
||||
use crate::config;
|
||||
@@ -11,7 +11,9 @@ use crate::migration::MigrationRunner;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
|
||||
use compute_api::responses::{
|
||||
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
|
||||
};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
// Do control plane request and return response if any. In case of error it
|
||||
@@ -73,14 +75,13 @@ fn do_control_plane_request(
|
||||
pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
compute_id: &str,
|
||||
) -> Result<Option<ComputeSpec>> {
|
||||
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
|
||||
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
let mut attempt = 1;
|
||||
let mut spec: Result<Option<ComputeSpec>> = Ok(None);
|
||||
|
||||
info!("getting spec from control plane: {}", cp_uri);
|
||||
|
||||
@@ -90,7 +91,7 @@ pub fn get_spec_from_control_plane(
|
||||
// - no spec for compute yet (Empty state) -> return Ok(None)
|
||||
// - got spec -> return Ok(Some(spec))
|
||||
while attempt < 4 {
|
||||
spec = match do_control_plane_request(&cp_uri, &jwt) {
|
||||
let result = match do_control_plane_request(&cp_uri, &jwt) {
|
||||
Ok(spec_resp) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[
|
||||
@@ -99,10 +100,10 @@ pub fn get_spec_from_control_plane(
|
||||
])
|
||||
.inc();
|
||||
match spec_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok(None),
|
||||
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
|
||||
ControlPlaneComputeStatus::Attached => {
|
||||
if let Some(spec) = spec_resp.spec {
|
||||
Ok(Some(spec))
|
||||
Ok((Some(spec), spec_resp.compute_ctl_config))
|
||||
} else {
|
||||
bail!("compute is attached, but spec is empty")
|
||||
}
|
||||
@@ -121,10 +122,10 @@ pub fn get_spec_from_control_plane(
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = &spec {
|
||||
if let Err(e) = &result {
|
||||
error!("attempt {} to get spec failed with: {}", attempt, e);
|
||||
} else {
|
||||
return spec;
|
||||
return result;
|
||||
}
|
||||
|
||||
attempt += 1;
|
||||
@@ -132,7 +133,9 @@ pub fn get_spec_from_control_plane(
|
||||
}
|
||||
|
||||
// All attempts failed, return error.
|
||||
spec
|
||||
Err(anyhow::anyhow!(
|
||||
"Exhausted all attempts to retrieve the spec from the control plane"
|
||||
))
|
||||
}
|
||||
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
@@ -166,17 +169,17 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade");
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
client.simple_query(query).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
pub async fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
info!("handle migrations");
|
||||
|
||||
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
@@ -206,7 +209,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations).run_migrations()?;
|
||||
MigrationRunner::new(client, &migrations)
|
||||
.run_migrations()
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -214,7 +219,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
/// Connect to the database as superuser and pre-create anon extension
|
||||
/// if it is present in shared_preload_libraries
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_extension_anon(
|
||||
pub async fn handle_extension_anon(
|
||||
spec: &ComputeSpec,
|
||||
db_owner: &str,
|
||||
db_client: &mut Client,
|
||||
@@ -227,7 +232,7 @@ pub fn handle_extension_anon(
|
||||
if !grants_only {
|
||||
// check if extension is already initialized using anon.is_initialized()
|
||||
let query = "SELECT anon.is_initialized()";
|
||||
match db_client.query(query, &[]) {
|
||||
match db_client.query(query, &[]).await {
|
||||
Ok(rows) => {
|
||||
if !rows.is_empty() {
|
||||
let is_initialized: bool = rows[0].get(0);
|
||||
@@ -249,7 +254,7 @@ pub fn handle_extension_anon(
|
||||
// Users cannot create it themselves, because superuser is required.
|
||||
let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
|
||||
info!("creating anon extension with query: {}", query);
|
||||
match db_client.query(query, &[]) {
|
||||
match db_client.query(query, &[]).await {
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
error!("anon extension creation failed with error: {}", e);
|
||||
@@ -259,7 +264,7 @@ pub fn handle_extension_anon(
|
||||
|
||||
// check that extension is installed
|
||||
query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
|
||||
let rows = db_client.query(query, &[])?;
|
||||
let rows = db_client.query(query, &[]).await?;
|
||||
if rows.is_empty() {
|
||||
error!("anon extension is not installed");
|
||||
return Ok(());
|
||||
@@ -268,7 +273,7 @@ pub fn handle_extension_anon(
|
||||
// Initialize anon extension
|
||||
// This also requires superuser privileges, so users cannot do it themselves.
|
||||
query = "SELECT anon.init()";
|
||||
match db_client.query(query, &[]) {
|
||||
match db_client.query(query, &[]).await {
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
error!("anon.init() failed with error: {}", e);
|
||||
@@ -279,7 +284,7 @@ pub fn handle_extension_anon(
|
||||
|
||||
// check that extension is installed, if not bail early
|
||||
let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
|
||||
match db_client.query(query, &[]) {
|
||||
match db_client.query(query, &[]).await {
|
||||
Ok(rows) => {
|
||||
if rows.is_empty() {
|
||||
error!("anon extension is not installed");
|
||||
@@ -294,12 +299,12 @@ pub fn handle_extension_anon(
|
||||
|
||||
let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
|
||||
info!("granting anon extension permissions with query: {}", query);
|
||||
db_client.simple_query(&query)?;
|
||||
db_client.simple_query(&query).await?;
|
||||
|
||||
// Grant permissions to db_owner to use anon extension functions
|
||||
let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
|
||||
info!("granting anon extension permissions with query: {}", query);
|
||||
db_client.simple_query(&query)?;
|
||||
db_client.simple_query(&query).await?;
|
||||
|
||||
// This is needed, because some functions are defined as SECURITY DEFINER.
|
||||
// In Postgres SECURITY DEFINER functions are executed with the privileges
|
||||
@@ -314,16 +319,16 @@ pub fn handle_extension_anon(
|
||||
where nsp.nspname = 'anon';", db_owner);
|
||||
|
||||
info!("change anon extension functions owner to db owner");
|
||||
db_client.simple_query(&query)?;
|
||||
db_client.simple_query(&query).await?;
|
||||
|
||||
// affects views as well
|
||||
let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
|
||||
info!("granting anon extension permissions with query: {}", query);
|
||||
db_client.simple_query(&query)?;
|
||||
db_client.simple_query(&query).await?;
|
||||
|
||||
let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
|
||||
info!("granting anon extension permissions with query: {}", query);
|
||||
db_client.simple_query(&query)?;
|
||||
db_client.simple_query(&query).await?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ postgres_backend.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
whoami.workspace = true
|
||||
|
||||
|
||||
@@ -552,8 +552,10 @@ struct EndpointCreateCmdArgs {
|
||||
lsn: Option<Lsn>,
|
||||
#[clap(long)]
|
||||
pg_port: Option<u16>,
|
||||
#[clap(long, alias = "http-port")]
|
||||
external_http_port: Option<u16>,
|
||||
#[clap(long)]
|
||||
http_port: Option<u16>,
|
||||
internal_http_port: Option<u16>,
|
||||
#[clap(long = "pageserver-id")]
|
||||
endpoint_pageserver_id: Option<NodeId>,
|
||||
|
||||
@@ -1353,7 +1355,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
args.pg_port,
|
||||
args.http_port,
|
||||
args.external_http_port,
|
||||
args.internal_http_port,
|
||||
args.pg_version,
|
||||
mode,
|
||||
!args.update_catalog,
|
||||
|
||||
@@ -37,6 +37,8 @@
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv4Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
@@ -46,6 +48,8 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::ComputeCtlConfig;
|
||||
use compute_api::spec::Database;
|
||||
use compute_api::spec::PgIdent;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
@@ -73,7 +77,8 @@ pub struct EndpointConf {
|
||||
timeline_id: TimelineId,
|
||||
mode: ComputeMode,
|
||||
pg_port: u16,
|
||||
http_port: u16,
|
||||
external_http_port: u16,
|
||||
internal_http_port: u16,
|
||||
pg_version: u32,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
@@ -128,7 +133,7 @@ impl ComputeControlPlane {
|
||||
1 + self
|
||||
.endpoints
|
||||
.values()
|
||||
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
|
||||
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port()))
|
||||
.max()
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
@@ -140,18 +145,27 @@ impl ComputeControlPlane {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
pg_port: Option<u16>,
|
||||
http_port: Option<u16>,
|
||||
external_http_port: Option<u16>,
|
||||
internal_http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
skip_pg_catalog_updates: bool,
|
||||
drop_subscriptions_before_start: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
|
||||
let ep = Arc::new(Endpoint {
|
||||
endpoint_id: endpoint_id.to_owned(),
|
||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
|
||||
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
|
||||
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
|
||||
external_http_address: SocketAddr::new(
|
||||
IpAddr::from(Ipv4Addr::UNSPECIFIED),
|
||||
external_http_port,
|
||||
),
|
||||
internal_http_address: SocketAddr::new(
|
||||
IpAddr::from(Ipv4Addr::LOCALHOST),
|
||||
internal_http_port,
|
||||
),
|
||||
env: self.env.clone(),
|
||||
timeline_id,
|
||||
mode,
|
||||
@@ -176,7 +190,8 @@ impl ComputeControlPlane {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mode,
|
||||
http_port,
|
||||
external_http_port,
|
||||
internal_http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates,
|
||||
@@ -230,9 +245,10 @@ pub struct Endpoint {
|
||||
pub timeline_id: TimelineId,
|
||||
pub mode: ComputeMode,
|
||||
|
||||
// port and address of the Postgres server and `compute_ctl`'s HTTP API
|
||||
// port and address of the Postgres server and `compute_ctl`'s HTTP APIs
|
||||
pub pg_address: SocketAddr,
|
||||
pub http_address: SocketAddr,
|
||||
pub external_http_address: SocketAddr,
|
||||
pub internal_http_address: SocketAddr,
|
||||
|
||||
// postgres major version in the format: 14, 15, etc.
|
||||
pg_version: u32,
|
||||
@@ -287,8 +303,15 @@ impl Endpoint {
|
||||
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
|
||||
|
||||
Ok(Endpoint {
|
||||
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
|
||||
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
|
||||
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port),
|
||||
external_http_address: SocketAddr::new(
|
||||
IpAddr::from(Ipv4Addr::UNSPECIFIED),
|
||||
conf.external_http_port,
|
||||
),
|
||||
internal_http_address: SocketAddr::new(
|
||||
IpAddr::from(Ipv4Addr::LOCALHOST),
|
||||
conf.internal_http_port,
|
||||
),
|
||||
endpoint_id,
|
||||
env: env.clone(),
|
||||
timeline_id: conf.timeline_id,
|
||||
@@ -650,24 +673,51 @@ impl Endpoint {
|
||||
println!("Also at '{}'", conn_str);
|
||||
}
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
cmd.args(["--http-port", &self.http_address.port().to_string()])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &conn_str])
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
])
|
||||
.args([
|
||||
"--pgbin",
|
||||
self.env
|
||||
.pg_bin_dir(self.pg_version)?
|
||||
.join("postgres")
|
||||
.to_str()
|
||||
.unwrap(),
|
||||
])
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
//cmd.args([
|
||||
// "--external-http-port",
|
||||
// &self.external_http_address.port().to_string(),
|
||||
//])
|
||||
//.args([
|
||||
// "--internal-http-port",
|
||||
// &self.internal_http_address.port().to_string(),
|
||||
//])
|
||||
cmd.args([
|
||||
"--http-port",
|
||||
&self.external_http_address.port().to_string(),
|
||||
])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &conn_str])
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
])
|
||||
.args([
|
||||
"--pgbin",
|
||||
self.env
|
||||
.pg_bin_dir(self.pg_version)?
|
||||
.join("postgres")
|
||||
.to_str()
|
||||
.unwrap(),
|
||||
])
|
||||
// TODO: It would be nice if we generated compute IDs with the same
|
||||
// algorithm as the real control plane.
|
||||
//
|
||||
// TODO: Add this back when
|
||||
// https://github.com/neondatabase/neon/pull/10747 is merged.
|
||||
//
|
||||
//.args([
|
||||
// "--compute-id",
|
||||
// &format!(
|
||||
// "compute-{}",
|
||||
// SystemTime::now()
|
||||
// .duration_since(UNIX_EPOCH)
|
||||
// .unwrap()
|
||||
// .as_secs()
|
||||
// ),
|
||||
//])
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
|
||||
if let Some(remote_ext_config) = remote_ext_config {
|
||||
cmd.args(["--remote-ext-config", remote_ext_config]);
|
||||
@@ -754,8 +804,8 @@ impl Endpoint {
|
||||
reqwest::Method::GET,
|
||||
format!(
|
||||
"http://{}:{}/status",
|
||||
self.http_address.ip(),
|
||||
self.http_address.port()
|
||||
self.external_http_address.ip(),
|
||||
self.external_http_address.port()
|
||||
),
|
||||
)
|
||||
.send()
|
||||
@@ -828,14 +878,17 @@ impl Endpoint {
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
self.http_address.ip(),
|
||||
self.http_address.port()
|
||||
self.external_http_address.ip(),
|
||||
self.external_http_address.port()
|
||||
))
|
||||
.header(CONTENT_TYPE.as_str(), "application/json")
|
||||
.body(format!(
|
||||
"{{\"spec\":{}}}",
|
||||
serde_json::to_string_pretty(&spec)?
|
||||
))
|
||||
.body(
|
||||
serde_json::to_string(&ConfigurationRequest {
|
||||
spec,
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -357,6 +357,16 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
compaction_l0_first: settings
|
||||
.remove("compaction_l0_first")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_l0_first' as a bool")?,
|
||||
compaction_l0_semaphore: settings
|
||||
.remove("compaction_l0_semaphore")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_l0_semaphore' as a bool")?,
|
||||
l0_flush_delay_threshold: settings
|
||||
.remove("l0_flush_delay_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
|
||||
@@ -17,8 +17,10 @@ use camino::Utf8PathBuf;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
background_process,
|
||||
|
||||
@@ -838,7 +838,10 @@ impl StorageController {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(TenantShardMigrateRequest { node_id }),
|
||||
Some(TenantShardMigrateRequest {
|
||||
node_id,
|
||||
migration_config: None,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -609,7 +609,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest { node_id: node };
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
migration_config: None,
|
||||
};
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
@@ -623,7 +626,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest { node_id: node };
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
migration_config: None,
|
||||
};
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
@@ -1082,7 +1088,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
|
||||
Some(TenantShardMigrateRequest { node_id: mv.to }),
|
||||
Some(TenantShardMigrateRequest {
|
||||
node_id: mv.to,
|
||||
migration_config: None,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
|
||||
|
||||
@@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
|
||||
# We are running tests now
|
||||
rm -f testout.txt testout_contrib.txt
|
||||
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
|
||||
docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
|
||||
docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
|
||||
|
||||
@@ -2,4 +2,4 @@
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
pg_prove test.sql
|
||||
pg_prove -d contrib_regression test.sql
|
||||
docker-compose/ext-src/pgtap-src/test-upgrade.patch (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
diff --git a/test/schedule/create.sql b/test/schedule/create.sql
|
||||
index ba355ed..7e250f5 100644
|
||||
--- a/test/schedule/create.sql
|
||||
+++ b/test/schedule/create.sql
|
||||
@@ -1,3 +1,2 @@
|
||||
\unset ECHO
|
||||
\i test/psql.sql
|
||||
-CREATE EXTENSION pgtap;
|
||||
diff --git a/test/schedule/main.sch b/test/schedule/main.sch
|
||||
index a8a5fbc..0463fc4 100644
|
||||
--- a/test/schedule/main.sch
|
||||
+++ b/test/schedule/main.sch
|
||||
@@ -1,2 +1 @@
|
||||
-test: build
|
||||
test: create
|
||||
docker-compose/ext-src/pgtap-src/test-upgrade.sh (new executable file, 6 lines)
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
cd "$(dirname ${0})"
|
||||
patch -p1 <test-upgrade.patch
|
||||
PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
|
||||
${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --inputdir=test --max-connections=86 --schedule test/schedule/main.sch --schedule test/build/run.sch --dbname contrib_regression --use-existing
|
||||
@@ -41,7 +41,8 @@ EXTENSIONS='[
|
||||
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
|
||||
{"extname": "semver", "extdir": "pg_semver-src"},
|
||||
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
|
||||
{"extname": "pgjwt", "extdir": "pgjwt-src"}
|
||||
{"extname": "pgjwt", "extdir": "pgjwt-src"},
|
||||
{"extname": "pgtap", "extdir": "pgtap-src"}
|
||||
]'
|
||||
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
|
||||
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
|
||||
|
||||
@@ -285,10 +285,10 @@ To summarize, list of cplane changes:
|
||||
|
||||
### storage_controller implementation
|
||||
|
||||
Current 'load everything on startup and keep in memory' easy design is fine.
|
||||
Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
|
||||
byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
|
||||
10^6 of timelines shouldn't take more than 100MB.
|
||||
If desired, we may continue using current 'load everything on startup and keep
|
||||
in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16
|
||||
byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids
|
||||
plus some flags), so 10^6 of timelines shouldn't take more than 100MB.
|
||||
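As a rough, purely illustrative check of that estimate (the sizes below are assumptions, not measurements of the actual storage_controller types):

```
// Back-of-the-envelope version of the estimate above; all numbers are assumptions.
fn main() {
    let tenant_id = 16u64;      // 16-byte tenant_id
    let timeline_id = 16u64;    // 16-byte timeline_id
    let generation = 4u64;      // int generation
    let sk_set = 24u64 + 3 * 8; // Vec header + ~3 safekeeper ids (8 bytes each)
    let flags = 1u64;
    let per_timeline = tenant_id + timeline_id + generation + sk_set + flags;
    let total = per_timeline * 1_000_000; // 10^6 timelines
    println!("~{per_timeline} bytes per timeline, ~{} MiB total", total / (1024 * 1024));
}
```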
|
||||
Similar to pageserver attachment Intents storage_controller would have in-memory
|
||||
`MigrationRequest` (or its absence) for each timeline and pool of tasks trying
|
||||
@@ -296,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller
|
||||
won't do several migrations on the same timeline concurrently. In the first
|
||||
version it is simpler to have more manual control and no retries, i.e. migration
|
||||
failure removes the request. Later we can build retries and automatic
|
||||
scheduling/migration. `MigrationRequest` is
|
||||
scheduling/migration around. `MigrationRequest` is
|
||||
```
|
||||
enum MigrationRequest {
|
||||
To(Vec<NodeId>),
|
||||
@@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).
|
||||
#### Schema
|
||||
|
||||
`safekeepers` table mirroring current `nodes` should be added, except that for
|
||||
`scheduling_policy` field (seems like `status` is a better name for it): it is enough
|
||||
to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
|
||||
`decomissioned`.
|
||||
`scheduling_policy`: it is enough to have at least in the beginning only 3
|
||||
fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
|
||||
3) `decomissioned` (node is removed).
|
||||
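For illustration only, those three states could be modelled as a small enum; this is a hypothetical sketch, not the actual storage_controller type:

```
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SkSchedulingPolicy {
    Active,        // eligible to receive new timelines
    Paused,        // keep existing timelines, but don't assign new ones
    Decomissioned, // node is removed (spelling kept as in the RFC field value)
}
```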
|
||||
`timelines` table:
|
||||
```
|
||||
@@ -324,18 +324,24 @@ table! {
|
||||
timelines (tenant_id, timeline_id) {
|
||||
timeline_id -> Varchar,
|
||||
tenant_id -> Varchar,
|
||||
start_lsn -> pg_lsn,
|
||||
generation -> Int4,
|
||||
sk_set -> Array<Int4>, // list of safekeeper ids
|
||||
new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
|
||||
new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
|
||||
cplane_notified_generation -> Int4,
|
||||
deleted_at -> Nullable<Timestamptz>,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`start_lsn` is needed to create timeline on safekeepers properly, see below. We
|
||||
might also want to add ancestor_timeline_id to preserve the hierarchy, but for
|
||||
this RFC it is not needed.
|
||||
|
||||
#### API
|
||||
|
||||
Node management is similar to pageserver:
|
||||
1) POST `/control/v1/safekeepers` upserts safekeeper.
|
||||
1) POST `/control/v1/safekeepers` inserts safekeeper.
|
||||
2) GET `/control/v1/safekeepers` lists safekeepers.
|
||||
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
|
||||
4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g.
|
||||
@@ -345,25 +351,15 @@ Node management is similar to pageserver:
|
||||
Safekeeper deploy scripts should register safekeeper at storage_controller as
|
||||
they currently do with cplane, under the same id.
|
||||
|
||||
Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
|
||||
would 1) choose initial set of safekeepers; 2) write to the db initial
|
||||
`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
|
||||
case of conflict; 3) create timeline on the majority of safekeepers (already
|
||||
created is ok).
|
||||
Timeline creation/deletion will work through already existing POST and DELETE
|
||||
`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
|
||||
succeed. See next section on the implementation details.
|
||||
|
||||
We don't want to block timeline creation when one safekeeper is down. Currently
|
||||
this is solved by compute implicitly creating timeline on any safekeeper it is
|
||||
connected to. This creates ugly timeline state on safekeeper when timeline is
|
||||
created, but start LSN is not defined yet. It would be nice to remove this; to
|
||||
do that, controller can in the background retry to create timeline on
|
||||
safekeeper(s) which missed that during initial creation call. It can do that
|
||||
through `pull_timeline` from majority so it doesn't need to remember
|
||||
`parent_lsn` in its db.
|
||||
|
||||
Timeline deletion removes the row from the db and forwards deletion to the
|
||||
current configuration members. Without additional actions deletions might leak,
|
||||
see below on this; initially let's ignore these, reporting to cplane success if
|
||||
at least one safekeeper deleted the timeline (this will remove s3 data).
|
||||
We don't want to block timeline creation/deletion when one safekeeper is down.
|
||||
Currently this is worked around by compute implicitly creating timeline on any
|
||||
safekeeper it is connected to. This creates ugly timeline state on safekeeper
|
||||
when timeline is created, but start LSN is not defined yet. Next section
|
||||
describes dealing with this.
|
||||
|
||||
Tenant deletion repeats timeline deletion for all timelines.
|
||||
|
||||
@@ -395,26 +391,6 @@ Similar call should be added for the tenant.
|
||||
It would be great to have some way of subscribing to the results (apart from
|
||||
looking at logs/metrics).
|
||||
|
||||
Migration is executed as described above. One subtlety is that (local) deletion on
|
||||
source safekeeper might fail, which is not a problem if we are going to
|
||||
decomission the node but leaves garbage otherwise. I'd propose in the first version
|
||||
1) Don't attempt deletion at all if node status is `offline`.
|
||||
2) If it failed, just issue warning.
|
||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
||||
safekeeper 2) compare each one against configuration storage: if timeline
|
||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
||||
be deleted under generation number if node is not member of current generation.
|
||||
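A self-contained sketch of that scrub decision rule (types and names here are hypothetical simplifications, not the actual storage_controller code):

```
// Decide what the scrub endpoint should do with one timeline found on a
// safekeeper, per the rule described above.
#[derive(Debug, PartialEq)]
enum ScrubAction {
    DeleteUnconditionally,      // timeline no longer exists in the db at all
    DeleteUnderGeneration(i32), // node is not a member of the current conf
    Keep,
}

struct Membership {
    generation: i32,
    sk_set: Vec<i64>,
    new_sk_set: Option<Vec<i64>>,
}

fn scrub_action(conf: Option<&Membership>, sk_id: i64) -> ScrubAction {
    match conf {
        None => ScrubAction::DeleteUnconditionally,
        Some(c) => {
            let member = c.sk_set.contains(&sk_id)
                || c.new_sk_set.as_ref().map_or(false, |s| s.contains(&sk_id));
            if member {
                ScrubAction::Keep
            } else {
                ScrubAction::DeleteUnderGeneration(c.generation)
            }
        }
    }
}

fn main() {
    let conf = Membership { generation: 5, sk_set: vec![1, 2, 3], new_sk_set: None };
    assert_eq!(scrub_action(Some(&conf), 4), ScrubAction::DeleteUnderGeneration(5));
    assert_eq!(scrub_action(None, 4), ScrubAction::DeleteUnconditionally);
    assert_eq!(scrub_action(Some(&conf), 2), ScrubAction::Keep);
}
```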
|
||||
Automating this is nontrivial; we'd need to register all potential missing
|
||||
deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
|
||||
which switches configurations. Similarly when timeline is fully deleted to
|
||||
prevent cplane operation from blocking when some safekeeper is not available
|
||||
deletion should be also registered.
|
||||
|
||||
One more task pool should infinitely retry notifying control plane about changed
|
||||
safekeeper sets.
|
||||
|
||||
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
|
||||
current in memory state of the timeline and pending `MigrationRequest`,
|
||||
if any.
|
||||
@@ -423,12 +399,153 @@ safekeeper sets.
|
||||
migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
|
||||
(incrementing generation as always).
|
||||
|
||||
#### API implementation and reconciliation
|
||||
|
||||
For timeline creation/deletion we want to preserve the basic assumption that
|
||||
unreachable minority (1 sk of 3) doesn't block their completion, but eventually
|
||||
we want to finish creation/deletion on nodes which missed it (unless they are
|
||||
removed). Similarly for migration; it may and should finish even though excluded
|
||||
members missed their exclusion. And of course e.g. such pending exclusion on
|
||||
node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As
|
||||
another example, if some node missed timeline creation it clearly must not block
|
||||
migration from it. Hence it is natural to have per safekeeper background
|
||||
reconciler which retries these ops until they succeed. There are 3 possible
|
||||
operation types, and the type is defined by timeline state (membership
|
||||
configuration and whether it is deleted) and safekeeper id: we may need to
|
||||
create timeline on sk (node added), locally delete it (node excluded, somewhat
|
||||
similar to detach) or globally delete it (timeline is deleted).
|
||||
|
||||
Next, on storage controller restart in principle these pending operations can be
|
||||
figured out by comparing safekeepers state against storcon state. But it seems
|
||||
better to me to materialize them in the database; it is not expensive, avoids
|
||||
these startup scans which themselves can fail etc and makes it very easy to see
|
||||
outstanding work directly at the source of truth -- the db. So we can add table
|
||||
`safekeeper_timeline_pending_ops`
|
||||
```
|
||||
table! {
|
||||
// timeline_id, sk_id is primary key
|
||||
safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) {
|
||||
sk_id -> int8,
|
||||
tenant_id -> Varchar,
|
||||
timeline_id -> Varchar,
|
||||
generation -> Int4,
|
||||
op_type -> Varchar,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`op_type` can be `include` (seed from peers and ensure generation is up to
|
||||
date), `exclude` (remove locally) and `delete`. Field is actually not strictly
|
||||
needed as it can be computed from current configuration, but gives more explicit
|
||||
observability.
|
||||
|
||||
`generation` is necessary there because after op is done reconciler must remove
|
||||
it and not remove another row with higher gen which in theory might appear.
|
||||
|
||||
Any insert of row should overwrite (remove) all rows with the same sk and
|
||||
timeline id but lower `generation` as next op makes previous obsolete. Insertion
|
||||
of `op_type` `delete` overwrites all rows.
|
||||
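A sketch of scheduling such an op as a single upsert implementing the "newer generation wins, delete wins over everything" rule (column names follow the diesel schema above; the real storage_controller code may structure this differently):

```
use anyhow::Result;
use tokio_postgres::Client;

// op_type is one of "include" | "exclude" | "delete".
async fn schedule_pending_op(
    client: &Client,
    sk_id: i64,
    tenant_id: &str,
    timeline_id: &str,
    generation: i32,
    op_type: &str,
) -> Result<()> {
    client
        .execute(
            "INSERT INTO safekeeper_timeline_pending_ops
                 (sk_id, tenant_id, timeline_id, generation, op_type)
             VALUES ($1, $2, $3, $4, $5)
             ON CONFLICT (sk_id, tenant_id, timeline_id) DO UPDATE
             SET generation = excluded.generation, op_type = excluded.op_type
             WHERE safekeeper_timeline_pending_ops.generation <= excluded.generation
                OR excluded.op_type = 'delete'",
            &[&sk_id, &tenant_id, &timeline_id, &generation, &op_type],
        )
        .await?;
    Ok(())
}
```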
|
||||
About `exclude`: rather than adding explicit safekeeper http endpoint, it is
|
||||
reasonable to reuse membership switch endpoint: if safekeeper is not member
|
||||
of the configuration it locally removes the timeline on the switch. In this case
|
||||
404 should also be considered an 'ok' answer by the caller.
|
||||
|
||||
So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops`
|
||||
joined with timeline configuration to get current conf (with generation `n`)
|
||||
for the safekeeper and does the jobs, infinitely retrying failures:
|
||||
1) If node is member (`include`):
|
||||
- Check if timeline exists on it, if not, call pull_timeline on it from
|
||||
other members
|
||||
- Call switch configuration to the current
|
||||
2) If node is not member (`exclude`):
|
||||
- Call switch configuration to the current, 404 is ok.
|
||||
3) If timeline is deleted (`delete`), call delete.
|
||||
|
||||
In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
|
||||
timeline with generation <= `n` if `op_type` is not `delete`.
|
||||
In case 3 also remove `safekeeper_timeline_pending_ops`
|
||||
entry + remove `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline.
|
||||
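Putting the loop above into Rust-flavoured pseudocode (a minimal sketch under assumed, simplified interfaces; `SafekeeperApi`, `OpStore` and `PendingOp` are hypothetical stand-ins, not the actual storage_controller types; retries, backoff and wakeup plumbing are left out):

```
use anyhow::Result;

enum OpType { Include, Exclude, Delete }

struct PendingOp {
    tenant_id: String,
    timeline_id: String,
    generation: i32,
    op_type: OpType,
}

/// The safekeeper HTTP calls this loop needs (assumed, simplified).
trait SafekeeperApi {
    async fn timeline_exists(&self, tenant: &str, timeline: &str) -> Result<bool>;
    async fn pull_timeline_from_peers(&self, tenant: &str, timeline: &str, conf: &str) -> Result<()>;
    /// Also drops the timeline locally if this node is not a member of `conf`;
    /// a 404 is treated as success by the implementation.
    async fn switch_configuration(&self, tenant: &str, timeline: &str, conf: &str) -> Result<()>;
    async fn delete_timeline(&self, tenant: &str, timeline: &str) -> Result<()>;
}

/// Database access this loop needs (assumed, simplified): pending ops joined
/// with the timeline's current configuration, serialized as a string here.
trait OpStore {
    async fn pending_ops(&self, sk_id: i64) -> Result<Vec<(PendingOp, String)>>;
    /// Removes rows for this sk/timeline with generation <= op.generation
    /// (and the `timelines` row itself once a delete has fully drained).
    async fn complete_op(&self, sk_id: i64, op: &PendingOp) -> Result<()>;
}

async fn reconcile_one_pass(sk_id: i64, sk: &impl SafekeeperApi, db: &impl OpStore) -> Result<()> {
    for (op, conf) in db.pending_ops(sk_id).await? {
        match op.op_type {
            OpType::Include => {
                // Seed from peers if the timeline is missing, then make sure
                // the membership generation on the safekeeper is current.
                if !sk.timeline_exists(&op.tenant_id, &op.timeline_id).await? {
                    sk.pull_timeline_from_peers(&op.tenant_id, &op.timeline_id, &conf).await?;
                }
                sk.switch_configuration(&op.tenant_id, &op.timeline_id, &conf).await?;
            }
            OpType::Exclude => {
                sk.switch_configuration(&op.tenant_id, &op.timeline_id, &conf).await?;
            }
            OpType::Delete => {
                sk.delete_timeline(&op.tenant_id, &op.timeline_id).await?;
            }
        }
        db.complete_op(sk_id, &op).await?;
    }
    Ok(())
}
```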
|
||||
Let's consider in detail how the APIs can be implemented from this angle.
|
||||
|
||||
Timeline creation. It is assumed that cplane retries it until success, so all
|
||||
actions must be idempotent. Now, a tricky point here is timeline start LSN. For
|
||||
the initial (tenant creation) call cplane doesn't know it. However, setting
|
||||
start_lsn on safekeepers during creation is a good thing -- it provides a
|
||||
guarantee that walproposer can always find a common point in WAL histories of
|
||||
safekeeper and its own, and so absence of it would be a clear sign of
|
||||
corruption. The following sequence works:
|
||||
1) Create timeline (or observe that it exists) on pageserver,
|
||||
figuring out last_record_lsn in response.
|
||||
2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the
|
||||
db. Note that last_record_lsn returned on the previous step is movable as it
|
||||
changes once ingestion starts, insert must not overwrite it (as well as other
|
||||
fields like membership conf). On the contrary, start_lsn used in the next
|
||||
step must be set to the value in the db. cplane_notified_generation can be set
|
||||
to 1 (initial generation) in insert to avoid notifying cplane about initial
|
||||
conf as cplane will receive it in timeline creation request anyway.
|
||||
3) Issue timeline creation calls to at least majority of safekeepers. Using
|
||||
majority here is not necessary but handy because it guarantees that any live
|
||||
majority will have at least one sk with created timeline and so
|
||||
reconciliation task can use pull_timeline shared with migration instead of
|
||||
a create-timeline special init case. Of course, if the timeline already exists the call is
|
||||
ignored.
|
||||
4) For minority of safekeepers which could have missed creation insert
|
||||
entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion
|
||||
because response to cplane is sent only after it has happened, and cplane
|
||||
retries the call until 200 response.
|
||||
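A sketch of step 2 of this sequence, i.e. the idempotent insert followed by reading back whatever the db actually holds (table and column names follow the schema sketched earlier in this RFC; the real implementation may differ):

```
use anyhow::Result;
use tokio_postgres::Client;

async fn create_timeline_row(
    client: &Client,
    tenant_id: &str,
    timeline_id: &str,
    start_lsn: &str, // last_record_lsn reported by the pageserver, as text
    sk_set: &[i64],  // chosen safekeeper ids
) -> Result<(String, i32)> {
    client
        .execute(
            "INSERT INTO timelines
                 (tenant_id, timeline_id, start_lsn, generation, sk_set, cplane_notified_generation)
             VALUES ($1, $2, $3::pg_lsn, 1, $4, 1)
             ON CONFLICT (tenant_id, timeline_id) DO NOTHING",
            &[&tenant_id, &timeline_id, &start_lsn, &sk_set],
        )
        .await?;

    // Whatever row exists now is authoritative: an earlier/concurrent creation
    // must not be overwritten, and its start_lsn is the one to use in step 3.
    let row = client
        .query_one(
            "SELECT start_lsn::text, generation FROM timelines
             WHERE tenant_id = $1 AND timeline_id = $2",
            &[&tenant_id, &timeline_id],
        )
        .await?;
    Ok((row.get(0), row.get(1)))
}
```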
|
||||
There is a small question how request handler (timeline creation in this
|
||||
case) would interact with per sk reconciler. As always I prefer to do the
|
||||
simplest possible thing and here it seems to be just waking it up so it
|
||||
re-reads the db for work to do. Passing work in memory is faster, but
|
||||
that shouldn't matter, and path to scan db for work will exist anyway,
|
||||
simpler to reuse it.
|
||||
|
||||
For pg version / wal segment size: while we may persist them in `timelines`
|
||||
table, it is not necessary as initial creation at step 3 can take them from
|
||||
pageserver or cplane creation call and later pull_timeline will carry them
|
||||
around.
|
||||
|
||||
Timeline migration.
|
||||
1) CAS to the db to create joint conf, and in the same transaction create
|
||||
`safekeeper_timeline_pending_ops` `include` entries to initialize new members
|
||||
as well as deliver this conf to current ones; poke per sk reconcilers to work
|
||||
on it. Also any conf change should also poke cplane notifier task(s).
|
||||
2) Once it becomes possible per alg description above, get out of joint conf
|
||||
with another CAS. Task should get wakeups from per sk reconcilers because
|
||||
conf switch is required for advancement; however retries should be sleep
|
||||
based as well as LSN advancement might be needed, though in happy path
|
||||
it isn't. To see whether further transition is possible, on wakeup the migration
|
||||
executor polls safekeepers per the algorithm. CAS creating new conf with only
|
||||
new members should again insert entries to `safekeeper_timeline_pending_ops`
|
||||
to switch them there, as well as `exclude` rows to remove timeline from
|
||||
old members.
|
||||
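For step 1, the CAS itself could look roughly like this (a sketch only; the `safekeeper_timeline_pending_ops` inserts from the earlier sketch would go into the same transaction, and step 2 uses the same pattern to copy `new_sk_set` into `sk_set` and clear it):

```
use anyhow::{bail, Result};
use tokio_postgres::Client;

async fn enter_joint_conf(
    client: &Client,
    tenant_id: &str,
    timeline_id: &str,
    seen_generation: i32,
    new_sk_set: &[i64],
) -> Result<()> {
    let updated = client
        .execute(
            "UPDATE timelines
             SET generation = generation + 1, new_sk_set = $4
             WHERE tenant_id = $1 AND timeline_id = $2
               AND generation = $3 AND new_sk_set IS NULL",
            &[&tenant_id, &timeline_id, &seen_generation, &new_sk_set],
        )
        .await?;
    if updated != 1 {
        // Someone else changed the configuration (or a migration is already in
        // progress): re-read the row and decide again instead of proceeding.
        bail!("membership CAS lost for {tenant_id}/{timeline_id}");
    }
    Ok(())
}
```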
|
||||
Timeline deletion: just set `deleted_at` on the timeline row and insert
|
||||
`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
|
||||
per sk reconcilers.
|
||||
|
||||
When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
|
||||
for it must be cleared in the same transaction.
|
||||
|
||||
One more task pool should infinitely retry notifying control plane about changed
|
||||
safekeeper sets (trying to make `cplane_notified_generation` equal to `generation`).
|
||||
|
||||
#### Dealing with multiple instances of storage_controller
|
||||
|
||||
Operations described above executed concurrently might create some errors but do
|
||||
not prevent progress, so while we normally don't want to run multiple instances
|
||||
of storage_controller it is fine to have it temporarily, e.g. during redeploy.
|
||||
|
||||
To harden against some controller instance creating work in
`safekeeper_timeline_pending_ops` and then disappearing without anyone picking up
the job, per-sk reconcilers should, apart from explicit wakeups, also scan for work
periodically. It is possible to remove that though if all db updates are
|
||||
protected with leadership token/term -- then such scans are needed only after
|
||||
leadership is acquired.
|
||||
|
||||
Any interactions with db update in-memory controller state, e.g. if migration
|
||||
request failed because different one is in progress, controller remembers that
|
||||
and tries to finish it.
|
||||
@@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed.
|
||||
|
||||
We should use Compute <-> safekeeper protocol change to include other (long
|
||||
yearned) modifications:
|
||||
- send data in network order to make arm work.
|
||||
- send data in network order without putting whole structs to be arch independent
|
||||
- remove term_start_lsn from AppendRequest
|
||||
- add horizon to TermHistory
|
||||
- add to ProposerGreeting number of connection from this wp to sk
|
||||
|
||||
@@ -7,6 +7,7 @@ license.workspace = true
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
chrono.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
regex.workspace = true
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
|
||||
use crate::{
|
||||
privilege::Privilege,
|
||||
responses::ComputeCtlConfig,
|
||||
spec::{ComputeSpec, ExtVersion, PgIdent},
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Request of the /configure API
|
||||
///
|
||||
/// We now pass only `spec` in the configuration request, but later we can
|
||||
/// extend it and something like `restart: bool` or something else. So put
|
||||
/// `spec` into a struct initially to be more flexible in the future.
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct ConfigurationRequest {
|
||||
pub spec: ComputeSpec,
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use jsonwebtoken::jwk::JwkSet;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
|
||||
use crate::{
|
||||
@@ -135,13 +136,27 @@ pub struct CatalogObjects {
|
||||
pub databases: Vec<Database>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct ComputeCtlConfig {
|
||||
pub jwks: JwkSet,
|
||||
}
|
||||
|
||||
impl Default for ComputeCtlConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
jwks: JwkSet {
|
||||
keys: Vec::default(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
/// This is not actually a compute API response, so consider moving
|
||||
/// to a different place.
|
||||
#[derive(Deserialize, Debug)]
|
||||
pub struct ControlPlaneSpecResponse {
|
||||
pub spec: Option<ComputeSpec>,
|
||||
pub status: ControlPlaneComputeStatus,
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
|
||||
|
||||
libs/http-utils/Cargo.toml (new file, 37 lines)
@@ -0,0 +1,37 @@
|
||||
[package]
|
||||
name = "http-utils"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
backtrace.workspace = true
|
||||
bytes.workspace = true
|
||||
inferno.workspace = true
|
||||
fail.workspace = true
|
||||
flate2.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jemalloc_pprof.workspace = true
|
||||
once_cell.workspace = true
|
||||
pprof.workspace = true
|
||||
regex.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_path_to_error.workspace = true
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
@@ -1,7 +1,6 @@
|
||||
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
||||
use crate::http::request::{get_query_param, parse_query_param};
|
||||
use crate::error::{api_error_handler, route_error_handler, ApiError};
|
||||
use crate::pprof;
|
||||
use crate::request::{get_query_param, parse_query_param};
|
||||
use ::pprof::protos::Message as _;
|
||||
use ::pprof::ProfilerGuardBuilder;
|
||||
use anyhow::{anyhow, Context};
|
||||
@@ -19,6 +18,7 @@ use tokio::sync::{mpsc, Mutex, Notify};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
use utils::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
|
||||
use std::future::Future;
|
||||
use std::io::Write as _;
|
||||
@@ -718,9 +718,9 @@ pub fn check_permission_with(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use futures::future::poll_fn;
|
||||
use hyper::service::Service;
|
||||
use routerify::RequestServiceBuilder;
|
||||
use std::future::poll_fn;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -5,6 +5,8 @@ use std::error::Error as StdError;
|
||||
use thiserror::Error;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use utils::auth::AuthError;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ApiError {
|
||||
#[error("Bad request: {0:#?}")]
|
||||
@@ -96,6 +98,15 @@ impl ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<AuthError> for ApiError {
|
||||
fn from(_value: AuthError) -> Self {
|
||||
// Don't pass on the value of the AuthError as a precautionary measure.
|
||||
// Being intentionally vague in public error communication hurts debugability
|
||||
// but it is more secure.
|
||||
ApiError::Forbidden("JWT authentication error".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct HttpErrorBody {
|
||||
pub msg: String,
|
||||
libs/http-utils/src/failpoints.rs (new file, 50 lines)
@@ -0,0 +1,50 @@
|
||||
use crate::error::ApiError;
|
||||
use crate::json::{json_request, json_response};
|
||||
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use utils::failpoint_support::apply_failpoint;
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
/// Configure failpoints through http.
|
||||
pub async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot manage failpoints because neon was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1,8 +1,12 @@
|
||||
pub mod endpoint;
|
||||
pub mod error;
|
||||
pub mod failpoints;
|
||||
pub mod json;
|
||||
pub mod pprof;
|
||||
pub mod request;
|
||||
|
||||
extern crate hyper0 as hyper;
|
||||
|
||||
/// Current fast way to apply simple http routing in various Neon binaries.
|
||||
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
|
||||
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};
|
||||
@@ -121,6 +121,7 @@ pub struct ConfigToml {
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
pub page_service_pipelining: PageServicePipeliningConfig,
|
||||
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
pub enable_read_path_debugging: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -262,6 +263,11 @@ pub struct TenantConfigToml {
|
||||
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
|
||||
pub compaction_upper_limit: usize,
|
||||
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
|
||||
/// If true, compact down L0 across all tenant timelines before doing regular compaction.
|
||||
pub compaction_l0_first: bool,
|
||||
/// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only
|
||||
/// has an effect if `compaction_l0_first` is `true`.
|
||||
pub compaction_l0_semaphore: bool,
|
||||
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
|
||||
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
|
||||
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
|
||||
@@ -490,7 +496,7 @@ impl Default for ConfigToml {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            timeline_offloading: false,
            timeline_offloading: true,
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
            virtual_file_io_mode: None,
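Note the behavioral change buried in the hunk above: the pageserver-wide default for `timeline_offloading` flips from `false` to `true` (the per-tenant default later in this diff flips the same way). A tiny sanity-check sketch, assuming `ConfigToml: Default` is in scope:

// Sketch: with the new default, an untouched config opts into timeline offloading.
let cfg = ConfigToml::default();
assert!(cfg.timeline_offloading);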
@@ -510,6 +516,11 @@ impl Default for ConfigToml {
|
||||
} else {
|
||||
GetVectoredConcurrentIo::SidecarTask
|
||||
},
|
||||
enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
|
||||
Some(true)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -537,6 +548,8 @@ pub mod tenant_conf_defaults {
    // most of our pageservers. Compacting ~50 layers requires about 2GB of memory (this could be reduced later by optimizing L0 hole
    // calculation to avoid loading all keys into memory). So with this config, we can get a maximum peak compaction usage of 18GB.
    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
    pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
    pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;

    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
        crate::models::CompactionAlgorithm::Legacy;
@@ -586,6 +599,8 @@ impl Default for TenantConfigToml {
|
||||
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
|
||||
compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
|
||||
l0_flush_delay_threshold: None,
|
||||
l0_flush_stall_threshold: None,
|
||||
l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
|
||||
@@ -616,7 +631,7 @@ impl Default for TenantConfigToml {
|
||||
image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
timeline_offloading: false,
|
||||
timeline_offloading: true,
|
||||
wal_receiver_protocol_override: None,
|
||||
rel_size_v2_enabled: None,
|
||||
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
|
||||
|
||||
@@ -182,6 +182,18 @@ pub struct TenantDescribeResponseShard {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
    #[serde(default)]
    pub migration_config: Option<MigrationConfig>,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct MigrationConfig {
    #[serde(default)]
    #[serde(with = "humantime_serde")]
    pub secondary_warmup_timeout: Option<Duration>,
    #[serde(default)]
    #[serde(with = "humantime_serde")]
    pub secondary_download_request_timeout: Option<Duration>,
}
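Both timeouts are optional and use `humantime_serde`, so they serialize as human-readable duration strings. A minimal construction sketch; the node id and durations are illustrative, and the surrounding types are assumed to be in scope:

// Sketch: building a migrate request whose timeouts serialize as "5m" and "30s".
use std::time::Duration;

let req = TenantShardMigrateRequest {
    node_id: NodeId(42), // placeholder node id
    migration_config: Some(MigrationConfig {
        secondary_warmup_timeout: Some(Duration::from_secs(5 * 60)),
        secondary_download_request_timeout: Some(Duration::from_secs(30)),
    }),
};
let body = serde_json::to_string(&req).expect("request should serialize");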
|
||||
#[derive(Serialize, Clone, Debug)]
|
||||
|
||||
@@ -464,6 +464,10 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_first: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_semaphore: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_delay_threshold: FieldPatch<usize>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_stall_threshold: FieldPatch<usize>,
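Every field here is a `FieldPatch`, and `skip_serializing_if = "FieldPatch::is_noop"` keeps untouched fields out of the wire format, so a PATCH body only names what it changes. A hypothetical patch body (values invented) that enables L0-first compaction and its semaphore while leaving everything else alone:

// Sketch: only the listed keys are patched; omitted fields keep their current values.
let patch_body = serde_json::json!({
    "compaction_l0_first": true,
    "compaction_l0_semaphore": true
});

If `FieldPatch` follows the usual upsert/remove/noop split, sending an explicit null for a key would clear that setting rather than leave it untouched.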
@@ -529,6 +533,8 @@ pub struct TenantConfig {
|
||||
pub compaction_upper_limit: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||
pub compaction_l0_first: Option<bool>,
|
||||
pub compaction_l0_semaphore: Option<bool>,
|
||||
pub l0_flush_delay_threshold: Option<usize>,
|
||||
pub l0_flush_stall_threshold: Option<usize>,
|
||||
pub l0_flush_wait_upload: Option<bool>,
|
||||
@@ -567,6 +573,8 @@ impl TenantConfig {
|
||||
mut compaction_threshold,
|
||||
mut compaction_upper_limit,
|
||||
mut compaction_algorithm,
|
||||
mut compaction_l0_first,
|
||||
mut compaction_l0_semaphore,
|
||||
mut l0_flush_delay_threshold,
|
||||
mut l0_flush_stall_threshold,
|
||||
mut l0_flush_wait_upload,
|
||||
@@ -606,6 +614,10 @@ impl TenantConfig {
|
||||
.compaction_upper_limit
|
||||
.apply(&mut compaction_upper_limit);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch.compaction_l0_first.apply(&mut compaction_l0_first);
|
||||
patch
|
||||
.compaction_l0_semaphore
|
||||
.apply(&mut compaction_l0_semaphore);
|
||||
patch
|
||||
.l0_flush_delay_threshold
|
||||
.apply(&mut l0_flush_delay_threshold);
|
||||
@@ -669,6 +681,8 @@ impl TenantConfig {
|
||||
compaction_threshold,
|
||||
compaction_upper_limit,
|
||||
compaction_algorithm,
|
||||
compaction_l0_first,
|
||||
compaction_l0_semaphore,
|
||||
l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload,
|
||||
@@ -1122,7 +1136,24 @@ pub struct TimelineInfo {
    pub ancestor_lsn: Option<Lsn>,
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option<Lsn>,

    /// Legacy field for compat with the control plane. Synonym of `min_readable_lsn`.
    /// TODO: remove once the control plane no longer reads it.
    pub latest_gc_cutoff_lsn: Lsn,

    /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
    /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
    /// as it is easier to reason about.
    pub applied_gc_cutoff_lsn: Lsn,

    /// The upper bound of data which is either already GC'ed, or eligible to be GC'ed at any time based on the PITR interval.
    /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest
    /// LSN at which it is legal to create a branch or ephemeral endpoint.
    ///
    /// Note that holders of valid LSN leases may be able to create branches and read pages earlier
    /// than this LSN, but new leases may not be taken out earlier than this LSN.
    pub min_readable_lsn: Lsn,

    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have successfully uploaded to remote storage

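Read together, the comments above give a simple client-side rule for branching: the requested LSN must not be older than `min_readable_lsn` (LSN leases aside). A small validation sketch, assuming a `TimelineInfo` value in hand; using the last ingested record as the upper bound is an assumption, not stated in this hunk:

// Sketch: deciding whether a requested branch point looks legal to a client.
// Holders of LSN leases may be allowed below min_readable_lsn; ignored here.
fn is_valid_branch_point(info: &TimelineInfo, requested: Lsn) -> bool {
    requested >= info.min_readable_lsn && requested <= info.last_record_lsn
}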
@@ -21,23 +21,17 @@ bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
diatomic-waker.workspace = true
|
||||
flate2.workspace = true
|
||||
git-version.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
humantime.workspace = true
|
||||
hyper0 = { workspace = true, features = ["full"] }
|
||||
inferno.workspace = true
|
||||
itertools.workspace = true
|
||||
fail.workspace = true
|
||||
futures = { workspace = true }
|
||||
jemalloc_pprof.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
pprof.workspace = true
|
||||
regex.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
@@ -54,8 +48,6 @@ rand.workspace = true
|
||||
scopeguard.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
@@ -64,12 +56,6 @@ metrics.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
|
||||
serde_path_to_error.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -10,7 +10,7 @@ use jsonwebtoken::{
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{http::error::ApiError, id::TenantId};
|
||||
use crate::id::TenantId;
|
||||
|
||||
/// Algorithm to use. We require EdDSA.
|
||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
@@ -90,15 +90,6 @@ impl Display for AuthError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<AuthError> for ApiError {
|
||||
fn from(_value: AuthError) -> Self {
|
||||
// Don't pass on the value of the AuthError as a precautionary measure.
|
||||
// Being intentionally vague in public error communication hurts debugability
|
||||
// but it is more secure.
|
||||
ApiError::Forbidden("JWT authentication error".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwtAuth {
|
||||
decoding_keys: Vec<DecodingKey>,
|
||||
validation: Validation,
|
||||
|
||||
@@ -1,4 +1,5 @@
use std::fmt::{Debug, Display};
use std::time::Duration;

use futures::Future;
use tokio_util::sync::CancellationToken;
@@ -29,6 +30,11 @@ pub async fn exponential_backoff(
    }
}

pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration {
    let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    Duration::from_secs_f64(seconds)
}
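A quick usage sketch of the new non-sleeping helper; the base increment and cap below are made-up values, callers pass their own:

// Sketch: precomputing retry delays without sleeping.
let delays: Vec<std::time::Duration> = (0..5u32)
    .map(|attempt| exponential_backoff_duration(attempt, 0.1, 10.0))
    .collect();
// Attempt 0 yields a zero delay (see the n == 0 branch in the function below).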
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
|
||||
@@ -1,13 +1,6 @@
|
||||
//! Failpoint support code shared between pageserver and safekeepers.
|
||||
|
||||
use crate::http::{
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
};
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
/// Declare a failpoint that can use to `pause` failpoint action.
|
||||
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
@@ -184,45 +177,3 @@ fn exit_failpoint() {
|
||||
tracing::info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
/// Configure failpoints through http.
|
||||
pub async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot manage failpoints because neon was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
//! between other crates in this repository.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
extern crate hyper0 as hyper;
|
||||
|
||||
pub mod backoff;
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
@@ -33,9 +31,6 @@ pub mod shard;
|
||||
mod hex;
|
||||
pub use hex::Hex;
|
||||
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
// definition of the Generation type for pageserver attachment APIs
|
||||
pub mod generation;
|
||||
|
||||
@@ -96,8 +91,6 @@ pub mod circuit_breaker;
|
||||
|
||||
pub mod try_rcu;
|
||||
|
||||
pub mod pprof;
|
||||
|
||||
pub mod guard_arc_swap;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
|
||||
@@ -79,6 +79,7 @@ pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
storage_broker.workspace = true
|
||||
tenant_size_model.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
|
||||
@@ -11,6 +11,7 @@ testing = [ "pageserver_api/testing" ]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
reqwest = { workspace = true, features = [ "stream" ] }
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use std::{collections::HashMap, error::Error as _};
|
||||
|
||||
use bytes::Bytes;
|
||||
use detach_ancestor::AncestorDetached;
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
|
||||
use detach_ancestor::AncestorDetached;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
@@ -42,8 +42,8 @@ use utils::lsn::Lsn;
pub enum BasebackupError {
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#}")]
    Client(#[source] io::Error),
    #[error("basebackup client error {0:#} when {1}")]
    Client(#[source] io::Error, &'static str),
}

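The added `&'static str` tags each failure with its call site, and that tag lands in the rendered message. A minimal illustration; the error value and site string are invented:

// Sketch: how the two-field Client variant displays via its #[error(...)] attribute.
let err = BasebackupError::Client(
    std::io::Error::new(std::io::ErrorKind::BrokenPipe, "connection reset"),
    "send_tarball,flush",
);
// Renders roughly as: "basebackup client error connection reset when send_tarball,flush"
println!("{err}");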
/// Create basebackup with non-rel data in it.
|
||||
@@ -234,7 +234,7 @@ where
|
||||
self.ar
|
||||
.append(&header, self.buf.as_slice())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "flush"))?;
|
||||
|
||||
self.total_blocks += nblocks;
|
||||
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
@@ -273,9 +273,9 @@ where
|
||||
for dir in subdirs.iter() {
|
||||
let header = new_tar_header_dir(dir)?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.append(&header, io::empty())
|
||||
.await
|
||||
.context("could not add directory to basebackup tarball")?;
|
||||
.map_err(|e| BasebackupError::Client(e, "send_tarball"))?;
|
||||
}
|
||||
|
||||
// Send config files.
|
||||
@@ -286,13 +286,13 @@ where
|
||||
self.ar
|
||||
.append(&header, data)
|
||||
.await
|
||||
.context("could not add config file to basebackup tarball")?;
|
||||
.map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?;
|
||||
} else {
|
||||
let header = new_tar_header(filepath, 0)?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.append(&header, io::empty())
|
||||
.await
|
||||
.context("could not add config file to basebackup tarball")?;
|
||||
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?;
|
||||
}
|
||||
}
|
||||
if !lazy_slru_download {
|
||||
@@ -406,7 +406,7 @@ where
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
.map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?;
|
||||
}
|
||||
|
||||
if min_restart_lsn != Lsn::MAX {
|
||||
@@ -419,7 +419,7 @@ where
|
||||
self.ar
|
||||
.append(&header, &data[..])
|
||||
.await
|
||||
.context("could not add restart.lsn file to basebackup tarball")?;
|
||||
.map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?;
|
||||
}
|
||||
for xid in self
|
||||
.timeline
|
||||
@@ -451,9 +451,9 @@ where
|
||||
let crc32 = crc32c::crc32c(&content);
|
||||
content.extend_from_slice(&crc32.to_le_bytes());
|
||||
let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
|
||||
self.ar.append(&header, &*content).await.context(
|
||||
"could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
|
||||
)?;
|
||||
self.ar.append(&header, &*content).await.map_err(|e| {
|
||||
BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint")
|
||||
})?;
|
||||
}
|
||||
|
||||
fail_point!("basebackup-before-control-file", |_| {
|
||||
@@ -464,7 +464,10 @@ where
|
||||
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file().await?;
|
||||
self.ar.finish().await.map_err(BasebackupError::Client)?;
|
||||
self.ar
|
||||
.finish()
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
@@ -482,9 +485,9 @@ where
|
||||
let file_name = dst.to_segfile_name(0);
|
||||
let header = new_tar_header(&file_name, 0)?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.append(&header, io::empty())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -515,7 +518,7 @@ where
|
||||
self.ar
|
||||
.append(&header, segment_data.as_slice())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?;
|
||||
|
||||
seg += 1;
|
||||
startblk = endblk;
|
||||
@@ -566,7 +569,7 @@ where
|
||||
self.ar
|
||||
.append(&header, pg_version_str.as_bytes())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?;
|
||||
|
||||
info!("timeline.pg_version {}", self.timeline.pg_version);
|
||||
|
||||
@@ -576,7 +579,7 @@ where
|
||||
self.ar
|
||||
.append(&header, &img[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?;
|
||||
} else {
|
||||
warn!("global/pg_filenode.map is missing");
|
||||
}
|
||||
@@ -612,9 +615,9 @@ where
|
||||
let path = format!("base/{}", dbnode);
|
||||
let header = new_tar_header_dir(&path)?;
|
||||
self.ar
|
||||
.append(&header, &mut io::empty())
|
||||
.append(&header, io::empty())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
@@ -627,14 +630,14 @@ where
|
||||
self.ar
|
||||
.append(&header, pg_version_str.as_bytes())
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;
|
||||
|
||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &img[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
@@ -663,7 +666,7 @@ where
|
||||
self.ar
|
||||
.append(&header, &buf[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -693,7 +696,7 @@ where
|
||||
zenith_signal.as_bytes(),
|
||||
)
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
|
||||
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
@@ -718,7 +721,7 @@ where
|
||||
self.ar
|
||||
.append(&header, &pg_control_bytes[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?;
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
@@ -742,7 +745,7 @@ where
|
||||
self.ar
|
||||
.append(&header, &wal_seg[..])
|
||||
.await
|
||||
.map_err(BasebackupError::Client)?;
|
||||
.map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -592,7 +592,7 @@ fn start_pageserver(
|
||||
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let service = http_utils::RouterService::new(router).unwrap();
|
||||
let server = hyper0::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown({
|
||||
|
||||
@@ -193,6 +193,10 @@ pub struct PageServerConf {
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
|
||||
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
|
||||
|
||||
/// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
|
||||
/// files read.
|
||||
pub enable_read_path_debugging: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -355,6 +359,7 @@ impl PageServerConf {
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -440,6 +445,7 @@ impl PageServerConf {
|
||||
.unwrap_or_default(),
|
||||
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
@@ -8,7 +8,6 @@ use std::time::Duration;
|
||||
|
||||
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
@@ -463,45 +462,18 @@ impl DeletionQueueClient {
|
||||
///
|
||||
/// The `current_generation` is the generation of this pageserver's current attachment. The
|
||||
/// generations in `layers` are the generations in which those layers were written.
|
||||
pub(crate) async fn push_layers(
|
||||
pub(crate) fn push_layers(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
current_generation: Generation,
|
||||
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
if current_generation.is_none() {
|
||||
debug!("Enqueuing deletions in legacy mode, skipping queue");
|
||||
// None generations are not valid for attached tenants: they must always be attached in
|
||||
// a known generation. None generations are still permitted for layers in the index because
|
||||
// they may be historical.
|
||||
assert!(!current_generation.is_none());
|
||||
|
||||
let mut layer_paths = Vec::new();
|
||||
for (layer, meta) in layers {
|
||||
layer_paths.push(remote_layer_path(
|
||||
&tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
meta.shard,
|
||||
&layer,
|
||||
meta.generation,
|
||||
));
|
||||
}
|
||||
self.push_immediate(layer_paths).await?;
|
||||
return self.flush_immediate().await;
|
||||
}
|
||||
|
||||
self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
|
||||
}
|
||||
|
||||
/// When a Tenant has a generation, push_layers is always synchronous because
|
||||
/// the ListValidator channel is an unbounded channel.
|
||||
///
|
||||
/// This can be merged into push_layers when we remove the Generation-less mode
|
||||
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
|
||||
pub(crate) fn push_layers_sync(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
current_generation: Generation,
|
||||
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
metrics::DELETION_QUEUE
|
||||
.keys_submitted
|
||||
.inc_by(layers.len() as u64);
|
||||
@@ -957,14 +929,12 @@ mod test {

        // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
        info!("Pushing");
        client
            .push_layers(
                tenant_shard_id,
                TIMELINE_ID,
                now_generation,
                [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
            )
            .await?;
        client.push_layers(
            tenant_shard_id,
            TIMELINE_ID,
            now_generation,
            [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
        )?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);

        assert_local_files(&[], &deletion_prefix);
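The dropped `.await` in the calls above is safe because, as the push_layers doc comment earlier in this diff notes, the validator channel is unbounded, so pushing onto it never blocks. A standalone sketch of the underlying idea (tokio only; names hypothetical):

// Sketch: sending on an unbounded tokio channel is a plain synchronous call,
// which is what lets push_layers shed its `async` qualifier.
use tokio::sync::mpsc;

fn enqueue(tx: &mpsc::UnboundedSender<String>, item: String) {
    // `send` only errors if the receiver has been dropped; no await, no backpressure.
    let _ = tx.send(item);
}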
@@ -1017,14 +987,12 @@ mod test {
|
||||
assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
|
||||
|
||||
tracing::debug!("Pushing...");
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
stale_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
stale_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
// We enqueued the operation in a stale generation: it should have failed validation
|
||||
tracing::debug!("Flushing...");
|
||||
@@ -1032,14 +1000,12 @@ mod test {
|
||||
assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
|
||||
|
||||
tracing::debug!("Pushing...");
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
latest_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
latest_generation,
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
// We enqueued the operation in a fresh generation: it should have passed validation
|
||||
tracing::debug!("Flushing...");
|
||||
@@ -1074,28 +1040,24 @@ mod test {
|
||||
// generation gets that treatment)
|
||||
let remote_layer_file_name_historical =
|
||||
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation.previous(),
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation.previous(),
|
||||
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
// Inject a deletion in the generation before generation_now: after restart,
|
||||
// this deletion should get executed, because we execute deletions in the
|
||||
// immediately previous generation on the same node.
|
||||
let remote_layer_file_name_previous =
|
||||
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
|
||||
client
|
||||
.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
|
||||
)
|
||||
.await?;
|
||||
client.push_layers(
|
||||
tenant_shard_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
|
||||
)?;
|
||||
|
||||
client.flush().await?;
|
||||
assert_remote_files(
|
||||
@@ -1139,6 +1101,7 @@ pub(crate) mod mock {
|
||||
use tracing::info;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
pub struct ConsumerState {
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::{
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
|
||||
tasks::sleep_random,
|
||||
},
|
||||
CancellableTask, DiskUsageEvictionTask,
|
||||
};
|
||||
@@ -210,14 +211,8 @@ async fn disk_usage_eviction_task(
|
||||
info!("disk usage based eviction task finishing");
|
||||
};
|
||||
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
{
|
||||
if random_init_delay(task_config.period, &cancel)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
return;
|
||||
}
|
||||
if sleep_random(task_config.period, &cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut iteration_no = 0;
|
||||
|
||||
@@ -1080,7 +1080,10 @@ components:
|
||||
type: integer
|
||||
state:
|
||||
type: string
|
||||
latest_gc_cutoff_lsn:
|
||||
min_readable_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
applied_gc_cutoff_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
|
||||
@@ -13,6 +13,12 @@ use enumset::EnumSet;
|
||||
use futures::future::join_all;
|
||||
use futures::StreamExt;
|
||||
use futures::TryFutureExt;
|
||||
use http_utils::endpoint::{
|
||||
profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
|
||||
};
|
||||
use http_utils::failpoints::failpoints_handler;
|
||||
use http_utils::request::must_parse_query_param;
|
||||
use http_utils::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
use humantime::format_rfc3339;
|
||||
use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
@@ -60,13 +66,6 @@ use tokio::time::Instant;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::{
|
||||
profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
|
||||
};
|
||||
use utils::http::request::must_parse_query_param;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -104,6 +103,13 @@ use crate::tenant::OffloadedTimeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use http_utils::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
json::{json_request, json_request_maybe, json_response},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
||||
TimelineInfo,
|
||||
@@ -111,13 +117,6 @@ use pageserver_api::models::{
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
generation::Generation,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
json::{json_request, json_request_maybe, json_response},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
@@ -483,6 +482,11 @@ async fn build_timeline_info_common(
|
||||
|
||||
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
|
||||
|
||||
let min_readable_lsn = std::cmp::max(
|
||||
timeline.get_gc_cutoff_lsn(),
|
||||
*timeline.get_applied_gc_cutoff_lsn(),
|
||||
);
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_shard_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -494,7 +498,12 @@ async fn build_timeline_info_common(
|
||||
initdb_lsn,
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
// Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally
|
||||
// we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we
|
||||
// actually trimmed data to), which can pass each other when PITR is changed.
|
||||
latest_gc_cutoff_lsn: min_readable_lsn,
|
||||
min_readable_lsn,
|
||||
applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
|
||||
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
|
||||
current_logical_size_is_accurate: match current_logical_size.accuracy() {
|
||||
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||
@@ -561,7 +570,7 @@ async fn reload_auth_validation_keys_handler(
|
||||
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
|
||||
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
|
||||
|
||||
match JwtAuth::from_key_path(key_path) {
|
||||
match utils::auth::JwtAuth::from_key_path(key_path) {
|
||||
Ok(new_auth) => {
|
||||
shared_auth.swap(new_auth);
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -2152,6 +2161,7 @@ async fn timeline_compact_handler(
|
||||
let state = get_state(&request);
|
||||
|
||||
let mut flags = EnumSet::empty();
|
||||
flags |= CompactFlags::NoYield; // run compaction to completion
|
||||
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
|
||||
flags |= CompactFlags::ForceL0Compaction;
|
||||
|
||||
@@ -489,7 +489,6 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
|
||||
let timeline = tenant_shard
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(GetActiveTimelineError::Timeline)?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
@@ -774,11 +773,11 @@ impl PageServerHandler {
|
||||
|
||||
let batched_msg = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelExists,
|
||||
@@ -793,11 +792,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelSize,
|
||||
@@ -812,11 +810,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetDbSize,
|
||||
@@ -831,11 +828,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetSlruSegment,
|
||||
@@ -850,12 +846,20 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
|
||||
// avoid a somewhat costly Span::record() by constructing the entire span in one go.
|
||||
macro_rules! mkspan {
|
||||
(before shard routing) => {{
|
||||
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
|
||||
}};
|
||||
($shard_id:expr) => {{
|
||||
tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! respond_error {
|
||||
($error:expr) => {{
|
||||
($span:expr, $error:expr) => {{
|
||||
let error = BatchedFeMessage::RespondError {
|
||||
span,
|
||||
span: $span,
|
||||
error: BatchedPageStreamError {
|
||||
req: req.hdr,
|
||||
err: $error,
|
||||
@@ -868,27 +872,35 @@ impl PageServerHandler {
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
let shard = match timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Page(key))
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await
|
||||
{
|
||||
Ok(tl) => tl,
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return respond_error!(PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into()
|
||||
));
|
||||
}
|
||||
Err(e) => {
|
||||
return respond_error!(e.into());
|
||||
let span = mkspan!(before shard routing);
|
||||
match e {
|
||||
GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
//
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return respond_error!(
|
||||
span,
|
||||
PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into()
|
||||
)
|
||||
);
|
||||
}
|
||||
e => {
|
||||
return respond_error!(span, e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
let span = mkspan!(shard.tenant_shard_id.shard_slug());
|
||||
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
@@ -902,7 +914,7 @@ impl PageServerHandler {
|
||||
&shard,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
&shard.get_latest_gc_cutoff_lsn(),
|
||||
&shard.get_applied_gc_cutoff_lsn(),
|
||||
ctx,
|
||||
)
|
||||
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
|
||||
@@ -910,7 +922,7 @@ impl PageServerHandler {
|
||||
{
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
return respond_error!(e);
|
||||
return respond_error!(span, e);
|
||||
}
|
||||
};
|
||||
BatchedFeMessage::GetPage {
|
||||
@@ -922,11 +934,10 @@ impl PageServerHandler {
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessage::Test(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
|
||||
let timer =
|
||||
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
|
||||
.await?;
|
||||
@@ -1358,7 +1369,7 @@ impl PageServerHandler {
|
||||
.take()
|
||||
.expect("implementation error: timeline_handles should not be locked");
|
||||
|
||||
let request_span = info_span!("request", shard_id = tracing::field::Empty);
|
||||
let request_span = info_span!("request");
|
||||
let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
|
||||
PageServicePipeliningConfig::Pipelined(pipelining_config) => {
|
||||
self.handle_pagerequests_pipelined(
|
||||
@@ -1799,7 +1810,7 @@ impl PageServerHandler {
|
||||
req: &PagestreamExistsRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
@@ -1826,7 +1837,7 @@ impl PageServerHandler {
|
||||
req: &PagestreamNblocksRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
@@ -1853,7 +1864,7 @@ impl PageServerHandler {
|
||||
req: &PagestreamDbSizeRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
@@ -1943,7 +1954,7 @@ impl PageServerHandler {
|
||||
req: &PagestreamGetSlruSegmentRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
@@ -2039,7 +2050,8 @@ impl PageServerHandler {
|
||||
{
|
||||
fn map_basebackup_error(err: BasebackupError) -> QueryError {
|
||||
match err {
|
||||
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
|
||||
// TODO: passthrough the error site to the final error message?
|
||||
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
|
||||
BasebackupError::Server(e) => QueryError::Other(e),
|
||||
}
|
||||
}
|
||||
@@ -2052,6 +2064,7 @@ impl PageServerHandler {
|
||||
.unwrap()
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
|
||||
if timeline.is_archived() == Some(true) {
|
||||
// TODO after a grace period, turn this log line into a hard error
|
||||
@@ -2059,7 +2072,7 @@ impl PageServerHandler {
|
||||
//return Err(QueryError::NotFound("timeline is archived".into()))
|
||||
}
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
@@ -2139,10 +2152,12 @@ impl PageServerHandler {
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
}
|
||||
writer
|
||||
.flush()
|
||||
.await
|
||||
.map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?;
|
||||
writer.flush().await.map_err(|e| {
|
||||
map_basebackup_error(BasebackupError::Client(
|
||||
e,
|
||||
"handle_basebackup_request,flush",
|
||||
))
|
||||
})?;
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)
|
||||
|
||||
@@ -611,7 +611,7 @@ impl Timeline {
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
pausable_failpoint!("find-lsn-for-timestamp-pausable");
|
||||
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
|
||||
let gc_cutoff_planned = {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
gc_info.min_cutoff()
|
||||
|
||||
@@ -328,8 +328,8 @@ pub enum TaskKind {
|
||||
// Eviction. One per timeline.
|
||||
Eviction,
|
||||
|
||||
// Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
|
||||
IngestHousekeeping,
|
||||
// Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.).
|
||||
TenantHousekeeping,
|
||||
|
||||
/// See [`crate::disk_usage_eviction_task`].
|
||||
DiskUsageEviction,
|
||||
|
||||
@@ -20,6 +20,7 @@ use chrono::NaiveDateTime;
|
||||
use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::CompactInfoResponse;
|
||||
use pageserver_api::models::LsnLease;
|
||||
@@ -39,6 +40,8 @@ use remote_timeline_client::manifest::{
|
||||
use remote_timeline_client::UploadQueueNotReadyError;
|
||||
use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
|
||||
use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
|
||||
use secondary::heatmap::HeatMapTenant;
|
||||
use secondary::heatmap::HeatMapTimeline;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
@@ -51,10 +54,14 @@ use timeline::compaction::GcCompactionQueue;
|
||||
use timeline::import_pgdata;
|
||||
use timeline::offload::offload_timeline;
|
||||
use timeline::offload::OffloadError;
|
||||
use timeline::CompactFlags;
|
||||
use timeline::CompactOptions;
|
||||
use timeline::CompactionError;
|
||||
use timeline::PreviousHeatmap;
|
||||
use timeline::ShutdownMode;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
use tokio::sync::Notify;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -258,6 +265,7 @@ struct TimelinePreload {
|
||||
timeline_id: TimelineId,
|
||||
client: RemoteTimelineClient,
|
||||
index_part: Result<MaybeDeletedIndexPart, DownloadError>,
|
||||
previous_heatmap: Option<PreviousHeatmap>,
|
||||
}
|
||||
|
||||
pub(crate) struct TenantPreload {
|
||||
@@ -349,6 +357,9 @@ pub struct Tenant {
|
||||
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
|
||||
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
|
||||
|
||||
/// Signals the tenant compaction loop that there is L0 compaction work to be done.
|
||||
pub(crate) l0_compaction_trigger: Arc<Notify>,
|
||||
|
||||
/// Scheduled gc-compaction tasks.
|
||||
scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
|
||||
|
||||
@@ -1121,6 +1132,7 @@ impl Tenant {
|
||||
resources: TimelineResources,
|
||||
mut index_part: IndexPart,
|
||||
metadata: TimelineMetadata,
|
||||
previous_heatmap: Option<PreviousHeatmap>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
cause: LoadTimelineCause,
|
||||
ctx: &RequestContext,
|
||||
@@ -1151,6 +1163,7 @@ impl Tenant {
|
||||
let timeline = self.create_timeline_struct(
|
||||
timeline_id,
|
||||
&metadata,
|
||||
previous_heatmap,
|
||||
ancestor.clone(),
|
||||
resources,
|
||||
CreateTimelineCause::Load,
|
||||
@@ -1550,8 +1563,18 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
|
||||
// pulled the first heatmap. Not entirely necessary since the storage controller
|
||||
// will kick the secondary in any case and cause a download.
|
||||
let maybe_heatmap_at = self.read_on_disk_heatmap().await;
|
||||
|
||||
let timelines = self
|
||||
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
|
||||
.load_timelines_metadata(
|
||||
remote_timeline_ids,
|
||||
remote_storage,
|
||||
maybe_heatmap_at,
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(TenantPreload {
|
||||
@@ -1564,6 +1587,26 @@ impl Tenant {
|
||||
})
|
||||
}
|
||||
|
||||
    async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
        let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
        match tokio::fs::read_to_string(on_disk_heatmap_path).await {
            Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
                Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
                Err(err) => {
                    error!("Failed to deserialize old heatmap: {err}");
                    None
                }
            },
            Err(err) => match err.kind() {
                std::io::ErrorKind::NotFound => None,
                _ => {
                    error!("Unexpected IO error reading old heatmap: {err}");
                    None
                }
            },
        }
    }
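The read above is deliberately best-effort: a missing file is silent, and any other IO or deserialization error is logged and treated as "no heatmap". For context, the result is later split per timeline and wrapped as `PreviousHeatmap::Active`, roughly as in this sketch (mirroring `load_timelines_metadata` elsewhere in this diff):

// Sketch of the consumption side: one PreviousHeatmap per timeline, stamped with
// the instant at which the on-disk heatmap was read.
if let Some((heatmap, read_at)) = self.read_on_disk_heatmap().await {
    let mut per_timeline = heatmap.into_timelines_index();
    if let Some(h) = per_timeline.remove(&timeline_id) {
        let _previous = PreviousHeatmap::Active { heatmap: h, read_at };
        // ...handed to load_timeline_metadata for this timeline
    }
}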
|
||||
///
|
||||
/// Background task that downloads all data for a tenant and brings it to Active state.
|
||||
///
|
||||
@@ -1651,7 +1694,10 @@ impl Tenant {
|
||||
match index_part {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => {
|
||||
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
|
||||
remote_index_and_client.insert(timeline_id, (index_part, preload.client));
|
||||
remote_index_and_client.insert(
|
||||
timeline_id,
|
||||
(index_part, preload.client, preload.previous_heatmap),
|
||||
);
|
||||
}
|
||||
MaybeDeletedIndexPart::Deleted(index_part) => {
|
||||
info!(
|
||||
@@ -1670,7 +1716,7 @@ impl Tenant {
|
||||
// layer file.
|
||||
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
|
||||
for (timeline_id, remote_metadata) in sorted_timelines {
|
||||
let (index_part, remote_client) = remote_index_and_client
|
||||
let (index_part, remote_client, previous_heatmap) = remote_index_and_client
|
||||
.remove(&timeline_id)
|
||||
.expect("just put it in above");
|
||||
|
||||
@@ -1690,12 +1736,8 @@ impl Tenant {
|
||||
timeline_id,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
previous_heatmap,
|
||||
self.get_timeline_resources_for(remote_client),
|
||||
LoadTimelineCause::Attach,
|
||||
ctx,
|
||||
)
|
||||
@@ -1844,11 +1886,13 @@ impl Tenant {
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn load_remote_timeline(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
previous_heatmap: Option<PreviousHeatmap>,
|
||||
resources: TimelineResources,
|
||||
cause: LoadTimelineCause,
|
||||
ctx: &RequestContext,
|
||||
@@ -1878,6 +1922,7 @@ impl Tenant {
|
||||
resources,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
previous_heatmap,
|
||||
ancestor,
|
||||
cause,
|
||||
ctx,
|
||||
@@ -1889,14 +1934,29 @@ impl Tenant {
|
||||
self: &Arc<Tenant>,
|
||||
timeline_ids: HashSet<TimelineId>,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
heatmap: Option<(HeatMapTenant, std::time::Instant)>,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
|
||||
let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));
|
||||
|
||||
let mut part_downloads = JoinSet::new();
|
||||
for timeline_id in timeline_ids {
|
||||
let cancel_clone = cancel.clone();
|
||||
|
||||
let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
|
||||
hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
|
||||
heatmap: h,
|
||||
read_at: hs.1,
|
||||
})
|
||||
});
|
||||
part_downloads.spawn(
|
||||
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
|
||||
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||
self.load_timeline_metadata(
|
||||
timeline_id,
|
||||
remote_storage.clone(),
|
||||
previous_timeline_heatmap,
|
||||
cancel_clone,
|
||||
)
|
||||
.instrument(info_span!("download_index_part", %timeline_id)),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1944,6 +2004,7 @@ impl Tenant {
|
||||
self: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
previous_heatmap: Option<PreviousHeatmap>,
|
||||
cancel: CancellationToken,
|
||||
) -> impl Future<Output = TimelinePreload> {
|
||||
let client = self.build_timeline_client(timeline_id, remote_storage);
|
||||
@@ -1959,6 +2020,7 @@ impl Tenant {
|
||||
client,
|
||||
timeline_id,
|
||||
index_part,
|
||||
previous_heatmap,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2070,7 +2132,12 @@ impl Tenant {
|
||||
})?;
|
||||
|
||||
let timeline_preload = self
|
||||
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
|
||||
.load_timeline_metadata(
|
||||
timeline_id,
|
||||
self.remote_storage.clone(),
|
||||
None,
|
||||
cancel.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
let index_part = match timeline_preload.index_part {
|
||||
@@ -2104,6 +2171,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
index_part,
|
||||
remote_metadata,
|
||||
None,
|
||||
timeline_resources,
|
||||
LoadTimelineCause::Unoffload,
|
||||
&ctx,
|
||||
@@ -2819,7 +2887,7 @@ impl Tenant {
|
||||
};
|
||||
let metadata = index_part.metadata.clone();
|
||||
self
|
||||
.load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
|
||||
.load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
|
||||
create_guard: timeline_create_guard, activate, }, &ctx)
|
||||
.await?
|
||||
.ready_to_activate()
|
||||
@@ -2898,150 +2966,194 @@ impl Tenant {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Perform one compaction iteration.
/// This function is periodically called by compactor task.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
/// Performs one compaction iteration. Called periodically from the compaction loop. Returns
/// whether another compaction is needed, if we still have pending work or if we yield for
/// immediate L0 compaction.
///
/// Returns whether we have pending compaction task.
/// Compaction can also be explicitly requested for a timeline via the HTTP API.
async fn compaction_iteration(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<CompactionOutcome, timeline::CompactionError> {
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
) -> Result<CompactionOutcome, CompactionError> {
// Don't compact inactive tenants.
if !self.is_active() {
return Ok(CompactionOutcome::Done);
return Ok(CompactionOutcome::Skipped);
}
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.load();
|
||||
|
||||
// Note that compaction usually requires deletions, but we don't respect
|
||||
// may_delete_layers_hint here: that is because tenants in AttachedMulti
|
||||
// should proceed with compaction even if they can't do deletion, to avoid
|
||||
// accumulating dangerously deep stacks of L0 layers. Deletions will be
|
||||
// enqueued inside RemoteTimelineClient, and executed layer if/when we transition
|
||||
// to AttachedSingle state.
|
||||
if !conf.location.may_upload_layers_hint() {
|
||||
info!("Skipping compaction in location state {:?}", conf.location);
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
// Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
|
||||
// since we need to compact L0 even in AttachedMulti to bound read amplification.
|
||||
let location = self.tenant_conf.load().location;
|
||||
if !location.may_upload_layers_hint() {
|
||||
info!("skipping compaction in location state {location:?}");
|
||||
return Ok(CompactionOutcome::Skipped);
|
||||
}
|
||||
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// compactions. We don't want to block everything else while the
|
||||
// compaction runs.
|
||||
let timelines_to_compact_or_offload;
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines_to_compact_or_offload = timelines
|
||||
.iter()
|
||||
.filter_map(|(timeline_id, timeline)| {
|
||||
let (is_active, (can_offload, _)) =
|
||||
(timeline.is_active(), timeline.can_offload());
|
||||
let has_no_unoffloaded_children = {
|
||||
!timelines
|
||||
.iter()
|
||||
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
|
||||
};
|
||||
let config_allows_offload = self.conf.timeline_offloading
|
||||
|| self
|
||||
.tenant_conf
|
||||
.load()
|
||||
.tenant_conf
|
||||
.timeline_offloading
|
||||
.unwrap_or_default();
|
||||
let can_offload =
|
||||
can_offload && has_no_unoffloaded_children && config_allows_offload;
|
||||
if (is_active, can_offload) == (false, false) {
|
||||
None
|
||||
} else {
|
||||
Some((*timeline_id, timeline.clone(), (is_active, can_offload)))
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
}
|
||||
|
||||
// Before doing any I/O work, check our circuit breaker
|
||||
// Don't compact if the circuit breaker is tripped.
|
||||
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
|
||||
info!("Skipping compaction due to previous failures");
|
||||
return Ok(CompactionOutcome::Done);
|
||||
info!("skipping compaction due to previous failures");
|
||||
return Ok(CompactionOutcome::Skipped);
|
||||
}
|
||||
|
||||
let mut has_pending_task = false;
|
||||
// Collect all timelines to compact, along with offload instructions and L0 counts.
|
||||
let mut compact: Vec<Arc<Timeline>> = Vec::new();
|
||||
let mut offload: HashSet<TimelineId> = HashSet::new();
|
||||
let mut l0_counts: HashMap<TimelineId, usize> = HashMap::new();
|
||||
|
||||
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
|
||||
{
|
||||
// pending_task_left == None: cannot compact, maybe still pending tasks
|
||||
// pending_task_left == Some(Pending): compaction task left
|
||||
// pending_task_left == Some(Done): no compaction task left
|
||||
let pending_task_left = if *can_compact {
|
||||
let compaction_outcome = timeline
|
||||
.compact(cancel, EnumSet::empty(), ctx)
|
||||
.instrument(info_span!("compact_timeline", %timeline_id))
|
||||
.await
|
||||
.inspect_err(|e| match e {
|
||||
timeline::CompactionError::ShuttingDown => (),
|
||||
timeline::CompactionError::Offload(_) => {
|
||||
// Failures to offload timelines do not trip the circuit breaker, because
|
||||
// they do not do lots of writes the way compaction itself does: it is cheap
|
||||
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
|
||||
}
|
||||
timeline::CompactionError::Other(e) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
|
||||
}
|
||||
})?;
|
||||
if let CompactionOutcome::Pending = compaction_outcome {
|
||||
Some(CompactionOutcome::Pending)
|
||||
} else {
|
||||
let queue = {
|
||||
let guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
guard.get(timeline_id).cloned()
|
||||
};
|
||||
if let Some(queue) = queue {
|
||||
let outcome = queue
|
||||
.iteration(cancel, ctx, &self.gc_block, timeline)
|
||||
.await?;
|
||||
Some(outcome)
|
||||
} else {
|
||||
Some(CompactionOutcome::Done)
|
||||
}
|
||||
let offload_enabled = self.get_timeline_offloading_enabled();
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
for (&timeline_id, timeline) in timelines.iter() {
|
||||
// Skip inactive timelines.
|
||||
if !timeline.is_active() {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending);
|
||||
if pending_task_left == Some(CompactionOutcome::Done) && *can_offload {
|
||||
pausable_failpoint!("before-timeline-auto-offload");
|
||||
match offload_timeline(self, timeline)
|
||||
.instrument(info_span!("offload_timeline", %timeline_id))
|
||||
.await
|
||||
{
|
||||
Err(OffloadError::NotArchived) => {
|
||||
// Ignore this, we likely raced with unarchival
|
||||
Ok(())
|
||||
}
|
||||
other => other,
|
||||
}?;
|
||||
|
||||
// Schedule the timeline for compaction.
|
||||
compact.push(timeline.clone());
|
||||
|
||||
// Schedule the timeline for offloading if eligible.
|
||||
let can_offload = offload_enabled
|
||||
&& timeline.can_offload().0
|
||||
&& !timelines
|
||||
.iter()
|
||||
.any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
if can_offload {
|
||||
offload.insert(timeline_id);
|
||||
}
|
||||
}
|
||||
} // release timelines lock
|
||||
|
||||
for timeline in &compact {
|
||||
// Collect L0 counts. Can't await while holding lock above.
|
||||
if let Ok(lm) = timeline.layers.read().await.layer_map() {
|
||||
l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
|
||||
}
|
||||
}
|
||||
|
||||
// Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to
|
||||
// bound read amplification.
|
||||
//
|
||||
// TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC
|
||||
// compaction and offloading. We leave that as a potential problem to solve later. Consider
|
||||
// splitting L0 and image/GC compaction to separate background jobs.
|
||||
if self.get_compaction_l0_first() {
let compaction_threshold = self.get_compaction_threshold();
let compact_l0 = compact
.iter()
.map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0)))
.filter(|&(_, l0)| l0 >= compaction_threshold)
.sorted_by_key(|&(_, l0)| l0)
.rev()
.map(|(tli, _)| tli.clone())
.collect_vec();

let mut has_pending_l0 = false;
for timeline in compact_l0 {
let outcome = timeline
.compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
.instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
.await
.inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
match outcome {
CompactionOutcome::Done => {}
CompactionOutcome::Skipped => {}
CompactionOutcome::Pending => has_pending_l0 = true,
CompactionOutcome::YieldForL0 => has_pending_l0 = true,
}
}
if has_pending_l0 {
return Ok(CompactionOutcome::YieldForL0); // do another pass
}
}
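The L0-only pass above orders candidates by their L0 depth so the timelines with the worst read amplification are compacted first. A sketch of that ordering with plain (id, l0_count) pairs standing in for Timeline handles:

use itertools::Itertools;

/// Keep timelines at or above the compaction threshold, hottest (most L0
/// layers) first.
fn order_for_l0_pass(l0_counts: &[(u64, usize)], threshold: usize) -> Vec<u64> {
    l0_counts
        .iter()
        .filter(|&&(_, l0)| l0 >= threshold)
        .sorted_by_key(|&&(_, l0)| l0)
        .rev()
        .map(|&(id, _)| id)
        .collect_vec()
}
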
|
||||
|
||||
// Pass 2: image compaction and timeline offloading. If any timelines have accumulated
|
||||
// more L0 layers, they may also be compacted here.
|
||||
//
|
||||
// NB: image compaction may yield if there is pending L0 compaction.
|
||||
//
|
||||
// TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
|
||||
// different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
|
||||
// We leave this for a later PR.
|
||||
//
|
||||
// TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
|
||||
// amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
|
||||
let mut has_pending = false;
|
||||
for timeline in compact {
|
||||
if !timeline.is_active() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut outcome = timeline
|
||||
.compact(cancel, EnumSet::default(), ctx)
|
||||
.instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
|
||||
.await
|
||||
.inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
|
||||
|
||||
// If we're done compacting, check the scheduled GC compaction queue for more work.
|
||||
if outcome == CompactionOutcome::Done {
|
||||
let queue = self
|
||||
.scheduled_compaction_tasks
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
if let Some(queue) = queue {
|
||||
outcome = queue
|
||||
.iteration(cancel, ctx, &self.gc_block, &timeline)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// If we're done compacting, offload the timeline if requested.
|
||||
if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) {
|
||||
pausable_failpoint!("before-timeline-auto-offload");
|
||||
offload_timeline(self, &timeline)
|
||||
.instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id))
|
||||
.await
|
||||
.or_else(|err| match err {
|
||||
// Ignore this, we likely raced with unarchival.
|
||||
OffloadError::NotArchived => Ok(()),
|
||||
err => Err(err),
|
||||
})?;
|
||||
}
|
||||
|
||||
match outcome {
|
||||
CompactionOutcome::Done => {}
|
||||
CompactionOutcome::Skipped => {}
|
||||
CompactionOutcome::Pending => has_pending = true,
|
||||
// This mostly makes sense when the L0-only pass above is enabled, since there's
|
||||
// otherwise no guarantee that we'll start with the timeline that has high L0.
|
||||
CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
|
||||
}
|
||||
}
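The YieldForL0 outcome relies on the tenant's l0_compaction_trigger, a tokio::sync::Notify, to wake the compaction loop immediately instead of waiting out the full period. A sketch of that wake-up mechanism (the iteration body itself is elided):

use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

async fn run_loop(trigger: Arc<Notify>, period: Duration) {
    loop {
        // Wake either on the regular schedule or as soon as someone signals
        // that more L0 work is pending.
        tokio::select! {
            _ = tokio::time::sleep(period) => {}
            _ = trigger.notified() => {}
        }
        // ... run one compaction iteration here ...
    }
}

fn request_immediate_run(trigger: &Notify) {
    trigger.notify_one();
}
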
|
||||
|
||||
// Success! Untrip the breaker if necessary.
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.success(&CIRCUIT_BREAKERS_UNBROKEN);
|
||||
|
||||
Ok(if has_pending_task {
|
||||
CompactionOutcome::Pending
|
||||
} else {
|
||||
CompactionOutcome::Done
|
||||
})
|
||||
match has_pending {
|
||||
true => Ok(CompactionOutcome::Pending),
|
||||
false => Ok(CompactionOutcome::Done),
|
||||
}
|
||||
}
|
||||
|
||||
/// Trips the compaction circuit breaker if appropriate.
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
match err {
CompactionError::ShuttingDown => (),
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
// shouldn't block compaction.
CompactionError::Offload(_) => {}
CompactionError::Other(err) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
}
}
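For context, a circuit breaker in this sense trips after repeated failures and stays open for a while, so a persistently failing job stops burning IO. A minimal sketch of the idea; the pageserver's real breaker also records metrics and has its own reset policy:

use std::time::{Duration, Instant};

struct CircuitBreaker {
    consecutive_failures: u32,
    trip_threshold: u32,
    tripped_at: Option<Instant>,
    reset_after: Duration,
}

impl CircuitBreaker {
    fn fail(&mut self) {
        self.consecutive_failures += 1;
        if self.consecutive_failures >= self.trip_threshold {
            self.tripped_at = Some(Instant::now());
        }
    }

    fn success(&mut self) {
        self.consecutive_failures = 0;
        self.tripped_at = None;
    }

    fn is_broken(&self) -> bool {
        match self.tripped_at {
            Some(at) => at.elapsed() < self.reset_after,
            None => false,
        }
    }
}
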
|
||||
|
||||
/// Cancel scheduled compaction tasks
|
||||
@@ -3088,32 +3200,28 @@ impl Tenant {
|
||||
Ok(rx)
|
||||
}
|
||||
|
||||
// Call through to all timelines to freeze ephemeral layers if needed. Usually
|
||||
// this happens during ingest: this background housekeeping is for freezing layers
|
||||
// that are open but haven't been written to for some time.
|
||||
async fn ingest_housekeeping(&self) {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// compactions. We don't want to block everything else while the
|
||||
// compaction runs.
|
||||
let timelines = {
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter_map(|timeline| {
|
||||
if timeline.is_active() {
|
||||
Some(timeline.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
/// Performs periodic housekeeping, via the tenant housekeeping background task.
async fn housekeeping(&self) {
// Call through to all timelines to freeze ephemeral layers as needed. This usually happens
// during ingest, but we don't want idle timelines to hold open layers for too long.
let timelines = self
.timelines
.lock()
.unwrap()
.values()
.filter(|tli| tli.is_active())
.cloned()
.collect_vec();

for timeline in &timelines {
for timeline in timelines {
timeline.maybe_freeze_ephemeral_layer().await;
}

// Shut down walredo if idle.
const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
if let Some(ref walredo_mgr) = self.walredo_mgr {
walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
}
}
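Note the shape of the function above: the active timelines are cloned out while the std::sync::Mutex is held, and the guard is dropped before any await. A sketch of that "snapshot under the lock, then await" pattern, with Arc<String> standing in for Arc<Timeline>:

use std::sync::{Arc, Mutex};

async fn housekeep(timelines: &Mutex<Vec<Arc<String>>>) {
    // Snapshot under the lock, then drop the guard: a std::sync::MutexGuard
    // must not be held across an await point.
    let snapshot: Vec<Arc<String>> = timelines.lock().unwrap().iter().cloned().collect();
    for timeline in snapshot {
        // Per-timeline async work goes here (e.g. freezing an idle layer).
        let _ = timeline;
        tokio::task::yield_now().await;
    }
}
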
|
||||
|
||||
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
|
||||
@@ -3823,6 +3931,13 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
|
||||
}
|
||||
|
||||
pub fn get_compaction_l0_first(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_l0_first
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
|
||||
}
|
||||
|
||||
pub fn get_gc_horizon(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -3877,6 +3992,16 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
pub fn get_timeline_offloading_enabled(&self) -> bool {
if self.conf.timeline_offloading {
return true;
}
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.timeline_offloading
.unwrap_or(self.conf.default_tenant_conf.timeline_offloading)
}
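The resolution order implemented above is: pageserver-wide flag first, then the tenant-level override, then the built-in default. The same logic reduced to plain values:

fn offloading_enabled(node_flag: bool, tenant_override: Option<bool>, default: bool) -> bool {
    if node_flag {
        // The node-level flag force-enables offloading for all tenants.
        return true;
    }
    // Otherwise the tenant override wins, falling back to the default.
    tenant_override.unwrap_or(default)
}
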
|
||||
|
||||
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
|
||||
fn build_tenant_manifest(&self) -> TenantManifest {
|
||||
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
|
||||
@@ -3971,6 +4096,7 @@ impl Tenant {
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
previous_heatmap: Option<PreviousHeatmap>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
resources: TimelineResources,
|
||||
cause: CreateTimelineCause,
|
||||
@@ -3994,6 +4120,7 @@ impl Tenant {
|
||||
self.conf,
|
||||
Arc::clone(&self.tenant_conf),
|
||||
new_metadata,
|
||||
previous_heatmap,
|
||||
ancestor,
|
||||
new_timeline_id,
|
||||
self.tenant_shard_id,
|
||||
@@ -4115,6 +4242,7 @@ impl Tenant {
|
||||
// use an extremely long backoff.
|
||||
Some(Duration::from_secs(3600 * 24)),
|
||||
)),
|
||||
l0_compaction_trigger: Arc::new(Notify::new()),
|
||||
scheduled_compaction_tasks: Mutex::new(Default::default()),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
|
||||
@@ -4635,24 +4763,24 @@ impl Tenant {
|
||||
// We check it against both the planned GC cutoff stored in 'gc_info',
|
||||
// and the 'latest_gc_cutoff' of the last GC that was performed. The
|
||||
// planned GC cutoff in 'gc_info' is normally larger than
|
||||
// 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
|
||||
// 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
|
||||
// changed the GC settings for the tenant to make the PITR window
|
||||
// larger, but some of the data was already removed by an earlier GC
|
||||
// iteration.
|
||||
|
||||
// check against last actual 'latest_gc_cutoff' first
|
||||
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
|
||||
let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let planned_cutoff = gc_info.min_cutoff();
|
||||
if gc_info.lsn_covered_by_lease(start_lsn) {
|
||||
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
|
||||
tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn);
|
||||
} else {
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
*applied_gc_cutoff_lsn,
|
||||
))
|
||||
.map_err(CreateTimelineError::AncestorLsn)?;
|
||||
|
||||
@@ -4691,7 +4819,7 @@ impl Tenant {
|
||||
dst_prev,
|
||||
Some(src_id),
|
||||
start_lsn,
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
|
||||
*src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
|
||||
src_timeline.initdb_lsn,
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
@@ -5023,12 +5151,19 @@ impl Tenant {
|
||||
)
|
||||
}
|
||||
|
||||
/// Call this before constructing a timeline, to build its required structures
|
||||
/// Builds required resources for a new timeline.
|
||||
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
|
||||
let remote_client = self.build_timeline_remote_client(timeline_id);
|
||||
self.get_timeline_resources_for(remote_client)
|
||||
}
|
||||
|
||||
/// Builds timeline resources for the given remote client.
|
||||
fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources {
|
||||
TimelineResources {
|
||||
remote_client: self.build_timeline_remote_client(timeline_id),
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
}
|
||||
}
|
||||
@@ -5057,6 +5192,7 @@ impl Tenant {
|
||||
.create_timeline_struct(
|
||||
new_timeline_id,
|
||||
new_metadata,
|
||||
None,
|
||||
ancestor,
|
||||
resources,
|
||||
CreateTimelineCause::Load,
|
||||
@@ -5474,6 +5610,8 @@ pub(crate) mod harness {
|
||||
compaction_threshold: Some(tenant_conf.compaction_threshold),
|
||||
compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
|
||||
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
|
||||
compaction_l0_first: Some(tenant_conf.compaction_l0_first),
|
||||
compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore),
|
||||
l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload),
|
||||
@@ -6061,8 +6199,8 @@ mod tests {
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
|
||||
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
|
||||
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
|
||||
let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
|
||||
assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
|
||||
match tline.get(*TEST_KEY, Lsn(0x25)) {
|
||||
Ok(_) => panic!("request for page should have failed"),
|
||||
Err(err) => assert!(err.to_string().contains("not found at")),
|
||||
@@ -7701,6 +7839,18 @@ mod tests {
|
||||
}
|
||||
|
||||
tline.freeze_and_flush().await?;
|
||||
// Force layers to L1
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
{
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceL0Compaction);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if iter % 5 == 0 {
|
||||
let (_, before_delta_file_accessed) =
|
||||
@@ -7713,6 +7863,7 @@ mod tests {
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceImageLayerCreation);
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
flags.insert(CompactFlags::ForceL0Compaction);
|
||||
flags
|
||||
},
|
||||
&ctx,
|
||||
@@ -8159,6 +8310,8 @@ mod tests {
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
|
||||
tline.force_set_disk_consistent_lsn(Lsn(0x40));
|
||||
tline
|
||||
.compact(
|
||||
&cancel,
|
||||
@@ -8172,8 +8325,7 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Image layers are created at last_record_lsn
|
||||
// Image layers are created at repartition LSN
|
||||
let images = tline
|
||||
.inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
|
||||
.await
|
||||
@@ -8344,7 +8496,7 @@ mod tests {
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
@@ -8452,7 +8604,7 @@ mod tests {
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x40))
|
||||
.wait()
|
||||
@@ -8620,8 +8772,8 @@ mod tests {
|
||||
|
||||
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
|
||||
info!(
|
||||
"latest_gc_cutoff_lsn: {}",
|
||||
*timeline.get_latest_gc_cutoff_lsn()
|
||||
"applied_gc_cutoff_lsn: {}",
|
||||
*timeline.get_applied_gc_cutoff_lsn()
|
||||
);
|
||||
timeline.force_set_disk_consistent_lsn(end_lsn);
|
||||
|
||||
@@ -8647,7 +8799,7 @@ mod tests {
|
||||
|
||||
// Make lease on a already GC-ed LSN.
|
||||
// 0/80 does not have a valid lease + is below latest_gc_cutoff
|
||||
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
|
||||
assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn());
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect_err("lease request on GC-ed LSN should fail");
|
||||
@@ -8838,7 +8990,7 @@ mod tests {
|
||||
};
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
@@ -8925,7 +9077,7 @@ mod tests {
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x40))
|
||||
.wait()
|
||||
@@ -9378,7 +9530,7 @@ mod tests {
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
@@ -9525,7 +9677,7 @@ mod tests {
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x38))
|
||||
.wait()
|
||||
@@ -9626,7 +9778,7 @@ mod tests {
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
@@ -9877,7 +10029,7 @@ mod tests {
|
||||
|
||||
{
|
||||
parent_tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x10))
|
||||
.wait()
|
||||
@@ -9897,7 +10049,7 @@ mod tests {
|
||||
|
||||
{
|
||||
branch_tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x50))
|
||||
.wait()
|
||||
@@ -10253,7 +10405,7 @@ mod tests {
|
||||
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
@@ -10638,7 +10790,7 @@ mod tests {
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
@@ -10889,7 +11041,7 @@ mod tests {
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
|
||||
@@ -285,6 +285,14 @@ pub struct TenantConfOpt {
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_l0_first: Option<bool>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_l0_semaphore: Option<bool>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub l0_flush_delay_threshold: Option<usize>,

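Each of the new fields follows the same serde pattern: an Option<T> that is skipped when unset and defaults to None on deserialization, so a missing key means "inherit the global value". A self-contained sketch of one such field:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Default)]
struct ExampleConf {
    // Omitted from serialized output when None, and left as None when the
    // key is missing from the input.
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    compaction_l0_first: Option<bool>,
}

With these attributes, deserializing an empty object leaves the field as None, and serializing a default ExampleConf emits no key for it.
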
@@ -416,6 +424,12 @@ impl TenantConfOpt {
|
||||
.as_ref()
|
||||
.unwrap_or(&global_conf.compaction_algorithm)
|
||||
.clone(),
|
||||
compaction_l0_first: self
|
||||
.compaction_l0_first
|
||||
.unwrap_or(global_conf.compaction_l0_first),
|
||||
compaction_l0_semaphore: self
|
||||
.compaction_l0_semaphore
|
||||
.unwrap_or(global_conf.compaction_l0_semaphore),
|
||||
l0_flush_delay_threshold: self
|
||||
.l0_flush_delay_threshold
|
||||
.or(global_conf.l0_flush_delay_threshold),
|
||||
@@ -466,7 +480,7 @@ impl TenantConfOpt {
|
||||
.lsn_lease_length_for_ts
|
||||
.unwrap_or(global_conf.lsn_lease_length_for_ts),
|
||||
timeline_offloading: self
|
||||
.lazy_slru_download
|
||||
.timeline_offloading
|
||||
.unwrap_or(global_conf.timeline_offloading),
|
||||
wal_receiver_protocol_override: self
|
||||
.wal_receiver_protocol_override
|
||||
@@ -493,6 +507,8 @@ impl TenantConfOpt {
|
||||
mut compaction_threshold,
|
||||
mut compaction_upper_limit,
|
||||
mut compaction_algorithm,
|
||||
mut compaction_l0_first,
|
||||
mut compaction_l0_semaphore,
|
||||
mut l0_flush_delay_threshold,
|
||||
mut l0_flush_stall_threshold,
|
||||
mut l0_flush_wait_upload,
|
||||
@@ -538,6 +554,10 @@ impl TenantConfOpt {
|
||||
.compaction_upper_limit
|
||||
.apply(&mut compaction_upper_limit);
|
||||
patch.compaction_algorithm.apply(&mut compaction_algorithm);
|
||||
patch.compaction_l0_first.apply(&mut compaction_l0_first);
|
||||
patch
|
||||
.compaction_l0_semaphore
|
||||
.apply(&mut compaction_l0_semaphore);
|
||||
patch
|
||||
.l0_flush_delay_threshold
|
||||
.apply(&mut l0_flush_delay_threshold);
|
||||
@@ -619,6 +639,8 @@ impl TenantConfOpt {
|
||||
compaction_threshold,
|
||||
compaction_upper_limit,
|
||||
compaction_algorithm,
|
||||
compaction_l0_first,
|
||||
compaction_l0_semaphore,
|
||||
l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload,
|
||||
@@ -681,6 +703,8 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
compaction_period: value.compaction_period.map(humantime),
|
||||
compaction_threshold: value.compaction_threshold,
|
||||
compaction_upper_limit: value.compaction_upper_limit,
|
||||
compaction_l0_first: value.compaction_l0_first,
|
||||
compaction_l0_semaphore: value.compaction_l0_semaphore,
|
||||
l0_flush_delay_threshold: value.l0_flush_delay_threshold,
|
||||
l0_flush_stall_threshold: value.l0_flush_stall_threshold,
|
||||
l0_flush_wait_upload: value.l0_flush_wait_upload,
|
||||
|
||||
@@ -130,7 +130,10 @@ struct TimelineMetadataBodyV2 {
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<TimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
|
||||
// The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`].
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
@@ -2816,8 +2816,8 @@ where
|
||||
}
|
||||
|
||||
use {
|
||||
crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
crate::tenant::gc_result::GcResult, http_utils::error::ApiError,
|
||||
pageserver_api::models::TimelineGcRequest,
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -517,7 +517,7 @@ impl RemoteTimelineClient {
|
||||
if let Ok(queue) = queue_locked.initialized_mut() {
|
||||
let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
|
||||
for d in blocked_deletions {
|
||||
if let Err(e) = self.deletion_queue_client.push_layers_sync(
|
||||
if let Err(e) = self.deletion_queue_client.push_layers(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
self.generation,
|
||||
@@ -2151,7 +2151,6 @@ impl RemoteTimelineClient {
|
||||
self.generation,
|
||||
delete.layers.clone(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::time::SystemTime;
|
||||
use std::{collections::HashMap, time::SystemTime};
|
||||
|
||||
use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
|
||||
|
||||
@@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
|
||||
use utils::{generation::Generation, id::TimelineId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(super) struct HeatMapTenant {
|
||||
pub(crate) struct HeatMapTenant {
|
||||
/// Generation of the attached location that uploaded the heatmap: this is not required
|
||||
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
|
||||
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
|
||||
@@ -25,8 +25,17 @@ pub(super) struct HeatMapTenant {
|
||||
pub(super) upload_period_ms: Option<u128>,
|
||||
}
|
||||
|
||||
impl HeatMapTenant {
pub(crate) fn into_timelines_index(self) -> HashMap<TimelineId, HeatMapTimeline> {
self.timelines
.into_iter()
.map(|htl| (htl.timeline_id, htl))
.collect()
}
}
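A sketch of how such an index is typically consumed during preload: build a HashMap keyed by timeline id, then remove() entries so each timeline takes ownership of its slice of the previous heatmap. The types here are hypothetical stand-ins for HeatMapTimeline:

use std::collections::HashMap;

// Stand-in for HeatMapTimeline: (timeline_id, layer names).
type TimelineHeat = (u64, Vec<String>);

fn into_index(timelines: Vec<TimelineHeat>) -> HashMap<u64, TimelineHeat> {
    timelines.into_iter().map(|t| (t.0, t)).collect()
}

fn example() {
    let mut index = into_index(vec![(1, vec!["layer-a".into()]), (2, vec![])]);
    // During preload, each timeline removes (and thus owns) its own entry.
    assert!(index.remove(&1).is_some());
    assert!(index.remove(&1).is_none());
}
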
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
@@ -35,13 +44,13 @@ pub(crate) struct HeatMapTimeline {
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(crate) name: LayerName,
|
||||
pub(crate) metadata: LayerFileMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
pub(crate) access_time: SystemTime,
|
||||
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
}
|
||||
|
||||
@@ -394,7 +394,7 @@ pub(super) async fn gather_inputs(
|
||||
ancestor_lsn,
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(),
|
||||
next_pitr_cutoff,
|
||||
retention_param_cutoff,
|
||||
lease_points,
|
||||
|
||||
@@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use super::PageReconstructError;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
@@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState {
|
||||
|
||||
pub(crate) io_concurrency: IoConcurrency,
|
||||
num_active_ios: Arc<AtomicUsize>,
|
||||
|
||||
pub(crate) read_path: Option<ReadPath>,
|
||||
}
|
||||
|
||||
/// The level of IO concurrency to be used on the read path
|
||||
@@ -609,6 +611,7 @@ impl ValuesReconstructState {
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency,
|
||||
num_active_ios: Arc::new(AtomicUsize::new(0)),
|
||||
read_path: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -136,6 +136,22 @@ pub(crate) fn local_layer_path(
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum LastEviction {
Never,
At(std::time::Instant),
Evicting,
}

impl LastEviction {
pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
match self {
LastEviction::Never => false,
LastEviction::At(evicted_at) => evicted_at > &timepoint,
LastEviction::Evicting => true,
}
}
}
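A usage sketch for happened_after: a heatmap entry read at some earlier instant is considered stale if the layer has been evicted since (or is being evicted right now). The read_at value below is hypothetical:

use std::time::{Duration, Instant};

fn is_stale(last_eviction: &LastEviction, read_at: Instant) -> bool {
    // Stale if the layer was evicted (or is being evicted) after `read_at`.
    last_eviction.happened_after(read_at)
}

fn example() {
    let read_at = Instant::now() - Duration::from_secs(60);
    assert!(is_stale(&LastEviction::Evicting, read_at));
    assert!(!is_stale(&LastEviction::Never, read_at));
}
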
|
||||
|
||||
impl Layer {
|
||||
/// Creates a layer value for a file we know to not be resident.
|
||||
pub(crate) fn for_evicted(
|
||||
@@ -353,7 +369,6 @@ impl Layer {
|
||||
/// while the guard exists.
|
||||
///
|
||||
/// Returns None if the layer is currently evicted or becoming evicted.
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
|
||||
let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
|
||||
|
||||
@@ -406,6 +421,17 @@ impl Layer {
|
||||
self.0.metadata()
|
||||
}
|
||||
|
||||
pub(crate) fn last_evicted_at(&self) -> LastEviction {
|
||||
match self.0.last_evicted_at.try_lock() {
|
||||
Ok(lock) => match *lock {
|
||||
None => LastEviction::Never,
|
||||
Some(at) => LastEviction::At(at),
|
||||
},
|
||||
Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
|
||||
Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
|
||||
self.0
|
||||
.timeline
|
||||
@@ -530,7 +556,6 @@ impl ResidentOrWantedEvicted {
|
||||
/// This is not used on the read path (anything that calls
|
||||
/// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
|
||||
/// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
|
||||
#[cfg(test)]
|
||||
fn get(&self) -> Option<Arc<DownloadedLayer>> {
|
||||
match self {
|
||||
ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
|
||||
@@ -658,7 +683,9 @@ struct LayerInner {
|
||||
|
||||
/// When the Layer was last evicted but has not been downloaded since.
|
||||
///
|
||||
/// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
|
||||
/// This is used for skipping evicted layers from the previous heatmap (see
|
||||
/// `[Timeline::generate_heatmap]`) and for updating metrics
|
||||
/// (see [`LayerImplMetrics::redownload_after`]).
|
||||
last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,48 +1,64 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as compaction and GC
|
||||
//! This module contains per-tenant background processes, e.g. compaction and GC.
|
||||
|
||||
use std::ops::ControlFlow;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::cmp::max;
|
||||
use std::future::Future;
|
||||
use std::ops::{ControlFlow, RangeInclusive};
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::Rng;
|
||||
use scopeguard::defer;
|
||||
use tokio::sync::{Semaphore, SemaphorePermit};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::compaction::CompactionOutcome;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::Rng;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{backoff, completion, pausable_failpoint};
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
|
||||
use utils::backoff::exponential_backoff_duration;
|
||||
use utils::completion::Barrier;
|
||||
use utils::pausable_failpoint;
|
||||
|
||||
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
|
||||
let permits = usize::max(
|
||||
1,
|
||||
// while a lot of the work is done on spawn_blocking, we still do
|
||||
// repartitioning in the async context. this should give leave us some workers
|
||||
// unblocked to be blocked on other work, hopefully easing any outside visible
|
||||
// effects of restarts.
|
||||
//
|
||||
// 6/8 is a guess; previously we ran with unlimited 8 and more from
|
||||
// spawn_blocking.
|
||||
(total_threads * 3).checked_div(4).unwrap_or(0),
|
||||
);
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(
|
||||
permits < total_threads,
|
||||
"need threads avail for shorter work"
|
||||
);
|
||||
tokio::sync::Semaphore::new(permits)
|
||||
});
|
||||
/// Semaphore limiting concurrent background tasks (across all tenants).
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(permits < total_threads, "need threads for other work");
Semaphore::new(permits)
});

/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if
/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled.
///
/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less
/// important (e.g. due to page images in delta layers) and can wait for other background tasks.
///
/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note
/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same
/// thread pool.
static CONCURRENT_L0_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
let total_threads = TOKIO_WORKER_THREADS.get();
let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(permits < total_threads, "need threads for other work");
Semaphore::new(permits)
});
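A sketch of the sizing and usage rules encoded above: permits are 3/4 of the Tokio worker threads, at least one, and strictly fewer than the total so short tasks keep running; a job acquires a permit before doing heavy work and releases it on drop. The real code additionally picks the L0 semaphore or the general one based on the loop kind and records wait-time metrics:

use tokio::sync::Semaphore;

// Mirrors the sizing asserts above; panics if there are too few threads to
// leave headroom, just like the statics do.
fn background_permits(total_threads: usize) -> usize {
    let permits = std::cmp::max(1, (total_threads * 3) / 4);
    assert!(permits < total_threads, "need threads for other work");
    permits
}

async fn with_permit(semaphore: &Semaphore) {
    // Acquire before doing heavy IO/CPU work; the permit is released on drop.
    let _permit = semaphore.acquire().await.expect("semaphore never closed");
    // ... run the background job here ...
}
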
|
||||
|
||||
/// Background jobs.
|
||||
///
|
||||
/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
|
||||
/// do any significant IO or CPU work.
|
||||
#[derive(
|
||||
Debug,
|
||||
PartialEq,
|
||||
@@ -55,10 +71,13 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub(crate) enum BackgroundLoopKind {
|
||||
/// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop. It is
|
||||
/// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics.
|
||||
L0Compaction,
|
||||
Compaction,
|
||||
Gc,
|
||||
Eviction,
|
||||
IngestHouseKeeping,
|
||||
TenantHouseKeeping,
|
||||
ConsumptionMetricsCollectMetrics,
|
||||
ConsumptionMetricsSyntheticSizeWorker,
|
||||
InitialLogicalSizeCalculation,
|
||||
@@ -67,40 +86,29 @@ pub(crate) enum BackgroundLoopKind {
|
||||
}
|
||||
|
||||
pub struct BackgroundLoopSemaphorePermit<'a> {
|
||||
_permit: tokio::sync::SemaphorePermit<'static>,
|
||||
_permit: SemaphorePermit<'static>,
|
||||
_recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
|
||||
}
|
||||
|
||||
/// Cancellation safe.
|
||||
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
/// Acquires a semaphore permit, to limit concurrent background jobs.
|
||||
pub(crate) async fn acquire_concurrency_permit(
|
||||
loop_kind: BackgroundLoopKind,
|
||||
_ctx: &RequestContext,
|
||||
) -> BackgroundLoopSemaphorePermit<'static> {
|
||||
// TODO: use a lower threshold and remove the pacer once we resolve some blockage.
|
||||
const WARN_THRESHOLD: Duration = Duration::from_secs(600);
|
||||
static WARN_PACER: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
|
||||
let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
|
||||
let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
|
||||
|
||||
if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
|
||||
pausable_failpoint!("initial-size-calculation-permit-pause");
|
||||
}
|
||||
|
||||
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
|
||||
let permit = CONCURRENT_BACKGROUND_TASKS
|
||||
.acquire()
|
||||
.await
|
||||
.expect("should never close");
|
||||
let semaphore = match loop_kind {
|
||||
BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS,
|
||||
_ => &CONCURRENT_BACKGROUND_TASKS,
|
||||
};
|
||||
let permit = semaphore.acquire().await.expect("should never close");
|
||||
|
||||
let waited = recorder.acquired();
|
||||
if waited >= WARN_THRESHOLD {
|
||||
let waited = waited.as_secs_f64();
|
||||
WARN_PACER
|
||||
.lock()
|
||||
.unwrap()
|
||||
.call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
|
||||
}
|
||||
recorder.acquired();
|
||||
|
||||
BackgroundLoopSemaphorePermit {
|
||||
_permit: permit,
|
||||
@@ -108,12 +116,10 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
}
|
||||
}
|
||||
|
||||
/// Start per tenant background loops: compaction and gc.
|
||||
pub fn start_background_loops(
|
||||
tenant: &Arc<Tenant>,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
) {
|
||||
/// Start per tenant background loops: compaction, GC, and ingest housekeeping.
|
||||
pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
|
||||
let tenant_shard_id = tenant.tenant_shard_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
@@ -122,13 +128,15 @@ pub fn start_background_loops(
|
||||
&format!("compactor for tenant {tenant_shard_id}"),
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
let can_start = can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = Barrier::maybe_wait(can_start) => {}
|
||||
};
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
|
||||
compaction_loop(tenant, cancel)
|
||||
// If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
|
||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
@@ -137,6 +145,7 @@ pub fn start_background_loops(
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::GarbageCollector,
|
||||
@@ -145,13 +154,15 @@ pub fn start_background_loops(
|
||||
&format!("garbage collector for tenant {tenant_shard_id}"),
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
let can_start = can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = Barrier::maybe_wait(can_start) => {}
|
||||
};
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
|
||||
gc_loop(tenant, cancel)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
.await;
|
||||
@@ -162,21 +173,23 @@ pub fn start_background_loops(
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::IngestHousekeeping,
|
||||
TaskKind::TenantHousekeeping,
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
|
||||
&format!("housekeeping for tenant {tenant_shard_id}"),
|
||||
{
|
||||
let tenant = Arc::clone(tenant);
|
||||
let background_jobs_can_start = background_jobs_can_start.cloned();
|
||||
let can_start = can_start.cloned();
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let cancel = task_mgr::shutdown_token(); // NB: must be in async context
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()) },
|
||||
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
_ = Barrier::maybe_wait(can_start) => {}
|
||||
};
|
||||
ingest_housekeeping_loop(tenant, cancel)
|
||||
.instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
|
||||
tenant_housekeeping_loop(tenant, cancel)
|
||||
.instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
.await;
|
||||
Ok(())
|
||||
}
|
||||
@@ -184,372 +197,292 @@ pub fn start_background_loops(
|
||||
);
|
||||
}
|
||||
|
||||
///
|
||||
/// Compaction task's main loop
|
||||
///
|
||||
/// Compaction task's main loop.
|
||||
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
const BASE_BACKOFF_SECS: f64 = 1.0;
|
||||
const MAX_BACKOFF_SECS: f64 = 300.0;
|
||||
// How many errors we have seen consequtively
|
||||
let mut error_run_count = 0;
|
||||
const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut first = true;
|
||||
loop {
|
||||
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let mut period = tenant.get_compaction_period();
|
||||
let mut error_run = 0; // consecutive errors
|
||||
|
||||
// Stagger the compaction loop across tenants.
|
||||
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
|
||||
return;
|
||||
}
|
||||
if sleep_random(period, &cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
loop {
|
||||
// Recheck that we're still active.
|
||||
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Refresh the period. If compaction is disabled, check again in a bit.
|
||||
period = tenant.get_compaction_period();
|
||||
if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic compaction is disabled");
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
|
||||
ControlFlow::Break(()) => return,
|
||||
ControlFlow::Continue(()) => (),
|
||||
},
|
||||
_ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {},
|
||||
_ = cancel.cancelled() => return,
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let period = tenant.get_compaction_period();
|
||||
// Wait for the next compaction run.
|
||||
let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff), if error_run > 0 => {},
|
||||
_ = tokio::time::sleep(period), if error_run == 0 => {},
|
||||
_ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {},
|
||||
_ = cancel.cancelled() => return,
|
||||
}
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
// Run compaction.
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::Compaction,
|
||||
};
|
||||
let IterationResult { output, elapsed } = iteration
|
||||
.run(tenant.compaction_iteration(&cancel, &ctx))
|
||||
.await;
|
||||
|
||||
match output {
|
||||
Ok(outcome) => {
|
||||
error_run = 0;
|
||||
// If there's more compaction work, L0 or not, schedule an immediate run.
|
||||
match outcome {
|
||||
CompactionOutcome::Done => {}
|
||||
CompactionOutcome::Skipped => {}
|
||||
CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(),
|
||||
CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(),
|
||||
}
|
||||
}
|
||||
|
||||
let sleep_duration;
|
||||
if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10)
|
||||
} else {
|
||||
let iteration = Iteration {
|
||||
started_at: Instant::now(),
|
||||
period,
|
||||
kind: BackgroundLoopKind::Compaction,
|
||||
};
|
||||
|
||||
// Run compaction
|
||||
let IterationResult { output, elapsed } = iteration
|
||||
.run(tenant.compaction_iteration(&cancel, &ctx))
|
||||
.await;
|
||||
match output {
|
||||
Ok(outcome) => {
|
||||
error_run_count = 0;
|
||||
// schedule the next compaction immediately in case there is a pending compaction task
|
||||
sleep_duration = if let CompactionOutcome::Pending = outcome {
|
||||
Duration::from_secs(1)
|
||||
} else {
|
||||
period
|
||||
};
|
||||
}
|
||||
Err(e) => {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
sleep_duration = wait_duration;
|
||||
}
|
||||
}
|
||||
|
||||
// the duration is recorded by performance tests by enabling debug in this function
|
||||
tracing::debug!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"compaction iteration complete"
|
||||
);
|
||||
};
|
||||
|
||||
// Perhaps we did no work and the walredo process has been idle for some time:
|
||||
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
|
||||
// TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
|
||||
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
|
||||
if let Some(walredo_mgr) = &tenant.walredo_mgr {
|
||||
walredo_mgr.maybe_quiesce(period * 10);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
Err(err) => {
|
||||
error_run += 1;
|
||||
let backoff =
|
||||
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
|
||||
log_compaction_error(&err, error_run, backoff, cancel.is_cancelled());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// NB: this log entry is recorded by performance tests.
|
||||
debug!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"compaction iteration complete"
|
||||
);
|
||||
}
|
||||
.await;
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
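Two timing ingredients of the loop above, sketched as plain functions: a random startup stagger so tenants don't all compact at the same instant after a restart, and an exponential backoff after consecutive errors. The exact formula in utils::backoff::exponential_backoff_duration may differ; this only shows the shape:

use std::time::Duration;
use rand::Rng;

// Initial stagger: sleep a random fraction of the configured period.
fn startup_stagger(period: Duration) -> Duration {
    period.mul_f64(rand::thread_rng().gen::<f64>())
}

// Backoff after `error_run` consecutive failures, capped at `max_secs`.
fn backoff(error_run: u32, base_secs: f64, max_secs: f64) -> Duration {
    let secs = (base_secs * 2f64.powi(error_run as i32)).min(max_secs);
    Duration::from_secs_f64(secs)
}
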
|
||||
|
||||
fn log_compaction_error(
|
||||
e: &CompactionError,
|
||||
error_run_count: u32,
|
||||
sleep_duration: &std::time::Duration,
|
||||
err: &CompactionError,
|
||||
error_count: u32,
|
||||
sleep_duration: Duration,
|
||||
task_cancelled: bool,
|
||||
) {
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use CompactionError::*;
|
||||
|
||||
enum LooksLike {
|
||||
Info,
|
||||
Error,
|
||||
}
|
||||
let level = match err {
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
|
||||
let decision = match e {
|
||||
ShuttingDown => None,
|
||||
Offload(_) => Some(LooksLike::Error),
|
||||
_ if task_cancelled => Some(LooksLike::Info),
|
||||
Other(e) => {
|
||||
let root_cause = e.root_cause();
|
||||
|
||||
let is_stopping = {
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
upload_queue || timeline
|
||||
};
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
let is_stopping = upload_queue || timeline;
|
||||
|
||||
if is_stopping {
|
||||
Some(LooksLike::Info)
|
||||
Level::INFO
|
||||
} else {
|
||||
Some(LooksLike::Error)
|
||||
Level::ERROR
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match decision {
|
||||
Some(LooksLike::Info) => info!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
|
||||
),
|
||||
Some(LooksLike::Error) => error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
|
||||
),
|
||||
None => {}
|
||||
match level {
|
||||
Level::ERROR => {
|
||||
error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
|
||||
}
|
||||
Level::INFO => {
|
||||
info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
|
||||
}
|
||||
level => unimplemented!("unexpected level {level:?}"),
|
||||
}
|
||||
}
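The classification above walks an anyhow error chain to its root cause and downcasts it to known "we are stopping anyway" types, which only warrant INFO-level logging. A sketch of that pattern, with std::io::Error standing in for NotInitialized / PageReconstructError:

use std::io;
use anyhow::anyhow;

fn is_benign(err: &anyhow::Error) -> bool {
    // Walk the chain to the root cause, then check whether it is a known
    // shutdown-style error rather than a genuine failure.
    err.root_cause()
        .downcast_ref::<io::Error>()
        .is_some_and(|e| e.kind() == io::ErrorKind::Interrupted)
}

fn example() {
    let err = anyhow!(io::Error::from(io::ErrorKind::Interrupted)).context("compaction failed");
    assert!(is_benign(&err));
}
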

/// GC task's main loop.
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
    let mut error_run = 0; // consecutive errors

    // GC might require downloading, to find the cutoff LSN that corresponds to the
    // cutoff specified as time.
    let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);

    let mut first = true;
    loop {
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
            return;
        }

        let period = tenant.get_gc_period();

        if first {
            first = false;
            if sleep_random(period, &cancel).await.is_err() {
                break;
            }
        }

        let gc_horizon = tenant.get_gc_horizon();
        let sleep_duration;
        if period == Duration::ZERO || gc_horizon == 0 {
            #[cfg(not(feature = "testing"))]
            info!("automatic GC is disabled");
            // check again in 10 seconds, in case it's been enabled again.
            sleep_duration = Duration::from_secs(10);
        } else {
            let iteration = Iteration {
                started_at: Instant::now(),
                period,
                kind: BackgroundLoopKind::Gc,
            };

            // Run gc
            let IterationResult { output, elapsed: _ } = iteration
                .run(tenant.gc_iteration(
                    None,
                    gc_horizon,
                    tenant.get_pitr_interval(),
                    &cancel,
                    &ctx,
                ))
                .await;
            match output {
                Ok(_) => {
                    error_run = 0;
                    sleep_duration = period;
                }
                Err(crate::tenant::GcError::TenantCancelled) => {
                    return;
                }
                Err(e) => {
                    error_run += 1;
                    let wait_duration =
                        exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS);

                    if matches!(e, crate::tenant::GcError::TimelineCancelled) {
                        // Timeline was cancelled during gc. We might either be in an event
                        // that affects the entire tenant (tenant deletion, pageserver shutdown),
                        // or in one that affects the timeline only (timeline deletion).
                        // Therefore, don't exit the loop.
                        info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
                    } else {
                        error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
                    }

                    sleep_duration = wait_duration;
                }
            }
        };

        if tokio::time::timeout(sleep_duration, cancel.cancelled())
            .await
            .is_ok()
        {
            break;
        }
    }
}

/// Tenant housekeeping's main loop.
async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let mut last_throttle_flag_reset_at = Instant::now();
    loop {
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
            return;
        }

        // Use the same period as compaction; it's not worth a separate setting. But if it's set to
        // zero (to disable compaction), then use a reasonable default. Jitter it by 5%.
        let period = match tenant.get_compaction_period() {
            Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(),
            period => period,
        };

        let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else {
            break;
        };

        // Do tenant housekeeping.
        let iteration = Iteration {
            started_at: Instant::now(),
            period,
            kind: BackgroundLoopKind::TenantHouseKeeping,
        };
        iteration.run(tenant.housekeeping()).await;

        // Log any getpage throttling.
        info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
            let now = Instant::now();
            let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
            let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
            if count_throttled == 0 {
                return;
            }
            let allowed_rps = tenant.pagestream_throttle.steady_rps();
            let delta = now - prev;
            info!(
                n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
                count_accounted = count_accounted_finish, // don't break existing log scraping
                count_throttled,
                sum_throttled_usecs,
                count_accounted_start, // log after pre-existing fields to not break existing log scraping
                allowed_rps=%format_args!("{allowed_rps:.0}"),
                "shard was throttled in the last n_seconds"
            );
        });
    }
}

/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
async fn wait_for_active_tenant(
    tenant: &Arc<Tenant>,
    cancel: &CancellationToken,
) -> ControlFlow<()> {
    if tenant.current_state() == TenantState::Active {
        return ControlFlow::Continue(());
    }

    let mut update_rx = tenant.subscribe_for_state_updates();
    loop {
        tokio::select! {
            _ = cancel.cancelled() => return ControlFlow::Break(()),
            result = update_rx.changed() => if result.is_err() {
                return ControlFlow::Break(());
            }
        }

        match &*update_rx.borrow() {
            TenantState::Active => {
                debug!("Tenant state changed to active, continuing the task loop");
                return ControlFlow::Continue(());
            }
            state => debug!("Not running the task loop, tenant is not active: {state:?}"),
        }
    }
}
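For reference, a self-contained sketch of the consecutive-error backoff the GC loop above relies on. The helper body below is an assumption for illustration only (the real exponential_backoff_duration comes from the pageserver's backoff utilities and may clamp differently); only the call shape matches the loop.

use std::time::Duration;

// Illustrative stand-in: the retry delay grows exponentially with the number
// of consecutive errors and is capped at `max_secs`.
fn exponential_backoff_duration(error_run: u32, base_secs: f64, max_secs: f64) -> Duration {
    let exponent = error_run.saturating_sub(1).min(62) as i32;
    let secs = (base_secs * 2f64.powi(exponent)).min(max_secs);
    Duration::from_secs_f64(secs)
}

fn main() {
    const MAX_BACKOFF_SECS: f64 = 300.0;
    for error_run in [1, 2, 5, 10] {
        let wait = exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS);
        println!("{error_run} consecutive errors -> retry in {wait:?}");
    }
}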
@@ -558,26 +491,41 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
#[error("cancelled")]
pub(crate) struct Cancelled;

/// Sleeps for a random interval up to the given max value.
///
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
/// different periods for more stable load.
pub(crate) async fn sleep_random(
    max: Duration,
    cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
    sleep_random_range(Duration::ZERO..=max, cancel).await
}

/// Sleeps for a random interval in the given range. Returns the duration.
pub(crate) async fn sleep_random_range(
    interval: RangeInclusive<Duration>,
    cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
    let delay = rand::thread_rng().gen_range(interval);
    if delay == Duration::ZERO {
        return Ok(delay);
    }
    tokio::select! {
        _ = cancel.cancelled() => Err(Cancelled),
        _ = tokio::time::sleep(delay) => Ok(delay),
    }
}

/// Sleeps for an interval with a random jitter.
pub(crate) async fn sleep_jitter(
    duration: Duration,
    jitter: Duration,
    cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
    let from = duration.saturating_sub(jitter);
    let to = duration.saturating_add(jitter);
    sleep_random_range(from..=to, cancel).await
}

struct Iteration {
@@ -593,42 +541,25 @@ struct IterationResult<O> {

impl Iteration {
    #[instrument(skip_all)]
    pub(crate) async fn run<F: Future<Output = O>, O>(self, fut: F) -> IterationResult<O> {
        let mut fut = pin!(fut);

        // Wrap `fut` into a future that logs a message every `period` so that we get a
        // very obvious breadcrumb in the logs _while_ a slow iteration is happening.
        let output = loop {
            match tokio::time::timeout(self.period, &mut fut).await {
                Ok(r) => break r,
                Err(_) => info!("still running"),
            }
        };

        let elapsed = self.started_at.elapsed();
        warn_when_period_overrun(elapsed, self.period, self.kind);

        IterationResult { output, elapsed }
    }
}
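The Iteration::run rewrite above keeps the liveness-logging idea: poll the wrapped future with a timeout and emit a breadcrumb each time the period elapses, so slow iterations stay visible while they run. A minimal standalone sketch of that pattern (assuming tokio with the time and macros features; eprintln! stands in for the tracing macros):

use std::future::Future;
use std::pin::pin;
use std::time::Duration;

// Repeatedly poll `fut` with a timeout; each expiry logs a breadcrumb, then
// polling resumes on the same pinned future where it left off.
async fn run_with_liveness<F: Future<Output = O>, O>(period: Duration, fut: F) -> O {
    let mut fut = pin!(fut);
    loop {
        match tokio::time::timeout(period, &mut fut).await {
            Ok(output) => return output,
            Err(_elapsed) => eprintln!("still running"),
        }
    }
}

#[tokio::main]
async fn main() {
    let output = run_with_liveness(Duration::from_millis(200), async {
        tokio::time::sleep(Duration::from_millis(650)).await;
        42
    })
    .await;
    println!("output: {output}");
}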
// NB: the `task` and `period` are used for metrics labels.
pub(crate) fn warn_when_period_overrun(
    elapsed: Duration,
    period: Duration,
@@ -645,7 +576,7 @@ pub(crate) fn warn_when_period_overrun(
        ?task,
        "task iteration took longer than the configured period"
    );
    metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
        .with_label_values(&[task.into(), &format!("{}", period.as_secs())])
        .inc();
}
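As a usage illustration of the sleep_random_range / sleep_jitter helpers earlier in this hunk, a standalone sketch (assuming tokio, tokio-util and rand as dependencies; the helper is re-declared here, slightly simplified, so the snippet compiles on its own):

use std::ops::RangeInclusive;
use std::time::Duration;

use rand::Rng;
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
struct Cancelled;

// Same shape as the helper above: pick a random delay in `interval`, sleep,
// and bail out early if the cancellation token fires first.
async fn sleep_random_range(
    interval: RangeInclusive<Duration>,
    cancel: &CancellationToken,
) -> Result<Duration, Cancelled> {
    let delay = rand::thread_rng().gen_range(interval);
    tokio::select! {
        _ = cancel.cancelled() => Err(Cancelled),
        _ = tokio::time::sleep(delay) => Ok(delay),
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    // Jitter a 20s period by +/- 5%, as the housekeeping loop does.
    let period = Duration::from_secs(20);
    let jitter = period * 5 / 100;
    match sleep_random_range(period - jitter..=period + jitter, &cancel).await {
        Ok(slept) => println!("slept {slept:?}"),
        Err(Cancelled) => println!("cancelled"),
    }
}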
@@ -45,11 +45,9 @@ use rand::Rng;
use remote_storage::DownloadError;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::runtime::Handle;
use tokio::sync::mpsc::Sender;
use tokio::sync::{oneshot, watch, Notify};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::critical;
@@ -152,16 +150,15 @@ use super::{
    config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
    MaybeOffloaded,
};
use super::{
    debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline,
};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
    remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
    storage_layer::ReadableLayer,
};
use super::{secondary::heatmap::HeatMapLayer, GcError};

#[cfg(test)]
use pageserver_api::value::Value;
@@ -227,6 +224,7 @@ pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -353,8 +351,11 @@ pub struct Timeline {
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,

    // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which
    // we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it.
    // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than
    // the planned GC cutoff.
    pub applied_gc_cutoff_lsn: Rcu<Lsn>,

    pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
@@ -426,6 +427,9 @@ pub struct Timeline {
|
||||
/// If true, the last compaction failed.
|
||||
compaction_failed: AtomicBool,
|
||||
|
||||
/// Notifies the tenant compaction loop that there is pending L0 compaction work.
|
||||
l0_compaction_trigger: Arc<Notify>,
|
||||
|
||||
/// Make sure we only have one running gc at a time.
|
||||
///
|
||||
/// Must only be taken in two places:
|
||||
@@ -460,6 +464,16 @@ pub struct Timeline {
|
||||
|
||||
/// If Some, collects GetPage metadata for an ongoing PageTrace.
|
||||
pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
|
||||
|
||||
previous_heatmap: ArcSwapOption<PreviousHeatmap>,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
Active {
|
||||
heatmap: HeatMapTimeline,
|
||||
read_at: std::time::Instant,
|
||||
},
|
||||
Obsolete,
|
||||
}
|
||||
|
||||
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
|
||||
@@ -626,6 +640,71 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
|
||||
}
|
||||
}
|
||||
|
||||
/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes
|
||||
/// only and not used by the "real read path".
|
||||
pub enum ReadPathLayerId {
|
||||
PersistentLayer(PersistentLayerKey),
|
||||
InMemoryLayer(Range<Lsn>),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ReadPathLayerId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key),
|
||||
ReadPathLayerId::InMemoryLayer(range) => {
|
||||
write!(f, "in-mem {}..{}", range.start, range.end)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub struct ReadPath {
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
path: Vec<(ReadPathLayerId, KeySpace, Range<Lsn>)>,
|
||||
}
|
||||
|
||||
impl ReadPath {
|
||||
pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self {
|
||||
Self {
|
||||
keyspace,
|
||||
lsn,
|
||||
path: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_layer_visit(
|
||||
&mut self,
|
||||
layer_to_read: &ReadableLayer,
|
||||
keyspace_to_read: &KeySpace,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) {
|
||||
let id = match layer_to_read {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
ReadPathLayerId::PersistentLayer(layer.layer_desc().key())
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
ReadPathLayerId::InMemoryLayer(layer.get_lsn_range())
|
||||
}
|
||||
};
|
||||
self.path
|
||||
.push((id, keyspace_to_read.clone(), lsn_range.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ReadPath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?;
|
||||
for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() {
|
||||
writeln!(
|
||||
f,
|
||||
"{}: {} {}..{} {}",
|
||||
idx, layer_id, lsn_range.start, lsn_range.end, keyspace
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
@@ -633,6 +712,8 @@ pub struct MissingKeyError {
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
/// Debug information about the read path if there's an error
|
||||
read_path: Option<ReadPath>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
@@ -649,10 +730,15 @@ impl std::fmt::Display for MissingKeyError {
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if let Some(ref read_path) = self.read_path {
|
||||
write!(f, "\n{}", read_path)?;
|
||||
}
|
||||
|
||||
if let Some(ref backtrace) = self.backtrace {
|
||||
write!(f, "\n{}", backtrace)?;
|
||||
}
|
||||
@@ -802,8 +888,12 @@ pub(crate) enum CompactFlags {
|
||||
ForceRepartition,
|
||||
ForceImageLayerCreation,
|
||||
ForceL0Compaction,
|
||||
OnlyL0Compaction,
|
||||
EnhancedGcBottomMostCompaction,
|
||||
DryRun,
|
||||
/// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting
|
||||
/// compaction via HTTP API.
|
||||
NoYield,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
@@ -999,9 +1089,15 @@ impl Timeline {
        (history, gc_info.within_ancestor_pitr)
    }

    /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen
    pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
        self.applied_gc_cutoff_lsn.read()
    }

    /// Read timeline's planned GC cutoff: this is the logical end of history that users
    /// are allowed to read (based on configured PITR), even if physically we have more history.
    pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
        self.gc_info.read().unwrap().cutoffs.time
    }

    /// Look up given page version.
@@ -1069,6 +1165,7 @@ impl Timeline {
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
})),
|
||||
}
|
||||
}
|
||||
@@ -1195,6 +1292,13 @@ impl Timeline {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let read_path = if self.conf.enable_read_path_debugging {
|
||||
Some(ReadPath::new(keyspace.clone(), lsn))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let traversal_res: Result<(), _> = self
|
||||
.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
.await;
|
||||
@@ -1471,6 +1575,7 @@ impl Timeline {
|
||||
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
|
||||
|
||||
let mut gc_info = self.gc_info.write().unwrap();
|
||||
let planned_cutoff = gc_info.min_cutoff();
|
||||
|
||||
let valid_until = SystemTime::now() + length;
|
||||
|
||||
@@ -1491,7 +1596,7 @@ impl Timeline {
|
||||
existing_lease.clone()
|
||||
}
|
||||
Entry::Vacant(vacant) => {
|
||||
// Reject already GC-ed LSN if we are in AttachedSingle and
// not blocked by the lsn lease deadline.
|
||||
let validate = {
|
||||
let conf = self.tenant_conf.load();
|
||||
@@ -1500,9 +1605,12 @@ impl Timeline {
|
||||
};
|
||||
|
||||
if init || validate {
|
||||
let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
}
|
||||
if lsn < planned_cutoff {
|
||||
bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1704,35 +1812,48 @@ impl Timeline {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
///
|
||||
/// NB: the cancellation token is usually from a background task, but can also come from a
|
||||
/// request task.
|
||||
pub(crate) async fn compact_with_options(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
// most likely the cancellation token is from background task, but in tests it could be the
|
||||
// request task as well.
|
||||
// Acquire the compaction lock and task semaphore.
|
||||
//
|
||||
// L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved
|
||||
// out by other background tasks (including image compaction). We request this via
|
||||
// `BackgroundLoopKind::L0Compaction`.
|
||||
//
|
||||
// If this is a regular compaction pass, and L0-only compaction is enabled in the config,
|
||||
// then we should yield for immediate L0 compaction if necessary while we're waiting for the
|
||||
// background task semaphore. There's no point yielding otherwise, since we'd just end up
|
||||
// right back here.
|
||||
let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction);
|
||||
let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() {
|
||||
true => BackgroundLoopKind::L0Compaction,
|
||||
false => BackgroundLoopKind::Compaction,
|
||||
};
|
||||
let yield_for_l0 = !is_l0_only
|
||||
&& self.get_compaction_l0_first()
|
||||
&& !options.flags.contains(CompactFlags::NoYield);
|
||||
|
||||
let prepare = async move {
|
||||
let acquire = async move {
|
||||
let guard = self.compaction_lock.lock().await;
|
||||
|
||||
let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
BackgroundLoopKind::Compaction,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await;
|
||||
(guard, permit)
|
||||
};
|
||||
|
||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||
// compaction task goes over it's period (20s) which is quite often in production.
|
||||
let (_guard, _permit) = tokio::select! {
|
||||
tuple = prepare => { tuple },
|
||||
_ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
|
||||
_ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
|
||||
(guard, permit) = acquire => (guard, permit),
|
||||
_ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
_ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
|
||||
_ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped),
|
||||
};
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
@@ -1740,7 +1861,7 @@ impl Timeline {
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
if !last_record_lsn.is_valid() {
|
||||
warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
|
||||
return Ok(CompactionOutcome::Skipped);
}
|
||||
|
||||
let result = match self.get_compaction_algorithm_settings().kind {
|
||||
@@ -2242,6 +2363,20 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
|
||||
}
|
||||
|
||||
pub fn get_compaction_l0_first(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_l0_first
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
|
||||
}
|
||||
|
||||
pub fn get_compaction_l0_semaphore(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_l0_semaphore
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore)
|
||||
}
|
||||
|
||||
fn get_l0_flush_delay_threshold(&self) -> Option<usize> {
|
||||
// Disable L0 flushes by default. This and compaction needs further tuning.
|
||||
const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3
|
||||
@@ -2442,6 +2577,7 @@ impl Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
metadata: &TimelineMetadata,
|
||||
previous_heatmap: Option<PreviousHeatmap>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -2542,7 +2678,7 @@ impl Timeline {
|
||||
LastImageLayerCreationStatus::default(),
|
||||
)),
|
||||
|
||||
applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
|
||||
|
||||
current_logical_size: if disk_consistent_lsn.is_valid() {
|
||||
@@ -2583,6 +2719,7 @@ impl Timeline {
|
||||
|
||||
compaction_lock: tokio::sync::Mutex::default(),
|
||||
compaction_failed: AtomicBool::default(),
|
||||
l0_compaction_trigger: resources.l0_compaction_trigger,
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
standby_horizon: AtomicLsn::new(0),
|
||||
@@ -2603,6 +2740,8 @@ impl Timeline {
|
||||
create_idempotency,
|
||||
|
||||
page_trace: Default::default(),
|
||||
|
||||
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -2632,7 +2771,7 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
FlushLoopState::Exited => {
|
||||
info!(
"ignoring attempt to restart exited flush_loop {}/{}",
|
||||
self.tenant_shard_id, self.timeline_id
|
||||
);
|
||||
@@ -3056,7 +3195,7 @@ impl Timeline {
|
||||
let self_ref = &self;
|
||||
let skip_concurrency_limiter = &skip_concurrency_limiter;
|
||||
async move {
|
||||
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
let wait_for_permit = super::tasks::acquire_concurrency_permit(
|
||||
BackgroundLoopKind::InitialLogicalSizeCalculation,
|
||||
background_ctx,
|
||||
);
|
||||
@@ -3341,12 +3480,52 @@ impl Timeline {
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
// Firstly, if there's any heatmap left over from when this location
|
||||
// was a secondary, take that into account. Keep layers that are:
|
||||
// * present in the layer map
|
||||
// * visible
|
||||
// * non-resident
|
||||
// * not evicted since we read the heatmap
|
||||
//
|
||||
// Without this, a new cold, attached location would clobber the previous
|
||||
// heatmap.
|
||||
let previous_heatmap = self.previous_heatmap.load();
|
||||
let visible_non_resident = match previous_heatmap.as_deref() {
|
||||
Some(PreviousHeatmap::Active { heatmap, read_at }) => {
|
||||
Some(heatmap.layers.iter().filter_map(|hl| {
|
||||
let desc: PersistentLayerDesc = hl.name.clone().into();
|
||||
let layer = guard.try_get_from_key(&desc.key())?;
|
||||
|
||||
if layer.visibility() == LayerVisibilityHint::Covered {
|
||||
return None;
|
||||
}
|
||||
|
||||
if layer.is_likely_resident() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if layer.last_evicted_at().happened_after(*read_at) {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((desc, hl.metadata.clone(), hl.access_time))
|
||||
}))
|
||||
}
|
||||
Some(PreviousHeatmap::Obsolete) => None,
|
||||
None => None,
|
||||
};
|
||||
|
||||
// Secondly, all currently visible, resident layers are included.
|
||||
let resident = guard.likely_resident_layers().filter_map(|layer| {
|
||||
match layer.visibility() {
|
||||
LayerVisibilityHint::Visible => {
|
||||
// Layer is visible to one or more read LSNs: eligible for inclusion in layer map
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
|
||||
Some((
|
||||
layer.layer_desc().clone(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Layer is resident but unlikely to be read: not eligible for inclusion in heatmap.
|
||||
@@ -3355,7 +3534,18 @@ impl Timeline {
|
||||
}
|
||||
});
|
||||
|
||||
let mut layers = resident.collect::<Vec<_>>();
|
||||
let mut layers = match visible_non_resident {
|
||||
Some(non_resident) => {
|
||||
let mut non_resident = non_resident.peekable();
|
||||
if non_resident.peek().is_none() {
|
||||
self.previous_heatmap
|
||||
.store(Some(PreviousHeatmap::Obsolete.into()));
|
||||
}
|
||||
|
||||
non_resident.chain(resident).collect::<Vec<_>>()
|
||||
}
|
||||
None => resident.collect::<Vec<_>>(),
|
||||
};
|
||||
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be in the resident many minutes
|
||||
@@ -3502,6 +3692,7 @@ impl Timeline {
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
backtrace: None,
|
||||
read_path: std::mem::take(&mut reconstruct_state.read_path),
|
||||
}));
|
||||
}
|
||||
|
||||
@@ -3543,7 +3734,7 @@ impl Timeline {
|
||||
// the timeline, then it will remove layers that are required for fulfilling
|
||||
// the current get request (read-path cannot "look back" and notice the new
|
||||
// image layer).
|
||||
let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();
|
||||
// See `compaction::compact_with_gc` for why we need this.
|
||||
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
|
||||
@@ -3620,6 +3811,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
@@ -3920,6 +4114,12 @@ impl Timeline {
|
||||
}
|
||||
let flush_duration = flush_timer.stop_and_record();
|
||||
|
||||
// Notify the tenant compaction loop if L0 compaction is needed.
|
||||
let l0_count = *watch_l0.borrow();
|
||||
if l0_count >= self.get_compaction_threshold() {
|
||||
self.l0_compaction_trigger.notify_one();
|
||||
}
|
||||
|
||||
// Delay the next flush to backpressure if compaction can't keep up. We delay by the
|
||||
// flush duration such that the flush takes 2x as long. This is propagated up to WAL
|
||||
// ingestion by having ephemeral layer rolls wait for flushes.
|
||||
@@ -4092,6 +4292,7 @@ impl Timeline {
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
LastImageLayerCreationStatus::Initial,
|
||||
false, // don't yield for L0, we're flushing L0
|
||||
)
|
||||
.await?;
|
||||
debug_assert!(
|
||||
@@ -4220,7 +4421,7 @@ impl Timeline {
|
||||
let update = crate::tenant::metadata::MetadataUpdate::new(
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
*self.applied_gc_cutoff_lsn.read(),
);
|
||||
|
||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||
@@ -4664,6 +4865,7 @@ impl Timeline {
|
||||
mode: ImageLayerCreationMode,
|
||||
ctx: &RequestContext,
|
||||
last_status: LastImageLayerCreationStatus,
|
||||
yield_for_l0: bool,
|
||||
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
@@ -4860,7 +5062,7 @@ impl Timeline {
|
||||
|
||||
if let ImageLayerCreationMode::Try = mode {
|
||||
// We have at least made some progress
|
||||
if batch_image_writer.pending_layer_num() >= 1 {
|
||||
if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 {
|
||||
// The `Try` mode is currently only used on the compaction path. We want to avoid
|
||||
// image layer generation taking too long time and blocking L0 compaction. So in this
|
||||
// mode, we also inspect the current number of L0 layers and skip image layer generation
|
||||
@@ -5447,7 +5649,7 @@ impl Timeline {
|
||||
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
|
||||
// cannot advance beyond what was already GC'd, and respect space-based retention
|
||||
GcCutoffs {
|
||||
time: *self.get_applied_gc_cutoff_lsn(),
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
@@ -5568,7 +5770,7 @@ impl Timeline {
|
||||
let mut result: GcResult = GcResult::default();
|
||||
|
||||
// Nothing to GC. Return early.
|
||||
let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn();
if latest_gc_cutoff >= new_gc_cutoff {
|
||||
info!(
|
||||
"Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
|
||||
@@ -5582,7 +5784,7 @@ impl Timeline {
|
||||
//
|
||||
// The GC cutoff should only ever move forwards.
|
||||
let waitlist = {
|
||||
let write_guard = self.applied_gc_cutoff_lsn.lock_for_write();
if *write_guard > new_gc_cutoff {
|
||||
return Err(GcError::BadLsn {
|
||||
why: format!(
|
||||
@@ -6522,18 +6724,32 @@ fn is_send() {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use tracing::Instrument;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName, LayerVisibilityHint},
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
PreviousHeatmap, Timeline,
};
|
||||
|
||||
use super::HeatMapTimeline;
|
||||
|
||||
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
|
||||
assert_eq!(lhs.layers.len(), rhs.layers.len());
|
||||
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
|
||||
for (l, r) in lhs_rhs {
|
||||
assert_eq!(l.name, r.name);
|
||||
assert_eq!(l.metadata, r.metadata);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_heatmap_generation() {
|
||||
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
|
||||
@@ -6607,7 +6823,7 @@ mod tests {
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
let mut last_lsn = Lsn::MAX;
|
||||
for layer in &heatmap.layers {
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
@@ -6622,6 +6838,144 @@ mod tests {
|
||||
last_lsn = layer_lsn;
|
||||
}
|
||||
}
|
||||
|
||||
// Evict all the layers and stash the old heatmap in the timeline.
|
||||
// This simulates a migration to a cold secondary location.
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let mut all_layers = Vec::new();
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
for layer in guard.likely_resident_layers() {
|
||||
all_layers.push(layer.clone());
|
||||
layer.evict_and_wait(forever).await.unwrap();
|
||||
}
|
||||
drop(guard);
|
||||
|
||||
timeline
|
||||
.previous_heatmap
|
||||
.store(Some(Arc::new(PreviousHeatmap::Active {
|
||||
heatmap: heatmap.clone(),
|
||||
read_at: std::time::Instant::now(),
|
||||
})));
|
||||
|
||||
// Generate a new heatmap and assert that it contains the same layers as the old one.
|
||||
let post_migration_heatmap = timeline.generate_heatmap().await.unwrap();
|
||||
assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap);
|
||||
|
||||
// Download each layer one by one. Generate the heatmap at each step and check
|
||||
// that it's stable.
|
||||
for layer in all_layers {
|
||||
if layer.visibility() == LayerVisibilityHint::Covered {
|
||||
continue;
|
||||
}
|
||||
|
||||
eprintln!("Downloading {layer} and re-generating heatmap");
|
||||
|
||||
let _resident = layer
|
||||
.download_and_keep_resident()
|
||||
.instrument(tracing::info_span!(
|
||||
parent: None,
|
||||
"download_layer",
|
||||
tenant_id = %timeline.tenant_shard_id.tenant_id,
|
||||
shard_id = %timeline.tenant_shard_id.shard_slug(),
|
||||
timeline_id = %timeline.timeline_id
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let post_download_heatmap = timeline.generate_heatmap().await.unwrap();
|
||||
assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap);
|
||||
}
|
||||
|
||||
// Everything from the post-migration heatmap is now resident.
|
||||
// Check that we drop it from memory.
|
||||
assert!(matches!(
|
||||
timeline.previous_heatmap.load().as_deref(),
|
||||
Some(PreviousHeatmap::Obsolete)
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_previous_heatmap_obsoletion() {
|
||||
let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let l0_delta = DeltaLayerTestDesc::new(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
Key::from_hex("000000000000000000000000000000000000").unwrap()
|
||||
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x25),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
|
||||
let image_layer = (
|
||||
Lsn(0x40),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
test_img("bar"),
|
||||
)],
|
||||
);
|
||||
|
||||
let delta_layers = vec![l0_delta];
|
||||
let image_layers = vec![image_layer];
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Layer visibility is an input to heatmap generation, so refresh it first
|
||||
timeline.update_layer_visibility().await.unwrap();
|
||||
|
||||
let heatmap = timeline
|
||||
.generate_heatmap()
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
// Both layers should be in the heatmap
|
||||
assert!(!heatmap.layers.is_empty());
|
||||
|
||||
// Now simulate a migration.
|
||||
timeline
|
||||
.previous_heatmap
|
||||
.store(Some(Arc::new(PreviousHeatmap::Active {
|
||||
heatmap: heatmap.clone(),
|
||||
read_at: std::time::Instant::now(),
|
||||
})));
|
||||
|
||||
// Evict all the layers in the previous heatmap
|
||||
let guard = timeline.layers.read().await;
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
for layer in guard.likely_resident_layers() {
|
||||
layer.evict_and_wait(forever).await.unwrap();
|
||||
}
|
||||
drop(guard);
|
||||
|
||||
// Generate a new heatmap and check that the previous heatmap
|
||||
// has been marked obsolete.
|
||||
let post_eviction_heatmap = timeline
|
||||
.generate_heatmap()
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert!(post_eviction_heatmap.layers.is_empty());
|
||||
assert!(matches!(
|
||||
timeline.previous_heatmap.load().as_deref(),
|
||||
Some(PreviousHeatmap::Obsolete)
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -609,6 +609,11 @@ pub enum CompactionOutcome {
|
||||
/// Still has pending layers to be compacted after this round. Ideally, the scheduler
|
||||
/// should immediately schedule another compaction.
|
||||
Pending,
|
||||
/// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only
|
||||
/// guaranteed when `compaction_l0_first` is enabled).
|
||||
YieldForL0,
|
||||
/// Compaction was skipped, because the timeline is ineligible for compaction.
|
||||
Skipped,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -687,10 +692,25 @@ impl Timeline {
|
||||
|
||||
// Define partitioning schema if needed
|
||||
|
||||
let l0_l1_boundary_lsn = {
|
||||
// We do the repartition on the L0-L1 boundary. All data below the boundary
|
||||
// are compacted by L0 with low read amplification, thus making the `repartition`
|
||||
// function run fast.
|
||||
let guard = self.layers.read().await;
|
||||
let l0_min_lsn = guard
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.iter()
|
||||
.map(|l| l.get_lsn_range().start)
|
||||
.min()
|
||||
.unwrap_or(self.get_disk_consistent_lsn());
|
||||
l0_min_lsn.max(self.get_ancestor_lsn())
|
||||
};
|
||||
|
||||
// 1. L0 Compact
|
||||
let l0_compaction_outcome = {
|
||||
let l0_outcome = {
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
let l0_compaction_outcome = self
|
||||
let l0_outcome = self
|
||||
.compact_level0(
|
||||
target_file_size,
|
||||
options.flags.contains(CompactFlags::ForceL0Compaction),
|
||||
@@ -698,91 +718,103 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
l0_compaction_outcome
|
||||
l0_outcome
|
||||
};
|
||||
|
||||
if let CompactionOutcome::Pending = l0_compaction_outcome {
|
||||
// Yield and do not do any other kind of compaction. True means
|
||||
// that we have pending L0 compaction tasks and the compaction scheduler
|
||||
// will prioritize compacting this tenant/timeline again.
|
||||
info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
|
||||
return Ok(CompactionOutcome::Pending);
|
||||
if options.flags.contains(CompactFlags::OnlyL0Compaction) {
|
||||
return Ok(l0_outcome);
|
||||
}
|
||||
|
||||
// 2. Repartition and create image layers if necessary
|
||||
let partition_count = match self
|
||||
.repartition(
|
||||
self.get_last_record_lsn(), // TODO: use L0-L1 boundary
|
||||
self.get_compaction_target_size(),
|
||||
options.flags,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
// Yield if we have pending L0 compaction. The scheduler will do another pass.
|
||||
if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
|
||||
&& !options.flags.contains(CompactFlags::NoYield)
|
||||
{
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
info!("image/ancestor compaction yielding for L0 compaction");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
if l0_l1_boundary_lsn < self.partitioning.read().1 {
|
||||
// We never go backwards when repartition and create image layers.
|
||||
info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN.");
|
||||
} else {
|
||||
// 2. Repartition and create image layers if necessary
|
||||
match self
|
||||
.repartition(
|
||||
l0_l1_boundary_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
options.flags,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified "enough".
|
||||
let (image_layers, outcome) = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
if options
|
||||
.flags
|
||||
.contains(CompactFlags::ForceImageLayerCreation)
|
||||
{
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
self.last_image_layer_creation_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.clone(),
|
||||
)
|
||||
.await
|
||||
.inspect_err(|err| {
|
||||
if let CreateImageLayersError::GetVectoredError(
|
||||
GetVectoredError::MissingKey(_),
|
||||
) = err
|
||||
{
|
||||
critical!("missing key during compaction: {err:?}");
|
||||
}
|
||||
})?;
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
.parts
|
||||
.extend(sparse_partitioning.into_dense().parts);
|
||||
|
||||
self.last_image_layer_creation_status
|
||||
.store(Arc::new(outcome.clone()));
|
||||
// 3. Create new image layers for partitions that have been modified "enough".
|
||||
let (image_layers, outcome) = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
lsn,
|
||||
if options
|
||||
.flags
|
||||
.contains(CompactFlags::ForceImageLayerCreation)
|
||||
{
|
||||
ImageLayerCreationMode::Force
|
||||
} else {
|
||||
ImageLayerCreationMode::Try
|
||||
},
|
||||
&image_ctx,
|
||||
self.last_image_layer_creation_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.clone(),
|
||||
!options.flags.contains(CompactFlags::NoYield),
|
||||
)
|
||||
.await
|
||||
.inspect_err(|err| {
|
||||
if let CreateImageLayersError::GetVectoredError(
|
||||
GetVectoredError::MissingKey(_),
|
||||
) = err
|
||||
{
|
||||
critical!("missing key during compaction: {err:?}");
|
||||
}
|
||||
})?;
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
|
||||
// Yield and do not do any other kind of compaction.
|
||||
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
|
||||
return Ok(CompactionOutcome::Pending);
|
||||
self.last_image_layer_creation_status
|
||||
.store(Arc::new(outcome.clone()));
|
||||
|
||||
self.upload_new_image_layers(image_layers)?;
|
||||
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
|
||||
// Yield and do not do any other kind of compaction.
|
||||
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
}
|
||||
partitioning.parts.len()
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||
// error but continue.
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||
// error but continue.
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
tracing::error!(
|
||||
"could not compact, repartitioning keyspace failed: {err:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
1
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
let partition_count = self.partitioning.read().0 .0.parts.len();
|
||||
|
||||
// 4. Shard ancestor compaction
|
||||
|
||||
@@ -820,7 +852,7 @@ impl Timeline {
|
||||
//
|
||||
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
|
||||
// are rewriting layers.
|
||||
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
|
||||
tracing::info!(
|
||||
"latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
@@ -2170,7 +2202,7 @@ impl Timeline {
|
||||
|
||||
// TODO: ensure the child branches will not use anything below the watermark, or consider
|
||||
// them when computing the watermark.
|
||||
gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn())
}
|
||||
|
||||
/// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
|
||||
@@ -2238,8 +2270,11 @@ impl Timeline {
|
||||
split_key_ranges.push((start, end));
|
||||
}
|
||||
split_key_ranges.sort();
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let all_layers = {
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
let mut current_start = None;
|
||||
let ranges_num = split_key_ranges.len();
|
||||
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
|
||||
@@ -2251,14 +2286,23 @@ impl Timeline {
|
||||
// We have already processed this partition.
|
||||
continue;
|
||||
}
|
||||
let res = layer_map.range_search(start..end, compact_below_lsn);
|
||||
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
|
||||
let overlapping_layers = {
|
||||
let mut desc = Vec::new();
|
||||
for layer in all_layers.iter() {
|
||||
if overlaps_with(&layer.get_key_range(), &(start..end))
|
||||
&& layer.get_lsn_range().start <= compact_below_lsn
|
||||
{
|
||||
desc.push(layer.clone());
|
||||
}
|
||||
}
|
||||
desc
|
||||
};
|
||||
let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::<u64>();
|
||||
if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 {
|
||||
// Try to extend the compaction range so that we include at least one full layer file.
|
||||
let extended_end = res
|
||||
.found
|
||||
.keys()
|
||||
.map(|layer| layer.layer.key_range.end)
|
||||
let extended_end = overlapping_layers
|
||||
.iter()
|
||||
.map(|layer| layer.key_range.end)
|
||||
.min();
|
||||
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
|
||||
// In this case, we simply use the specified key range end.
|
||||
@@ -2285,7 +2329,6 @@ impl Timeline {
|
||||
current_start = Some(end);
|
||||
}
|
||||
}
|
||||
drop(guard);
|
||||
Ok(compact_jobs)
|
||||
}
|
||||
|
||||
|
||||
@@ -17,13 +17,11 @@ use crate::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
|
||||
TenantManifestError, TimelineOrOffloaded,
|
||||
TenantManifestError, Timeline, TimelineOrOffloaded,
|
||||
},
|
||||
virtual_file::MaybeFatalIo,
|
||||
};
|
||||
|
||||
use super::{Timeline, TimelineResources};
|
||||
|
||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
/// during attach or pageserver restart.
|
||||
/// See comment in persist_index_part_with_deleted_flag.
|
||||
@@ -296,12 +294,8 @@ impl DeleteTimelineFlow {
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: tenant.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
|
||||
},
|
||||
None, // Previous heatmap is not needed for deletion
|
||||
tenant.get_timeline_resources_for(remote_client),
|
||||
// Important. We don't pass the ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
@@ -341,6 +335,13 @@ impl DeleteTimelineFlow {
|
||||
let tenant_shard_id = timeline.tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id();
|
||||
|
||||
// Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest.
|
||||
let Ok(tenant_guard) = tenant.gate.enter() else {
|
||||
// It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion.
|
||||
info!("Tenant is shutting down, timeline deletion will be resumed when it next starts");
|
||||
return;
|
||||
};
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
@@ -348,6 +349,8 @@ impl DeleteTimelineFlow {
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
async move {
|
||||
let _guard = tenant_guard;
|
||||
|
||||
if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
|
||||
// Only log as an error if it's not a cancellation.
|
||||
if matches!(err, DeleteTimelineError::Cancelled) {
|
||||
|
||||
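The pattern above is "enter the tenant gate or skip": if the tenant is already shutting down, scheduling is skipped (safe, because the deletion is durably marked and resumes on the next startup); otherwise the guard moves into the spawned task so the tenant cannot finish shutting down underneath it. A rough analogue using `tokio_util::task::TaskTracker` as a stand-in for the internal gate type (names and error handling simplified):

```rust
use tokio_util::task::TaskTracker;

// Rough analogue only: the real tenant `Gate` hands out a guard object, which
// avoids the check-then-spawn race this sketch still has.
async fn schedule_timeline_deletion(tracker: &TaskTracker) {
    if tracker.is_closed() {
        // Tenant is shutting down; the deletion is already durably recorded,
        // so it is safe to skip and resume on the next startup.
        return;
    }
    tracker.spawn(async move {
        // ... delete remote layers, then update the tenant manifest ...
    });
}
```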
@@ -6,17 +6,20 @@ use crate::{
|
||||
task_mgr::TaskKind,
|
||||
tenant::{
|
||||
remote_timeline_client::index::GcBlockingReason::DetachAncestor,
|
||||
storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
|
||||
storage_layer::{
|
||||
layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer,
|
||||
},
|
||||
Tenant,
|
||||
},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use http_utils::error::ApiError;
|
||||
use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
|
||||
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum Error {
|
||||
@@ -351,18 +354,7 @@ pub(super) async fn prepare(
|
||||
|
||||
// FIXME: the fsync should be mandatory, after both rewrites and copies
|
||||
if wrote_any {
|
||||
let timeline_dir = VirtualFile::open(
|
||||
&detached
|
||||
.conf
|
||||
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
||||
timeline_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
||||
fsync_timeline_dir(detached, ctx).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -376,7 +368,7 @@ pub(super) async fn prepare(
|
||||
tasks.spawn(
|
||||
async move {
|
||||
let _permit = limiter.acquire().await;
|
||||
let owned = remote_copy(
|
||||
let (owned, did_hardlink) = remote_copy(
|
||||
&adopted,
|
||||
&timeline,
|
||||
timeline.generation,
|
||||
@@ -384,16 +376,20 @@ pub(super) async fn prepare(
|
||||
&timeline.cancel,
|
||||
)
|
||||
.await?;
|
||||
tracing::info!(layer=%owned, "remote copied");
|
||||
Ok(owned)
|
||||
tracing::info!(layer=%owned, did_hard_link=%did_hardlink, "remote copied");
|
||||
Ok((owned, did_hardlink))
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
}
|
||||
|
||||
let mut should_fsync = false;
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
match res {
|
||||
Ok(Ok(owned)) => {
|
||||
Ok(Ok((owned, did_hardlink))) => {
|
||||
if did_hardlink {
|
||||
should_fsync = true;
|
||||
}
|
||||
new_layers.push(owned);
|
||||
}
|
||||
Ok(Err(failed)) => {
|
||||
@@ -403,7 +399,10 @@ pub(super) async fn prepare(
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: fsync directory again if we hardlinked something
|
||||
// fsync directory again if we hardlinked something
|
||||
if should_fsync {
|
||||
fsync_timeline_dir(detached, ctx).await;
|
||||
}
|
||||
|
||||
let prepared = PreparedTimelineDetach { layers: new_layers };
|
||||
|
||||
@@ -629,35 +628,52 @@ async fn copy_lsn_prefix(
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote
|
||||
/// storage on successful return without the adopted layer being added to `index_part.json`.
|
||||
/// Creates a new Layer instance for the adopted layer, and ensures it is found in the remote
|
||||
/// storage on successful return, without the adopted layer being added to `index_part.json`.
|
||||
/// Returns (Layer, did hardlink)
|
||||
async fn remote_copy(
|
||||
adopted: &Layer,
|
||||
adoptee: &Arc<Timeline>,
|
||||
generation: Generation,
|
||||
shard_identity: ShardIdentity,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Layer, Error> {
|
||||
// depending if Layer::keep_resident we could hardlink
|
||||
|
||||
) -> Result<(Layer, bool), Error> {
|
||||
let mut metadata = adopted.metadata();
|
||||
debug_assert!(metadata.generation <= generation);
|
||||
metadata.generation = generation;
|
||||
metadata.shard = shard_identity.shard_index();
|
||||
|
||||
let owned = crate::tenant::storage_layer::Layer::for_evicted(
|
||||
adoptee.conf,
|
||||
adoptee,
|
||||
adopted.layer_desc().layer_name(),
|
||||
metadata,
|
||||
);
|
||||
let conf = adoptee.conf;
|
||||
let file_name = adopted.layer_desc().layer_name();
|
||||
|
||||
adoptee
|
||||
// Depending on whether `Layer::keep_resident` succeeds, hard-link the local file.
|
||||
let did_hardlink;
|
||||
let owned = if let Some(adopted_resident) = adopted.keep_resident().await {
|
||||
let adopted_path = adopted_resident.local_path();
|
||||
let adoptee_path = local_layer_path(
|
||||
conf,
|
||||
&adoptee.tenant_shard_id,
|
||||
&adoptee.timeline_id,
|
||||
&file_name,
|
||||
&metadata.generation,
|
||||
);
|
||||
std::fs::hard_link(adopted_path, &adoptee_path)
|
||||
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
|
||||
did_hardlink = true;
|
||||
Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard()
|
||||
} else {
|
||||
did_hardlink = false;
|
||||
Layer::for_evicted(conf, adoptee, file_name, metadata)
|
||||
};
|
||||
|
||||
let layer = adoptee
|
||||
.remote_client
|
||||
.copy_timeline_layer(adopted, &owned, cancel)
|
||||
.await
|
||||
.map(move |()| owned)
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
|
||||
Ok((layer, did_hardlink))
|
||||
}
|
||||
|
||||
pub(crate) enum DetachingAndReparenting {
|
||||
@@ -1001,3 +1017,16 @@ fn check_no_archived_children_of_ancestor(
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn fsync_timeline_dir(timeline: &Timeline, ctx: &RequestContext) {
|
||||
let path = &timeline
|
||||
.conf
|
||||
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id);
|
||||
let timeline_dir = VirtualFile::open(&path, ctx)
|
||||
.await
|
||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
||||
timeline_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
||||
}
|
||||
|
||||
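The `fsync_timeline_dir` helper above exists because a hard link only creates a new directory entry; to make that entry durable, the parent directory itself must be fsynced, not just the file. A minimal std-only sketch of the same idea (the real code goes through `VirtualFile`):

```rust
use std::fs::{self, File};
use std::io;
use std::path::Path;

fn hardlink_and_fsync_dir(src: &Path, dst_dir: &Path, file_name: &str) -> io::Result<()> {
    let dst = dst_dir.join(file_name);
    // A hard link shares the inode with `src`; no data is copied.
    fs::hard_link(src, &dst)?;
    // On Linux, the new directory entry only becomes durable once the parent
    // directory is fsynced.
    File::open(dst_dir)?.sync_all()?;
    Ok(())
}
```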
@@ -32,7 +32,7 @@ use crate::{
|
||||
tenant::{
|
||||
size::CalculateSyntheticSizeError,
|
||||
storage_layer::LayerVisibilityHint,
|
||||
tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit},
|
||||
tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit},
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
@@ -83,8 +83,6 @@ impl Timeline {
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
|
||||
// acquire the gate guard only once within a useful span
|
||||
let Ok(guard) = self.gate.enter() else {
|
||||
return;
|
||||
@@ -97,7 +95,7 @@ impl Timeline {
|
||||
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||
};
|
||||
if random_init_delay(period, &self.cancel).await.is_err() {
|
||||
if sleep_random(period, &self.cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
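`sleep_random` (replacing `random_init_delay` here) jitters each timeline's first eviction iteration so the background loops of many timelines do not all wake up at once. A sketch of what such a jittered, cancellable delay looks like; the actual helper lives in `tenant::tasks`, and the names below are illustrative only:

```rust
use std::time::Duration;

use rand::Rng;
use tokio_util::sync::CancellationToken;

/// Sleep for a random fraction of `period`, or return Err(()) if cancelled first.
async fn sleep_random_sketch(period: Duration, cancel: &CancellationToken) -> Result<(), ()> {
    let delay = period.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
    tokio::select! {
        _ = tokio::time::sleep(delay) => Ok(()),
        _ = cancel.cancelled() => Err(()),
    }
}
```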
@@ -334,10 +332,8 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
|
||||
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
BackgroundLoopKind::Eviction,
|
||||
ctx,
|
||||
);
|
||||
let acquire_permit =
|
||||
crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);
|
||||
|
||||
tokio::select! {
|
||||
permit = acquire_permit => ControlFlow::Continue(permit),
|
||||
|
||||
@@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
|
||||
/// bad storage or bad configuration, and we can't fix that from inside
|
||||
/// a running process.
|
||||
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
|
||||
tracing::error!("Fatal I/O error: {e}: {context})");
|
||||
let backtrace = std::backtrace::Backtrace::force_capture();
|
||||
tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}");
|
||||
std::process::abort();
|
||||
}
|
||||
|
||||
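The change above attaches a captured backtrace to the fatal-error log before aborting. `Backtrace::force_capture` is used (rather than `Backtrace::capture`) because it collects the trace even when `RUST_BACKTRACE` is unset, which is exactly what you want right before `abort()`. A minimal sketch:

```rust
fn fatal_io_error(context: &str, e: &std::io::Error) -> ! {
    // force_capture() ignores RUST_BACKTRACE and always collects the trace.
    let backtrace = std::backtrace::Backtrace::force_capture();
    eprintln!("Fatal I/O error: {e}: {context}\n{backtrace}");
    std::process::abort();
}
```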
@@ -947,13 +948,18 @@ impl VirtualFileInner {
|
||||
where
|
||||
Buf: tokio_epoll_uring::IoBufMut + Send,
|
||||
{
|
||||
let file_guard = match self.lock_file().await {
|
||||
let file_guard = match self
|
||||
.lock_file()
|
||||
.await
|
||||
.maybe_fatal_err("lock_file inside VirtualFileInner::read_at")
|
||||
{
|
||||
Ok(file_guard) => file_guard,
|
||||
Err(e) => return (buf, Err(e)),
|
||||
};
|
||||
|
||||
observe_duration!(StorageIoOperation::Read, {
|
||||
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
|
||||
let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
|
||||
if let Ok(size) = res {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&[
|
||||
|
||||
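The `read_at` change routes both the `lock_file` result and the io-engine result through `maybe_fatal_err`, so errors that indicate broken storage abort the process instead of bubbling up as per-request failures. A sketch of the idea behind such a helper; the real trait is `virtual_file::MaybeFatalIo`, and the error classification below is illustrative:

```rust
use std::io;

trait MaybeFatalIoSketch<T> {
    fn maybe_fatal_err(self, context: &str) -> io::Result<T>;
}

impl<T> MaybeFatalIoSketch<T> for io::Result<T> {
    fn maybe_fatal_err(self, context: &str) -> io::Result<T> {
        if let Err(e) = &self {
            // Treat EIO / EROFS as unrecoverable hardware or configuration problems.
            if matches!(e.raw_os_error(), Some(5 /* EIO */) | Some(30 /* EROFS */)) {
                eprintln!("Fatal I/O error: {e}: {context}");
                std::process::abort();
            }
        }
        self
    }
}
```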
@@ -14,7 +14,7 @@
|
||||
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "extension_server.h"
|
||||
#include "extension_server.h"
|
||||
#include "neon_utils.h"
|
||||
|
||||
static int extension_server_port = 0;
|
||||
@@ -45,7 +45,7 @@ neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
handle = alloc_curl_handle();
|
||||
|
||||
curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
|
||||
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
|
||||
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ );
|
||||
}
|
||||
|
||||
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
||||
|
||||
@@ -3765,7 +3765,7 @@ neon_dbsize(Oid dbNode)
|
||||
* neon_truncate() -- Truncate relation to specified number of blocks.
|
||||
*/
|
||||
static void
|
||||
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
|
||||
@@ -3780,7 +3780,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
case RELPERSISTENCE_UNLOGGED:
|
||||
mdtruncate(reln, forknum, nblocks);
|
||||
mdtruncate(reln, forknum, old_blocks, nblocks);
|
||||
return;
|
||||
|
||||
default:
|
||||
@@ -3818,7 +3818,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdtruncate(reln, forknum, nblocks);
|
||||
mdtruncate(reln, forknum, old_blocks, nblocks);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks);
|
||||
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
BlockNumber old_blocks, BlockNumber nblocks);
|
||||
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
|
||||
@@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
* inmem_truncate() -- Truncate relation to specified number of blocks.
|
||||
*/
|
||||
static void
|
||||
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
74
poetry.lock
generated
@@ -1030,52 +1030,56 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "43.0.1"
|
||||
version = "44.0.1"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = "!=3.9.0,!=3.9.1,>=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"},
|
||||
{file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"},
|
||||
{file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"},
|
||||
{file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"},
|
||||
{file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"},
|
||||
{file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"},
|
||||
{file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"},
|
||||
{file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"},
|
||||
{file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"},
|
||||
{file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"},
|
||||
{file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
|
||||
docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
|
||||
nox = ["nox"]
|
||||
pep8test = ["check-sdist", "click", "mypy", "ruff"]
|
||||
sdist = ["build"]
|
||||
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"]
|
||||
docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"]
|
||||
nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"]
|
||||
pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"]
|
||||
sdist = ["build (>=1.0.0)"]
|
||||
ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
|
||||
test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -37,6 +37,7 @@ hex.workspace = true
|
||||
hmac.workspace = true
|
||||
hostname.workspace = true
|
||||
http.workspace = true
|
||||
http-utils.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper0.workspace = true
|
||||
|
||||
@@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation
|
||||
|
||||
If both postgres and proxy are running you may send a SQL query:
|
||||
```console
|
||||
curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
|
||||
-H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
|
||||
curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \
|
||||
-H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
|
||||
@@ -104,7 +104,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie
|
||||
|
||||
## Test proxy locally
|
||||
|
||||
Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`.
|
||||
Proxy determines the project name from the subdomain: a request to `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use `*.local.neon.build`, which resolves to `127.0.0.1`.
|
||||
|
||||
We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows:
|
||||
```sh
|
||||
@@ -125,7 +125,7 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPER
|
||||
|
||||
Let's create self-signed certificate by running:
|
||||
```sh
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build"
|
||||
```
|
||||
|
||||
Then we need to build proxy with 'testing' feature and run, e.g.:
|
||||
@@ -136,5 +136,5 @@ RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backe
|
||||
Now from client you can start a new session:
|
||||
|
||||
```sh
|
||||
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full"
|
||||
PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full"
|
||||
```
|
||||
|
||||
@@ -108,6 +108,10 @@ impl<T> Backend<'_, T> {
|
||||
Self::Local(_) => panic!("Local backend has no API"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_local_proxy(&self) -> bool {
|
||||
matches!(self, Self::Local(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Backend<'a, T> {
|
||||
|
||||
@@ -1,416 +1,7 @@
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use compute_api::spec::LocalProxySpec;
|
||||
use futures::future::Either;
|
||||
use proxy::auth::backend::jwt::JwkCache;
|
||||
use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP};
|
||||
use proxy::auth::{self};
|
||||
use proxy::cancellation::CancellationHandler;
|
||||
use proxy::config::{
|
||||
self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
|
||||
};
|
||||
use proxy::control_plane::locks::ApiLocks;
|
||||
use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings};
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
use proxy::intern::RoleNameInt;
|
||||
use proxy::metrics::{Metrics, ThreadPoolMetrics};
|
||||
use proxy::rate_limiter::{
|
||||
BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo,
|
||||
};
|
||||
use proxy::scram::threadpool::ThreadPool;
|
||||
use proxy::serverless::cancel_set::CancelSet;
|
||||
use proxy::serverless::{self, GlobalConnPoolOptions};
|
||||
use proxy::tls::client_config::compute_client_config_with_root_certs;
|
||||
use proxy::types::RoleName;
|
||||
use proxy::url::ApiUrl;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
use clap::Parser;
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::sync::Notify;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{pid_file, project_build_tag, project_git_version};
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
/// Neon proxy/router
|
||||
#[derive(Parser)]
|
||||
#[command(version = GIT_VERSION, about)]
|
||||
struct LocalProxyCliArgs {
|
||||
/// listen for incoming metrics connections on ip:port
|
||||
#[clap(long, default_value = "127.0.0.1:7001")]
|
||||
metrics: String,
|
||||
/// listen for incoming http connections on ip:port
|
||||
#[clap(long)]
|
||||
http: String,
|
||||
/// timeout for the TLS handshake
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
handshake_timeout: tokio::time::Duration,
|
||||
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
|
||||
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
|
||||
connect_compute_lock: String,
|
||||
#[clap(flatten)]
|
||||
sql_over_http: SqlOverHttpArgs,
|
||||
/// User rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
||||
/// Can be given multiple times for different bucket sizes.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
user_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Whether the auth rate limiter actually takes effect (for testing)
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
auth_rate_limit_enabled: bool,
|
||||
/// Authentication rate limiter max number of hashes per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
|
||||
auth_rate_limit: Vec<RateBucketInfo>,
|
||||
/// The IP subnet to use when considering whether two IP addresses are considered the same.
|
||||
#[clap(long, default_value_t = 64)]
|
||||
auth_rate_limit_ip_subnet: u8,
|
||||
/// Whether to retry the connection to the compute node
|
||||
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
|
||||
connect_to_compute_retry: String,
|
||||
/// Address of the postgres server
|
||||
#[clap(long, default_value = "127.0.0.1:5432")]
|
||||
postgres: SocketAddr,
|
||||
/// Address of the compute-ctl api service
|
||||
#[clap(long, default_value = "http://127.0.0.1:3080/")]
|
||||
compute_ctl: ApiUrl,
|
||||
/// Path of the local proxy config file
|
||||
#[clap(long, default_value = "./local_proxy.json")]
|
||||
config_path: Utf8PathBuf,
|
||||
/// Path of the local proxy PID file
|
||||
#[clap(long, default_value = "./local_proxy.pid")]
|
||||
pid_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||
struct SqlOverHttpArgs {
|
||||
/// How many connections to pool for each endpoint. Excess connections are discarded
|
||||
#[clap(long, default_value_t = 200)]
|
||||
sql_over_http_pool_max_total_conns: usize,
|
||||
|
||||
/// How long pooled connections should remain idle for before closing
|
||||
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_idle_timeout: tokio::time::Duration,
|
||||
|
||||
#[clap(long, default_value_t = 100)]
|
||||
sql_over_http_client_conn_threshold: u64,
|
||||
|
||||
#[clap(long, default_value_t = 16)]
|
||||
sql_over_http_cancel_set_shards: usize,
|
||||
|
||||
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
|
||||
sql_over_http_max_request_size_bytes: usize,
|
||||
|
||||
#[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
|
||||
sql_over_http_max_response_size_bytes: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = proxy::logging::init_local_proxy()?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
|
||||
|
||||
// TODO: refactor these to use labels
|
||||
debug!("Version: {GIT_VERSION}");
|
||||
debug!("Build_tag: {BUILD_TAG}");
|
||||
let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
|
||||
revision: GIT_VERSION,
|
||||
build_tag: BUILD_TAG,
|
||||
});
|
||||
|
||||
let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
|
||||
Ok(t) => Some(t),
|
||||
Err(e) => {
|
||||
tracing::error!(error = ?e, "could not start jemalloc metrics loop");
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let args = LocalProxyCliArgs::parse();
|
||||
let config = build_config(&args)?;
|
||||
let auth_backend = build_auth_backend(&args)?;
|
||||
|
||||
// before we bind to any ports, write the process ID to a file
|
||||
// so that compute-ctl can find our process later
|
||||
// in order to trigger the appropriate SIGHUP on config change.
|
||||
//
|
||||
// This also claims a "lock" that makes sure only one instance
|
||||
// of local_proxy runs at a time.
|
||||
let _process_guard = loop {
|
||||
match pid_file::claim_for_current_process(&args.pid_path) {
|
||||
Ok(guard) => break guard,
|
||||
Err(e) => {
|
||||
// compute-ctl might have tried to read the pid-file to let us
|
||||
// know about some config change. We should try again.
|
||||
error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
|
||||
let http_listener = TcpListener::bind(args.http).await?;
|
||||
let shutdown = CancellationToken::new();
|
||||
|
||||
// todo: should scale with CU
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
LeakyBucketConfig {
|
||||
rps: 10.0,
|
||||
max: 100.0,
|
||||
},
|
||||
16,
|
||||
));
|
||||
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
|
||||
let refresh_config_notify = Arc::new(Notify::new());
|
||||
maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), {
|
||||
let refresh_config_notify = Arc::clone(&refresh_config_notify);
|
||||
move || {
|
||||
refresh_config_notify.notify_one();
|
||||
}
|
||||
}));
|
||||
|
||||
// trigger the first config load **after** setting up the signal hook
|
||||
// to avoid the race condition where:
|
||||
// 1. No config file registered when local_proxy starts up
|
||||
// 2. The config file is written but the signal hook is not yet received
|
||||
// 3. local_proxy completes startup but has no config loaded, despite there being a registered config.
|
||||
refresh_config_notify.notify_one();
|
||||
tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify));
|
||||
|
||||
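// A side note on the ordering above (not part of this change): `tokio::sync::Notify::notify_one`
// stores a permit when no task is waiting yet, so triggering the first config load before
// `refresh_config_loop` reaches its `rx.notified().await` is not lost -- the loop's first
// `notified().await` completes immediately from the stored permit.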
maintenance_tasks.spawn(proxy::http::health_server::task_main(
|
||||
metrics_listener,
|
||||
AppMetrics {
|
||||
jemalloc,
|
||||
neon_metrics,
|
||||
proxy: proxy::metrics::Metrics::get(),
|
||||
},
|
||||
));
|
||||
|
||||
let task = serverless::task_main(
|
||||
config,
|
||||
auth_backend,
|
||||
http_listener,
|
||||
shutdown.clone(),
|
||||
Arc::new(CancellationHandler::new(&config.connect_to_compute, None)),
|
||||
endpoint_rate_limiter,
|
||||
);
|
||||
|
||||
match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
|
||||
// exit immediately on maintenance task completion
|
||||
Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {},
|
||||
// exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
|
||||
Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
|
||||
// exit immediately on client task error
|
||||
Either::Right((res, _)) => res?,
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
limiter,
|
||||
epoch,
|
||||
timeout,
|
||||
} = args.connect_compute_lock.parse()?;
|
||||
info!(
|
||||
?limiter,
|
||||
shards,
|
||||
?epoch,
|
||||
"Using NodeLocks (connect_compute)"
|
||||
);
|
||||
let connect_compute_locks = ApiLocks::new(
|
||||
"connect_compute_lock",
|
||||
limiter,
|
||||
shards,
|
||||
timeout,
|
||||
epoch,
|
||||
&Metrics::get().proxy.connect_compute_lock,
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
accept_websockets: false,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
gc_epoch: Duration::from_secs(60),
|
||||
pool_shards: 2,
|
||||
idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
|
||||
opt_in: false,
|
||||
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
|
||||
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
|
||||
},
|
||||
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
|
||||
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
|
||||
max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
|
||||
max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
|
||||
};
|
||||
|
||||
let compute_config = ComputeConfig {
|
||||
retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?,
|
||||
tls: Arc::new(compute_client_config_with_root_certs()?),
|
||||
timeout: Duration::from_secs(2),
|
||||
};
|
||||
|
||||
Ok(Box::leak(Box::new(ProxyConfig {
|
||||
tls_config: None,
|
||||
metric_collection: None,
|
||||
http_config,
|
||||
authentication_config: AuthenticationConfig {
|
||||
jwks_cache: JwkCache::default(),
|
||||
thread_pool: ThreadPool::new(0),
|
||||
scram_protocol_timeout: Duration::from_secs(10),
|
||||
rate_limiter_enabled: false,
|
||||
rate_limiter: BucketRateLimiter::new(vec![]),
|
||||
rate_limit_ip_subnet: 64,
|
||||
ip_allowlist_check_enabled: true,
|
||||
is_vpc_acccess_proxy: false,
|
||||
is_auth_broker: false,
|
||||
accept_jwts: true,
|
||||
console_redirect_confirmation_timeout: Duration::ZERO,
|
||||
},
|
||||
proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
|
||||
handshake_timeout: Duration::from_secs(10),
|
||||
region: "local".into(),
|
||||
wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
|
||||
connect_compute_locks,
|
||||
connect_to_compute: compute_config,
|
||||
})))
|
||||
}
|
||||
|
||||
/// auth::Backend is created at proxy startup, and lives forever.
|
||||
fn build_auth_backend(
|
||||
args: &LocalProxyCliArgs,
|
||||
) -> anyhow::Result<&'static auth::Backend<'static, ()>> {
|
||||
let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
|
||||
LocalBackend::new(args.postgres, args.compute_ctl.clone()),
|
||||
));
|
||||
|
||||
Ok(Box::leak(Box::new(auth_backend)))
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
enum RefreshConfigError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] std::io::Error),
|
||||
#[error(transparent)]
|
||||
Parse(#[from] serde_json::Error),
|
||||
#[error(transparent)]
|
||||
Validate(anyhow::Error),
|
||||
}
|
||||
|
||||
async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
|
||||
let mut init = true;
|
||||
loop {
|
||||
rx.notified().await;
|
||||
|
||||
match refresh_config_inner(&path).await {
|
||||
Ok(()) => {}
|
||||
// don't log for file not found errors if this is the first time we are checking
|
||||
// for computes that don't use local_proxy, this is not an error.
|
||||
Err(RefreshConfigError::Read(e))
|
||||
if init && e.kind() == std::io::ErrorKind::NotFound =>
|
||||
{
|
||||
debug!(error=?e, ?path, "could not read config file");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error=?e, ?path, "could not read config file");
|
||||
}
|
||||
}
|
||||
|
||||
init = false;
|
||||
}
|
||||
}
|
||||
|
||||
async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> {
|
||||
let bytes = tokio::fs::read(&path).await?;
|
||||
let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
|
||||
|
||||
let mut jwks_set = vec![];
|
||||
|
||||
fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
|
||||
let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
|
||||
|
||||
ensure!(
|
||||
jwks_url.has_authority()
|
||||
&& (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
|
||||
"Invalid JWKS url. Must be HTTP",
|
||||
);
|
||||
|
||||
ensure!(
|
||||
jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
|
||||
"Invalid JWKS url. No domain listed",
|
||||
);
|
||||
|
||||
// clear username, password and ports
|
||||
jwks_url
|
||||
.set_username("")
|
||||
.expect("url can be a base and has a valid host and is not a file. should not error");
|
||||
jwks_url
|
||||
.set_password(None)
|
||||
.expect("url can be a base and has a valid host and is not a file. should not error");
|
||||
// local testing is hard if we need to have a specific restricted port
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks_url.set_port(None).expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
}
|
||||
|
||||
// clear query params
|
||||
jwks_url.set_fragment(None);
|
||||
jwks_url.query_pairs_mut().clear().finish();
|
||||
|
||||
if jwks_url.scheme() != "https" {
|
||||
// local testing is hard if we need to set up https support.
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks_url
|
||||
.set_scheme("https")
|
||||
.expect("should not error to set the scheme to https if it was http");
|
||||
} else {
|
||||
warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(JwksSettings {
|
||||
id: jwks.id,
|
||||
jwks_url,
|
||||
provider_name: jwks.provider_name,
|
||||
jwt_audience: jwks.jwt_audience,
|
||||
role_names: jwks
|
||||
.role_names
|
||||
.into_iter()
|
||||
.map(RoleName::from)
|
||||
.map(|s| RoleNameInt::from(&s))
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
|
||||
for jwks in data.jwks.into_iter().flatten() {
|
||||
jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
|
||||
}
|
||||
|
||||
info!("successfully loaded new config");
|
||||
JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
|
||||
|
||||
Ok(())
|
||||
proxy::binary::local_proxy::run().await
|
||||
}
|
||||
|
||||
@@ -1,299 +1,10 @@
|
||||
/// A stand-alone program that routes connections, e.g. from
|
||||
/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
|
||||
///
|
||||
/// This allows connecting to pods/services running in the same Kubernetes cluster from
|
||||
/// the outside. Similar to an ingress controller for HTTPS.
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::Arg;
|
||||
use futures::future::Either;
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use proxy::context::RequestContext;
|
||||
use proxy::metrics::{Metrics, ThreadPoolMetrics};
|
||||
use proxy::protocol2::ConnectionInfo;
|
||||
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
|
||||
use proxy::stream::{PqStream, Stream};
|
||||
use proxy::tls::TlsServerEndPoint;
|
||||
use rustls::crypto::ring;
|
||||
use rustls::pki_types::PrivateKeyDer;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::net::TcpListener;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, Instrument};
|
||||
use utils::project_git_version;
|
||||
use utils::sentry_init::init_sentry;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn cli() -> clap::Command {
|
||||
clap::Command::new("Neon proxy/router")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("listen")
|
||||
.short('l')
|
||||
.long("listen")
|
||||
.help("listen for incoming client connections on ip:port")
|
||||
.default_value("127.0.0.1:4432"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-key")
|
||||
.short('k')
|
||||
.long("tls-key")
|
||||
.help("path to TLS key for client postgres connections")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tls-cert")
|
||||
.short('c')
|
||||
.long("tls-cert")
|
||||
.help("path to TLS cert for client postgres connections")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("dest")
|
||||
.short('d')
|
||||
.long("destination")
|
||||
.help("append this domain zone to the SNI hostname to get the destination address")
|
||||
.required(true),
|
||||
)
|
||||
}
|
||||
//! A stand-alone program that routes connections, e.g. from
|
||||
//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
|
||||
//!
|
||||
//! This allows connecting to pods/services running in the same Kubernetes cluster from
|
||||
//! the outside. Similar to an ingress controller for HTTPS.
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = proxy::logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
|
||||
|
||||
let args = cli().get_matches();
|
||||
let destination: String = args.get_one::<String>("dest").unwrap().parse()?;
|
||||
|
||||
// Configure TLS
|
||||
let (tls_config, tls_server_end_point): (Arc<rustls::ServerConfig>, TlsServerEndPoint) = match (
|
||||
args.get_one::<String>("tls-key"),
|
||||
args.get_one::<String>("tls-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => {
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||
|
||||
let mut keys =
|
||||
rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec();
|
||||
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
PrivateKeyDer::Pkcs8(
|
||||
keys.pop()
|
||||
.unwrap()
|
||||
.context(format!("Failed to read TLS keys at '{key_path}'"))?,
|
||||
)
|
||||
};
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let cert_chain: Vec<_> = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.try_collect()
|
||||
.with_context(|| {
|
||||
format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.")
|
||||
})?
|
||||
};
|
||||
|
||||
// needed for channel bindings
|
||||
let first_cert = cert_chain.first().context("missing certificate")?;
|
||||
let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
|
||||
|
||||
let tls_config =
|
||||
rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
|
||||
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
|
||||
.context("ring should support TLS1.2 and TLS1.3")?
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(cert_chain, key)?
|
||||
.into();
|
||||
|
||||
(tls_config, tls_server_end_point)
|
||||
}
|
||||
_ => bail!("tls-key and tls-cert must be specified"),
|
||||
};
|
||||
|
||||
// Start listening for incoming client connections
|
||||
let proxy_address: SocketAddr = args.get_one::<String>("listen").unwrap().parse()?;
|
||||
info!("Starting sni router on {proxy_address}");
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let main = tokio::spawn(task_main(
|
||||
Arc::new(destination),
|
||||
tls_config,
|
||||
tls_server_end_point,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {}));
|
||||
|
||||
// the signal task cant ever succeed.
|
||||
// the main task can error, or can succeed on cancellation.
|
||||
// we want to immediately exit on either of these cases
|
||||
let signal = match futures::future::select(signals_task, main).await {
|
||||
Either::Left((res, _)) => proxy::error::flatten_err(res)?,
|
||||
Either::Right((res, _)) => return proxy::error::flatten_err(res),
|
||||
};
|
||||
|
||||
// maintenance tasks return `Infallible` success values, this is an impossible value
|
||||
// so this match statically ensures that there are no possibilities for that value
|
||||
match signal {}
|
||||
}
|
||||
|
||||
async fn task_main(
|
||||
dest_suffix: Arc<String>,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// When set for the server socket, the keepalive setting
|
||||
// will be inherited by all accepted client sockets.
|
||||
socket2::SockRef::from(&listener).set_keepalive(true)?;
|
||||
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
|
||||
while let Some(accept_result) =
|
||||
run_until_cancelled(listener.accept(), &cancellation_token).await
|
||||
{
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let tls_config = Arc::clone(&tls_config);
|
||||
let dest_suffix = Arc::clone(&dest_suffix);
|
||||
|
||||
connections.spawn(
|
||||
async move {
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
info!(%peer_addr, "serving");
|
||||
let ctx = RequestContext::new(
|
||||
session_id,
|
||||
ConnectionInfo {
|
||||
addr: peer_addr,
|
||||
extra: None,
|
||||
},
|
||||
proxy::metrics::Protocol::SniRouter,
|
||||
"sni",
|
||||
);
|
||||
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
})
|
||||
.instrument(tracing::info_span!("handle_client", ?session_id)),
|
||||
);
|
||||
}
|
||||
|
||||
connections.close();
|
||||
drop(listener);
|
||||
|
||||
connections.wait().await;
|
||||
|
||||
info!("all client connections have finished");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
ctx: &RequestContext,
|
||||
raw_stream: S,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
) -> anyhow::Result<Stream<S>> {
|
||||
let mut stream = PqStream::new(Stream::from_raw(raw_stream));
|
||||
|
||||
let msg = stream.read_startup_packet().await?;
|
||||
use pq_proto::FeStartupPacket::*;
|
||||
|
||||
match msg {
|
||||
SslRequest { direct: false } => {
|
||||
stream
|
||||
.write_message(&pq_proto::BeMessage::EncryptionResponse(true))
|
||||
.await?;
|
||||
|
||||
// Upgrade raw stream into a secure TLS-backed stream.
|
||||
// NOTE: We've consumed `tls`; this fact will be used later.
|
||||
|
||||
let (raw, read_buf) = stream.into_inner();
|
||||
// TODO: Normally, client doesn't send any data before
|
||||
// server says TLS handshake is ok and read_buf is empty.
|
||||
// However, you could imagine pipelining of postgres
|
||||
// SSLRequest + TLS ClientHello in one hunk similar to
|
||||
// pipelining in our node js driver. We should probably
|
||||
// support that by chaining read_buf with the stream.
|
||||
if !read_buf.is_empty() {
|
||||
bail!("data is sent before server replied with EncryptionResponse");
|
||||
}
|
||||
|
||||
Ok(Stream::Tls {
|
||||
tls: Box::new(
|
||||
raw.upgrade(tls_config, !ctx.has_private_peer_addr())
|
||||
.await?,
|
||||
),
|
||||
tls_server_end_point,
|
||||
})
|
||||
}
|
||||
unexpected => {
|
||||
info!(
|
||||
?unexpected,
|
||||
"unexpected startup packet, rejecting connection"
|
||||
);
|
||||
stream
|
||||
.throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User)
|
||||
.await?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_client(
|
||||
ctx: RequestContext,
|
||||
dest_suffix: Arc<String>,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?;
|
||||
|
||||
// Cut off first part of the SNI domain
|
||||
// We receive required destination details in the format of
|
||||
// `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain`
|
||||
let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?;
|
||||
let dest: Vec<&str> = sni
|
||||
.split_once('.')
|
||||
.context("invalid SNI")?
|
||||
.0
|
||||
.splitn(3, "--")
|
||||
.collect();
|
||||
let port = dest[2].parse::<u16>().context("invalid port")?;
|
||||
let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port);
|
||||
|
||||
info!("destination: {}", destination);
|
||||
|
||||
let mut client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
// doesn't yet matter as pg-sni-router doesn't report analytics logs
|
||||
ctx.set_success();
|
||||
ctx.log_connect();
|
||||
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
info!("performing the proxy pass...");
|
||||
|
||||
match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await {
|
||||
Ok(_) => Ok(()),
|
||||
Err(ErrorSource::Client(err)) => Err(err).context("client"),
|
||||
Err(ErrorSource::Compute(err)) => Err(err).context("compute"),
|
||||
}
|
||||
proxy::binary::pg_sni_router::run().await
|
||||
}
|
||||
|
||||
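As the new module doc comment says, pg-sni-router maps `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`: the first SNI label is split on `--` into service, namespace and port, and the configured destination suffix is appended. A small self-contained sketch of that mapping (function name invented; the real logic now lives in `proxy::binary::pg_sni_router`):

```rust
fn destination_from_sni(sni: &str, dest_suffix: &str) -> Option<String> {
    // `{service}--{namespace}--{port}.<external-domain>`
    let (host, _rest) = sni.split_once('.')?;
    let mut parts = host.splitn(3, "--");
    let service = parts.next()?;
    let namespace = parts.next()?;
    let port: u16 = parts.next()?.parse().ok()?;
    Some(format!("{service}.{namespace}.{dest_suffix}:{port}"))
}

#[test]
fn maps_sni_to_destination() {
    assert_eq!(
        destination_from_sni("aaa--bbb--1234.external.domain", "internal.domain").as_deref(),
        Some("aaa.bbb.internal.domain:1234")
    );
}
```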
@@ -1,831 +1,7 @@
use std::net::SocketAddr;
use std::pin::pin;
use std::sync::Arc;
use std::time::Duration;

use anyhow::bail;
use futures::future::Either;
use proxy::auth::backend::jwt::JwkCache;
use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
use proxy::cancellation::{handle_cancel_messages, CancellationHandler};
use proxy::config::{
    self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig,
    ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2,
};
use proxy::context::parquet::ParquetUploadArgs;
use proxy::http::health_server::AppMetrics;
use proxy::metrics::Metrics;
use proxy::rate_limiter::{
    EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter,
};
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::kv_ops::RedisKVClient;
use proxy::redis::{elasticache, notifications};
use proxy::scram::threadpool::ThreadPool;
use proxy::serverless::cancel_set::CancelSet;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::tls::client_config::compute_client_config_with_root_certs;
use proxy::{auth, control_plane, http, serverless, usage_metrics};
use remote_storage::RemoteStorageConfig;
use tokio::net::TcpListener;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{info, warn, Instrument};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version};

project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

use clap::{Parser, ValueEnum};

#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
    #[value(name("cplane-v1"), alias("control-plane"))]
    ControlPlaneV1,

    #[value(name("link"), alias("control-redirect"))]
    ConsoleRedirect,

    #[cfg(feature = "testing")]
    Postgres,
}

/// Neon proxy/router
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct ProxyCliArgs {
    /// Name of the region this proxy is deployed in
    #[clap(long, default_value_t = String::new())]
    region: String,
    /// listen for incoming client connections on ip:port
    #[clap(short, long, default_value = "127.0.0.1:4432")]
    proxy: String,
    #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
    auth_backend: AuthBackendType,
    /// listen for management callback connection on ip:port
    #[clap(short, long, default_value = "127.0.0.1:7000")]
    mgmt: String,
    /// listen for incoming http connections (metrics, etc) on ip:port
    #[clap(long, default_value = "127.0.0.1:7001")]
    http: String,
    /// listen for incoming wss connections on ip:port
    #[clap(long)]
    wss: Option<String>,
    /// redirect unauthenticated users to the given uri in case of console redirect auth
    #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
    uri: String,
    /// cloud API endpoint for authenticating users
    #[clap(
        short,
        long,
        default_value = "http://localhost:3000/authenticate_proxy_request/"
    )]
    auth_endpoint: String,
    /// JWT used to connect to control plane.
    #[clap(
        long,
        value_name = "JWT",
        default_value = "",
        env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
    )]
    control_plane_token: Arc<str>,
    /// if this is not local proxy, this toggles whether we accept jwt or passwords for http
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    is_auth_broker: bool,
    /// path to TLS key for client postgres connections
    ///
    /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
    #[clap(short = 'k', long, alias = "ssl-key")]
    tls_key: Option<String>,
    /// path to TLS cert for client postgres connections
    ///
    /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
    #[clap(short = 'c', long, alias = "ssl-cert")]
    tls_cert: Option<String>,
    /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`.
    #[clap(long, alias = "allow-ssl-keylogfile")]
    allow_tls_keylogfile: bool,
    /// path to directory with TLS certificates for client postgres connections
    #[clap(long)]
    certs_dir: Option<String>,
    /// timeout for the TLS handshake
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    handshake_timeout: tokio::time::Duration,
    /// http endpoint to receive periodic metric updates
    #[clap(long)]
    metric_collection_endpoint: Option<String>,
    /// how often metrics should be sent to a collection endpoint
    #[clap(long)]
    metric_collection_interval: Option<String>,
    /// cache for `wake_compute` api method (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    wake_compute_cache: String,
    /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
    wake_compute_lock: String,
    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
    connect_compute_lock: String,
    #[clap(flatten)]
    sql_over_http: SqlOverHttpArgs,
    /// timeout for scram authentication protocol
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    scram_protocol_timeout: tokio::time::Duration,
    /// size of the threadpool for password hashing
    #[clap(long, default_value_t = 4)]
    scram_thread_pool_size: u8,
    /// Endpoint rate limiter max number of requests per second.
    ///
    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
    /// Can be given multiple times for different bucket sizes.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    endpoint_rps_limit: Vec<RateBucketInfo>,
    /// Wake compute rate limiter max number of requests per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
    wake_compute_limit: Vec<RateBucketInfo>,
    /// Whether the auth rate limiter actually takes effect (for testing)
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    auth_rate_limit_enabled: bool,
    /// Authentication rate limiter max number of hashes per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
    auth_rate_limit: Vec<RateBucketInfo>,
    /// The IP subnet to use when considering whether two IP addresses are the same.
    #[clap(long, default_value_t = 64)]
    auth_rate_limit_ip_subnet: u8,
    /// Redis rate limiter max number of requests per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Cancellation channel size (max queue size for redis kv client)
    #[clap(long, default_value = "1024")]
    cancellation_ch_size: usize,
    /// cache for `allowed_ips` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
    /// cache for `role_secret` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    role_secret_cache: String,
    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
    /// which of the available authentication types to use for the regional redis we have. Supported are "irsa" and "plain".
    #[clap(long, default_value = "irsa")]
    redis_auth_type: String,
    /// redis host for streaming connections (might be different from the notifications host)
    #[clap(long)]
    redis_host: Option<String>,
    /// redis port for streaming connections (might be different from the notifications host)
    #[clap(long)]
    redis_port: Option<u16>,
    /// redis cluster name, used in aws elasticache
    #[clap(long)]
    redis_cluster_name: Option<String>,
    /// redis user_id, used in aws elasticache
    #[clap(long)]
    redis_user_id: Option<String>,
    /// aws region to retrieve credentials
    #[clap(long, default_value_t = String::new())]
    aws_region: String,
    /// cache for `project_info` (use `size=0` to disable)
    #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
    project_info_cache: String,
    /// cache for all valid endpoints
    #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
    endpoint_cache_config: String,
    #[clap(flatten)]
    parquet_upload: ParquetUploadArgs,

    /// interval for backup metric collection
    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
    metric_backup_collection_interval: std::time::Duration,
    /// remote storage configuration for backup metric collection
    /// Encoded as toml (same format as pageservers), eg
    /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
    #[clap(long, value_parser = remote_storage_from_toml)]
    metric_backup_collection_remote_storage: Option<RemoteStorageConfig>,
    /// chunk size for backup metric collection
    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
    #[clap(long, default_value = "4194304")]
    metric_backup_collection_chunk_size: usize,
    /// Whether to retry the connection to the compute node
    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
    connect_to_compute_retry: String,
    /// Whether to retry the wake_compute request
    #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
    wake_compute_retry: String,

    /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    is_private_access_proxy: bool,

    /// Configure whether all incoming requests have a Proxy Protocol V2 packet.
    // TODO(conradludgate): switch default to rejected or required once we've updated all deployments
    #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)]
    proxy_protocol_v2: ProxyProtocolV2,

    /// Time the proxy waits for the webauth session to be confirmed by the control plane.
    // TODO: rename to `console_redirect_confirmation_timeout`.
    #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
    webauth_confirmation_timeout: std::time::Duration,
}

#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
    /// timeout for http connection requests
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    sql_over_http_timeout: tokio::time::Duration,

    /// Whether the SQL over http pool is opt-in
    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    sql_over_http_pool_opt_in: bool,

    /// How many connections to pool for each endpoint. Excess connections are discarded
    #[clap(long, default_value_t = 20)]
    sql_over_http_pool_max_conns_per_endpoint: usize,

    /// How many connections to pool in total across all endpoints. Excess connections are discarded
    #[clap(long, default_value_t = 20000)]
    sql_over_http_pool_max_total_conns: usize,

    /// How long pooled connections should remain idle for before closing
    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
    sql_over_http_idle_timeout: tokio::time::Duration,

    /// Duration each shard will wait on average before a GC sweep.
    /// A longer time will cause sweeps to take longer but will interfere less frequently.
    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
    sql_over_http_pool_gc_epoch: tokio::time::Duration,

    /// How many shards should the global pool have. Must be a power of two.
    /// More shards will introduce less contention for pool operations, but can
    /// increase memory used by the pool
    #[clap(long, default_value_t = 128)]
    sql_over_http_pool_shards: usize,

    #[clap(long, default_value_t = 10000)]
    sql_over_http_client_conn_threshold: u64,

    #[clap(long, default_value_t = 64)]
    sql_over_http_cancel_set_shards: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_request_size_bytes: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_response_size_bytes: usize,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let _logging_guard = proxy::logging::init().await?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    // TODO: refactor these to use labels
    info!("Version: {GIT_VERSION}");
    info!("Build_tag: {BUILD_TAG}");
    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
        revision: GIT_VERSION,
        build_tag: BUILD_TAG,
    });

    let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
        Ok(t) => Some(t),
        Err(e) => {
            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
            None
        }
    };

    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;
    let auth_backend = build_auth_backend(&args)?;

    match auth_backend {
        Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
        Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
    };
    info!("Using region: {}", args.aws_region);

    // TODO: untangle the config args
    let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
        ("plain", redis_url) => match redis_url {
            None => {
                bail!("plain auth requires redis_notifications to be set");
            }
            Some(url) => Some(
                ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()),
            ),
        },
        ("irsa", _) => match (&args.redis_host, args.redis_port) {
            (Some(host), Some(port)) => Some(
                ConnectionWithCredentialsProvider::new_with_credentials_provider(
                    host.to_string(),
                    port,
                    elasticache::CredentialsProvider::new(
                        args.aws_region,
                        args.redis_cluster_name,
                        args.redis_user_id,
                    )
                    .await,
                ),
            ),
            (None, None) => {
                warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client");
                None
            }
            _ => {
                bail!("redis-host and redis-port must be specified together");
            }
        },
        _ => {
            bail!("unknown auth type given");
        }
    };

    let redis_notifications_client = if let Some(url) = args.redis_notifications {
        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()))
    } else {
        regional_redis_client.clone()
    };

    // Check that we can bind to address before further initialization
    let http_address: SocketAddr = args.http.parse()?;
    info!("Starting http on {http_address}");
    let http_listener = TcpListener::bind(http_address).await?.into_std()?;

    let mgmt_address: SocketAddr = args.mgmt.parse()?;
    info!("Starting mgmt on {mgmt_address}");
    let mgmt_listener = TcpListener::bind(mgmt_address).await?;

    let proxy_listener = if !args.is_auth_broker {
        let proxy_address: SocketAddr = args.proxy.parse()?;
        info!("Starting proxy on {proxy_address}");

        Some(TcpListener::bind(proxy_address).await?)
    } else {
        None
    };

    // TODO: rename the argument to something like serverless.
    // It now covers more than just websockets, it also covers SQL over HTTP.
    let serverless_listener = if let Some(serverless_address) = args.wss {
        let serverless_address: SocketAddr = serverless_address.parse()?;
        info!("Starting wss on {serverless_address}");
        Some(TcpListener::bind(serverless_address).await?)
    } else if args.is_auth_broker {
        bail!("wss arg must be present for auth-broker")
    } else {
        None
    };

    let cancellation_token = CancellationToken::new();

    let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
    RateBucketInfo::validate(redis_rps_limit)?;

    let redis_kv_client = regional_redis_client
        .as_ref()
        .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));

    // channel size should be higher than redis client limit to avoid blocking
    let cancel_ch_size = args.cancellation_ch_size;
    let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size);
    let cancellation_handler = Arc::new(CancellationHandler::new(
        &config.connect_to_compute,
        Some(tx_cancel),
    ));

    // bit of a hack - find the min rps and max rps supported and turn it into
    // leaky bucket config instead
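    // (i.e. the smallest configured per-second rate becomes the sustained
    // refill rate `rps`, and the largest becomes the burst capacity `max`)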
    let max = args
        .endpoint_rps_limit
        .iter()
        .map(|x| x.rps())
        .max_by(f64::total_cmp)
        .unwrap_or(EndpointRateLimiter::DEFAULT.max);
    let rps = args
        .endpoint_rps_limit
        .iter()
        .map(|x| x.rps())
        .min_by(f64::total_cmp)
        .unwrap_or(EndpointRateLimiter::DEFAULT.rps);
    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
        LeakyBucketConfig { rps, max },
        64,
    ));

    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
    match auth_backend {
        Either::Left(auth_backend) => {
            if let Some(proxy_listener) = proxy_listener {
                client_tasks.spawn(proxy::proxy::task_main(
                    config,
                    auth_backend,
                    proxy_listener,
                    cancellation_token.clone(),
                    cancellation_handler.clone(),
                    endpoint_rate_limiter.clone(),
                ));
            }

            if let Some(serverless_listener) = serverless_listener {
                client_tasks.spawn(serverless::task_main(
                    config,
                    auth_backend,
                    serverless_listener,
                    cancellation_token.clone(),
                    cancellation_handler.clone(),
                    endpoint_rate_limiter.clone(),
                ));
            }
        }
        Either::Right(auth_backend) => {
            if let Some(proxy_listener) = proxy_listener {
                client_tasks.spawn(proxy::console_redirect_proxy::task_main(
                    config,
                    auth_backend,
                    proxy_listener,
                    cancellation_token.clone(),
                    cancellation_handler.clone(),
                ));
            }
        }
    }

    client_tasks.spawn(proxy::context::parquet::worker(
        cancellation_token.clone(),
        args.parquet_upload,
    ));

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {}));
    maintenance_tasks.spawn(http::health_server::task_main(
        http_listener,
        AppMetrics {
            jemalloc,
            neon_metrics,
            proxy: proxy::metrics::Metrics::get(),
        },
    ));
    maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
        // TODO: Add gc regardless of the metric collection being enabled.
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
    }

    if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
        if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
            match (redis_notifications_client, regional_redis_client.clone()) {
                (None, None) => {}
                (client1, client2) => {
                    let cache = api.caches.project_info.clone();
                    if let Some(client) = client1 {
                        maintenance_tasks.spawn(notifications::task_main(
                            client,
                            cache.clone(),
                            args.region.clone(),
                        ));
                    }
                    if let Some(client) = client2 {
                        maintenance_tasks.spawn(notifications::task_main(
                            client,
                            cache.clone(),
                            args.region.clone(),
                        ));
                    }
                    maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
                }
            }

            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
                });
            }

            if let Some(regional_redis_client) = regional_redis_client {
                let cache = api.caches.endpoints_cache.clone();
                let con = regional_redis_client;
                let span = tracing::info_span!("endpoints_cache");
                maintenance_tasks.spawn(
                    async move { cache.do_read(con, cancellation_token.clone()).await }
                        .instrument(span),
                );
            }
        }
    }

    let maintenance = loop {
        // get one complete task
        match futures::future::select(
            pin!(maintenance_tasks.join_next()),
            pin!(client_tasks.join_next()),
        )
        .await
        {
            // exit immediately on maintenance task completion
            Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?,
            // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
            Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
            // exit immediately on client task error
            Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?,
            // exit if all our client tasks have shutdown gracefully
            Either::Right((None, _)) => return Ok(()),
        }
    };

    // maintenance tasks return Infallible success values, this is an impossible value
    // so this match statically ensures that there are no possibilities for that value
    match maintenance {}
}

/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
    Metrics::install(thread_pool.metrics.clone());

    let tls_config = match (&args.tls_key, &args.tls_cert) {
        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
            key_path,
            cert_path,
            args.certs_dir.as_ref(),
            args.allow_tls_keylogfile,
        )?),
        (None, None) => None,
        _ => bail!("either both or neither tls-key and tls-cert must be specified"),
    };

    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
        interval: args.metric_backup_collection_interval,
        remote_storage_config: args.metric_backup_collection_remote_storage.clone(),
        chunk_size: args.metric_backup_collection_chunk_size,
    };

    let metric_collection = match (
        &args.metric_collection_endpoint,
        &args.metric_collection_interval,
    ) {
        (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
            endpoint: endpoint.parse()?,
            interval: humantime::parse_duration(interval)?,
            backup_metric_collection_config,
        }),
        (None, None) => None,
        _ => bail!(
            "either both or neither metric-collection-endpoint \
            and metric-collection-interval must be specified"
        ),
    };

    let config::ConcurrencyLockOptions {
        shards,
        limiter,
        epoch,
        timeout,
    } = args.connect_compute_lock.parse()?;
    info!(
        ?limiter,
        shards,
        ?epoch,
        "Using NodeLocks (connect_compute)"
    );
    let connect_compute_locks = control_plane::locks::ApiLocks::new(
        "connect_compute_lock",
        limiter,
        shards,
        timeout,
        epoch,
        &Metrics::get().proxy.connect_compute_lock,
    )?;

    let http_config = HttpConfig {
        accept_websockets: !args.is_auth_broker,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
            pool_shards: args.sql_over_http.sql_over_http_pool_shards,
            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
            opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
        },
        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
    };
    let authentication_config = AuthenticationConfig {
        jwks_cache: JwkCache::default(),
        thread_pool,
        scram_protocol_timeout: args.scram_protocol_timeout,
        rate_limiter_enabled: args.auth_rate_limit_enabled,
        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
        ip_allowlist_check_enabled: !args.is_private_access_proxy,
        is_vpc_acccess_proxy: args.is_private_access_proxy,
        is_auth_broker: args.is_auth_broker,
        accept_jwts: args.is_auth_broker,
        console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
    };

    let compute_config = ComputeConfig {
        retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?,
        tls: Arc::new(compute_client_config_with_root_certs()?),
        timeout: Duration::from_secs(2),
    };

    let config = ProxyConfig {
        tls_config,
        metric_collection,
        http_config,
        authentication_config,
        proxy_protocol_v2: args.proxy_protocol_v2,
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
    };

    let config = Box::leak(Box::new(config));

    tokio::spawn(config.connect_compute_locks.garbage_collect_worker());

    Ok(config)
}

/// auth::Backend is created at proxy startup, and lives forever.
fn build_auth_backend(
    args: &ProxyCliArgs,
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
    match &args.auth_backend {
        AuthBackendType::ControlPlaneV1 => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
                args.project_info_cache.parse()?;
            let endpoint_cache_config: config::EndpointCacheConfig =
                args.endpoint_cache_config.parse()?;

            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
            info!(
                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
            );
            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
                wake_compute_cache_config,
                project_info_cache_config,
                endpoint_cache_config,
            )));

            let config::ConcurrencyLockOptions {
                shards,
                limiter,
                epoch,
                timeout,
            } = args.wake_compute_lock.parse()?;
            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
                "wake_compute_lock",
                limiter,
                shards,
                timeout,
                epoch,
                &Metrics::get().wake_compute_lock,
            )?));
            tokio::spawn(locks.garbage_collect_worker());

            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;

            let endpoint = http::Endpoint::new(url, http::new_client());

            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
            let wake_compute_endpoint_rate_limiter =
                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));

            let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
                endpoint,
                args.control_plane_token.clone(),
                caches,
                locks,
                wake_compute_endpoint_rate_limiter,
            );

            let api = control_plane::client::ControlPlaneClient::ProxyV1(api);
            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
            let config = Box::leak(Box::new(auth_backend));

            Ok(Either::Left(config))
        }

        #[cfg(feature = "testing")]
        AuthBackendType::Postgres => {
            let url = args.auth_endpoint.parse()?;
            let api = control_plane::client::mock::MockControlPlane::new(
                url,
                !args.is_private_access_proxy,
            );
            let api = control_plane::client::ControlPlaneClient::PostgresMock(api);

            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());

            let config = Box::leak(Box::new(auth_backend));

            Ok(Either::Left(config))
        }

        AuthBackendType::ConsoleRedirect => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
                args.project_info_cache.parse()?;
            let endpoint_cache_config: config::EndpointCacheConfig =
                args.endpoint_cache_config.parse()?;

            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
            info!(
                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
            );
            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
                wake_compute_cache_config,
                project_info_cache_config,
                endpoint_cache_config,
            )));

            let config::ConcurrencyLockOptions {
                shards,
                limiter,
                epoch,
                timeout,
            } = args.wake_compute_lock.parse()?;
            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
                "wake_compute_lock",
                limiter,
                shards,
                timeout,
                epoch,
                &Metrics::get().wake_compute_lock,
            )?));

            let url = args.uri.clone().parse()?;
            let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
            let endpoint = http::Endpoint::new(ep_url, http::new_client());
            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
            let wake_compute_endpoint_rate_limiter =
                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));

            // Since we only use get_allowed_ips_and_secret(), the wake_compute_endpoint_rate_limiter
            // and locks are not actually used by ConsoleRedirectBackend,
            // but they are required to construct the NeonControlPlaneClient.
            let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
                endpoint,
                args.control_plane_token.clone(),
                caches,
                locks,
                wake_compute_endpoint_rate_limiter,
            );

            let backend = ConsoleRedirectBackend::new(url, api);
            let config = Box::leak(Box::new(backend));

            Ok(Either::Right(config))
        }
    }
}

#[cfg(test)]
mod tests {
    use std::time::Duration;

    use clap::Parser;
    use proxy::rate_limiter::RateBucketInfo;

    #[test]
    fn parse_endpoint_rps_limit() {
        let config = super::ProxyCliArgs::parse_from([
            "proxy",
            "--endpoint-rps-limit",
            "100@1s",
            "--endpoint-rps-limit",
            "20@30s",
        ]);

        assert_eq!(
            config.endpoint_rps_limit,
            vec![
                RateBucketInfo::new(100, Duration::from_secs(1)),
                RateBucketInfo::new(20, Duration::from_secs(30)),
            ]
        );
    }
    proxy::binary::proxy::run().await
}
410  proxy/src/binary/local_proxy.rs  Normal file
@@ -0,0 +1,410 @@
use std::net::SocketAddr;
use std::pin::pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;

use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP};
use crate::auth::{self};
use crate::cancellation::CancellationHandler;
use crate::config::{
    self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
use crate::http::health_server::AppMetrics;
use crate::intern::RoleNameInt;
use crate::metrics::{Metrics, ThreadPoolMetrics};
use crate::rate_limiter::{
    BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo,
};
use crate::scram::threadpool::ThreadPool;
use crate::serverless::cancel_set::CancelSet;
use crate::serverless::{self, GlobalConnPoolOptions};
use crate::tls::client_config::compute_client_config_with_root_certs;
use crate::types::RoleName;
use crate::url::ApiUrl;
use anyhow::{bail, ensure, Context};
use camino::{Utf8Path, Utf8PathBuf};
use compute_api::spec::LocalProxySpec;
use futures::future::Either;

project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

use clap::Parser;
use thiserror::Error;
use tokio::net::TcpListener;
use tokio::sync::Notify;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use utils::sentry_init::init_sentry;
use utils::{pid_file, project_build_tag, project_git_version};

/// Neon proxy/router
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct LocalProxyCliArgs {
    /// listen for incoming metrics connections on ip:port
    #[clap(long, default_value = "127.0.0.1:7001")]
    metrics: String,
    /// listen for incoming http connections on ip:port
    #[clap(long)]
    http: String,
    /// timeout for the TLS handshake
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    handshake_timeout: tokio::time::Duration,
    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
    connect_compute_lock: String,
    #[clap(flatten)]
    sql_over_http: SqlOverHttpArgs,
    /// User rate limiter max number of requests per second.
    ///
    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
    /// Can be given multiple times for different bucket sizes.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    user_rps_limit: Vec<RateBucketInfo>,
    /// Whether the auth rate limiter actually takes effect (for testing)
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    auth_rate_limit_enabled: bool,
    /// Authentication rate limiter max number of hashes per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
    auth_rate_limit: Vec<RateBucketInfo>,
    /// The IP subnet to use when considering whether two IP addresses are the same.
    #[clap(long, default_value_t = 64)]
    auth_rate_limit_ip_subnet: u8,
    /// Whether to retry the connection to the compute node
    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
    connect_to_compute_retry: String,
    /// Address of the postgres server
    #[clap(long, default_value = "127.0.0.1:5432")]
    postgres: SocketAddr,
    /// Address of the internal compute-ctl api service
    #[clap(long, default_value = "http://127.0.0.1:3081/")]
    compute_ctl: ApiUrl,
    /// Path of the local proxy config file
    #[clap(long, default_value = "./local_proxy.json")]
    config_path: Utf8PathBuf,
    /// Path of the local proxy PID file
    #[clap(long, default_value = "./local_proxy.pid")]
    pid_path: Utf8PathBuf,
}

#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
    /// How many connections to pool for each endpoint. Excess connections are discarded
    #[clap(long, default_value_t = 200)]
    sql_over_http_pool_max_total_conns: usize,

    /// How long pooled connections should remain idle for before closing
    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
    sql_over_http_idle_timeout: tokio::time::Duration,

    #[clap(long, default_value_t = 100)]
    sql_over_http_client_conn_threshold: u64,

    #[clap(long, default_value_t = 16)]
    sql_over_http_cancel_set_shards: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_request_size_bytes: usize,

    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
    sql_over_http_max_response_size_bytes: usize,
}

pub async fn run() -> anyhow::Result<()> {
    let _logging_guard = crate::logging::init_local_proxy()?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));

    // TODO: refactor these to use labels
    debug!("Version: {GIT_VERSION}");
    debug!("Build_tag: {BUILD_TAG}");
    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
        revision: GIT_VERSION,
        build_tag: BUILD_TAG,
    });

    let jemalloc = match crate::jemalloc::MetricRecorder::new() {
        Ok(t) => Some(t),
        Err(e) => {
            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
            None
        }
    };

    let args = LocalProxyCliArgs::parse();
    let config = build_config(&args)?;
    let auth_backend = build_auth_backend(&args);

    // before we bind to any ports, write the process ID to a file
    // so that compute-ctl can find our process later
    // in order to trigger the appropriate SIGHUP on config change.
    //
    // This also claims a "lock" that makes sure only one instance
    // of local_proxy runs at a time.
    let _process_guard = loop {
        match pid_file::claim_for_current_process(&args.pid_path) {
            Ok(guard) => break guard,
            Err(e) => {
                // compute-ctl might have tried to read the pid-file to let us
                // know about some config change. We should try again.
                error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
    };

    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
    let http_listener = TcpListener::bind(args.http).await?;
    let shutdown = CancellationToken::new();

    // todo: should scale with CU
    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
        LeakyBucketConfig {
            rps: 10.0,
            max: 100.0,
        },
        16,
    ));

    let mut maintenance_tasks = JoinSet::new();

    let refresh_config_notify = Arc::new(Notify::new());
    maintenance_tasks.spawn(crate::signals::handle(shutdown.clone(), {
        let refresh_config_notify = Arc::clone(&refresh_config_notify);
        move || {
            refresh_config_notify.notify_one();
        }
    }));

    // trigger the first config load **after** setting up the signal hook
    // to avoid the race condition where:
    // 1. No config file registered when local_proxy starts up
    // 2. The config file is written but the signal hook is not yet received
    // 3. local_proxy completes startup but has no config loaded, despite there being a registered config.
    refresh_config_notify.notify_one();
    tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify));

    maintenance_tasks.spawn(crate::http::health_server::task_main(
        metrics_listener,
        AppMetrics {
            jemalloc,
            neon_metrics,
            proxy: crate::metrics::Metrics::get(),
        },
    ));

    let task = serverless::task_main(
        config,
        auth_backend,
        http_listener,
        shutdown.clone(),
        Arc::new(CancellationHandler::new(&config.connect_to_compute, None)),
        endpoint_rate_limiter,
    );

    match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
        // exit immediately on maintenance task completion
        Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {},
        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
        // exit immediately on client task error
        Either::Right((res, _)) => res?,
    }

    Ok(())
}

/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let config::ConcurrencyLockOptions {
        shards,
        limiter,
        epoch,
        timeout,
    } = args.connect_compute_lock.parse()?;
    info!(
        ?limiter,
        shards,
        ?epoch,
        "Using NodeLocks (connect_compute)"
    );
    let connect_compute_locks = ApiLocks::new(
        "connect_compute_lock",
        limiter,
        shards,
        timeout,
        epoch,
        &Metrics::get().proxy.connect_compute_lock,
    );

    let http_config = HttpConfig {
        accept_websockets: false,
        pool_options: GlobalConnPoolOptions {
            gc_epoch: Duration::from_secs(60),
            pool_shards: 2,
            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
            opt_in: false,

            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
        },
        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
    };

    let compute_config = ComputeConfig {
        retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?,
        tls: Arc::new(compute_client_config_with_root_certs()?),
        timeout: Duration::from_secs(2),
    };

    Ok(Box::leak(Box::new(ProxyConfig {
        tls_config: None,
        metric_collection: None,
        http_config,
        authentication_config: AuthenticationConfig {
            jwks_cache: JwkCache::default(),
            thread_pool: ThreadPool::new(0),
            scram_protocol_timeout: Duration::from_secs(10),
            rate_limiter_enabled: false,
            rate_limiter: BucketRateLimiter::new(vec![]),
            rate_limit_ip_subnet: 64,
            ip_allowlist_check_enabled: true,
            is_vpc_acccess_proxy: false,
            is_auth_broker: false,
            accept_jwts: true,
            console_redirect_confirmation_timeout: Duration::ZERO,
        },
        proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
        handshake_timeout: Duration::from_secs(10),
        region: "local".into(),
        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
    })))
}

/// auth::Backend is created at proxy startup, and lives forever.
fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'static, ()> {
    let auth_backend = crate::auth::Backend::Local(crate::auth::backend::MaybeOwned::Owned(
        LocalBackend::new(args.postgres, args.compute_ctl.clone()),
    ));

    Box::leak(Box::new(auth_backend))
}

#[derive(Error, Debug)]
enum RefreshConfigError {
    #[error(transparent)]
    Read(#[from] std::io::Error),
    #[error(transparent)]
    Parse(#[from] serde_json::Error),
    #[error(transparent)]
    Validate(anyhow::Error),
}

async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
    let mut init = true;
    loop {
        rx.notified().await;

        match refresh_config_inner(&path).await {
            Ok(()) => {}
            // don't log file-not-found errors if this is the first time we are checking:
            // for computes that don't use local_proxy, this is not an error.
            Err(RefreshConfigError::Read(e))
                if init && e.kind() == std::io::ErrorKind::NotFound =>
            {
                debug!(error=?e, ?path, "could not read config file");
            }
            Err(e) => {
                error!(error=?e, ?path, "could not read config file");
            }
        }

        init = false;
    }
}

async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> {
    let bytes = tokio::fs::read(&path).await?;
    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;

    let mut jwks_set = vec![];

    fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;

        ensure!(
            jwks_url.has_authority()
                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
            "Invalid JWKS url. Must be HTTP",
        );

        ensure!(
            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
            "Invalid JWKS url. No domain listed",
        );

        // clear username, password and ports
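        // For example (hypothetical URL): "https://user:secret@auth.example.com:8443/keys?tenant=1#frag"
        // keeps only its scheme, host and path; credentials, query and fragment are dropped below,
        // and a custom port or plain-http scheme is only tolerated under the "testing" feature.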
        jwks_url
            .set_username("")
            .expect("url can be a base and has a valid host and is not a file. should not error");
        jwks_url
            .set_password(None)
            .expect("url can be a base and has a valid host and is not a file. should not error");
        // local testing is hard if we need to have a specific restricted port
        if cfg!(not(feature = "testing")) {
            jwks_url.set_port(None).expect(
                "url can be a base and has a valid host and is not a file. should not error",
            );
        }

        // clear query params
        jwks_url.set_fragment(None);
        jwks_url.query_pairs_mut().clear().finish();

        if jwks_url.scheme() != "https" {
            // local testing is hard if we need to set up https support.
            if cfg!(not(feature = "testing")) {
                jwks_url
                    .set_scheme("https")
                    .expect("should not error to set the scheme to https if it was http");
            } else {
                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
            }
        }

        Ok(JwksSettings {
            id: jwks.id,
            jwks_url,
            _provider_name: jwks.provider_name,
            jwt_audience: jwks.jwt_audience,
            role_names: jwks
                .role_names
                .into_iter()
                .map(RoleName::from)
                .map(|s| RoleNameInt::from(&s))
                .collect(),
        })
    }

    for jwks in data.jwks.into_iter().flatten() {
        jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
    }

    info!("successfully loaded new config");
    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));

    Ok(())
}
Some files were not shown because too many files have changed in this diff.