Compare commits

..

1 Commits

Author SHA1 Message Date
Cihan Demirci
f39fefd8ae dnm: test ci 2024-08-02 10:13:36 +03:00
212 changed files with 2869 additions and 6689 deletions

View File

@@ -8,9 +8,6 @@ self-hosted-runner:
- small-arm64 - small-arm64
- us-east-2 - us-east-2
config-variables: config-variables:
- BENCHMARK_PROJECT_ID_PUB
- BENCHMARK_PROJECT_ID_SUB
- REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION - REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN

View File

@@ -56,10 +56,6 @@ concurrency:
jobs: jobs:
bench: bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -67,13 +63,9 @@ jobs:
- DEFAULT_PG_VERSION: 16 - DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging" PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- DEFAULT_PG_VERSION: 16 - DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging" PLATFORM: "azure-staging"
region_id: 'azure-eastus2' region_id: 'azure-eastus2'
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env: env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100" TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -84,21 +76,14 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }} PLATFORM: ${{ matrix.PLATFORM }}
runs-on: ${{ matrix.RUNNER }} runs-on: [ self-hosted, us-east-2, x64 ]
container: container:
image: ${{ matrix.IMAGE }} image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init options: --init
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Configure AWS credentials # necessary on Azure runners
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact - name: Download Neon artifact
uses: ./.github/actions/download uses: ./.github/actions/download
with: with:
@@ -162,7 +147,7 @@ jobs:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env: env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16 DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -176,7 +161,6 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Download Neon artifact - name: Download Neon artifact
uses: ./.github/actions/download uses: ./.github/actions/download
with: with:
@@ -184,7 +168,7 @@ jobs:
path: /tmp/neon/ path: /tmp/neon/
prefix: latest prefix: latest
- name: Run Logical Replication benchmarks - name: Run benchmark
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
with: with:
build_type: ${{ env.BUILD_TYPE }} build_type: ${{ env.BUILD_TYPE }}
@@ -192,15 +176,12 @@ jobs:
run_in_parallel: false run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }} save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400 extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env: env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
- name: Run Physical Replication benchmarks - name: Run benchmark
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
with: with:
build_type: ${{ env.BUILD_TYPE }} build_type: ${{ env.BUILD_TYPE }}
@@ -253,9 +234,6 @@ jobs:
id: pgbench-compare-matrix id: pgbench-compare-matrix
run: | run: |
region_id_default=${{ env.DEFAULT_REGION_ID }} region_id_default=${{ env.DEFAULT_REGION_ID }}
runner_default='["self-hosted", "us-east-2", "x64"]'
runner_azure='["self-hosted", "eastus2", "x64"]'
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
matrix='{ matrix='{
"pg_version" : [ "pg_version" : [
16 16
@@ -269,19 +247,16 @@ jobs:
"neonvm-captest-new" "neonvm-captest-new"
], ],
"db_size": [ "10gb" ], "db_size": [ "10gb" ],
"runner": ['"$runner_default"'], "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
"image": [ "'"$image_default"'" ], { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
}' }'
if [ "$(date +%A)" = "Saturday" ]; then if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -324,10 +299,6 @@ jobs:
pgbench-compare: pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ] needs: [ generate-matrices ]
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy: strategy:
fail-fast: false fail-fast: false
@@ -343,9 +314,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }} PLATFORM: ${{ matrix.platform }}
runs-on: ${{ matrix.runner }} runs-on: [ self-hosted, us-east-2, x64 ]
container: container:
image: ${{ matrix.image }} image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init options: --init
# Increase timeout to 8h, default timeout is 6h # Increase timeout to 8h, default timeout is 6h
@@ -354,13 +325,6 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Configure AWS credentials # necessary on Azure runners
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact - name: Download Neon artifact
uses: ./.github/actions/download uses: ./.github/actions/download
with: with:
@@ -468,20 +432,12 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector: pgbench-pgvector:
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
include: include:
- PLATFORM: "neonvm-captest-pgvector" - PLATFORM: "neonvm-captest-pgvector"
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- PLATFORM: "azure-captest-pgvector" - PLATFORM: "azure-captest-pgvector"
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env: env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -494,9 +450,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }} PLATFORM: ${{ matrix.PLATFORM }}
runs-on: ${{ matrix.RUNNER }} runs-on: [ self-hosted, us-east-2, x64 ]
container: container:
image: ${{ matrix.IMAGE }} image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init options: --init
steps: steps:
@@ -507,12 +463,12 @@ jobs:
- name: Install postgresql-16 where pytest expects it - name: Install postgresql-16 where pytest expects it
run: | run: |
cd /home/nonroot cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
@@ -538,13 +494,6 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Benchmark pgvector hnsw indexing - name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
with: with:

View File

@@ -149,6 +149,8 @@ jobs:
env: env:
BUILD_TYPE: release BUILD_TYPE: release
# remove the cachepot wrapper and build without crate caches
RUSTC_WRAPPER: ""
# build with incremental compilation produce partial results # build with incremental compilation produce partial results
# so do not attempt to cache this build, also disable the incremental compilation # so do not attempt to cache this build, also disable the incremental compilation
CARGO_INCREMENTAL: 0 CARGO_INCREMENTAL: 0

View File

@@ -66,31 +66,7 @@ jobs:
ports: ports:
- 9000:9000 - 9000:9000
- 8123:8123 - 8123:8123
zookeeper:
image: quay.io/debezium/zookeeper:2.7
ports:
- 2181:2181
kafka:
image: quay.io/debezium/kafka:2.7
env:
ZOOKEEPER_CONNECT: "zookeeper:2181"
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
KAFKA_BROKER_ID: 1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9991
ports:
- 9092:9092
debezium:
image: quay.io/debezium/connect:2.7
env:
BOOTSTRAP_SERVERS: kafka:9092
GROUP_ID: 1
CONFIG_STORAGE_TOPIC: debezium-config
OFFSET_STORAGE_TOPIC: debezium-offset
STATUS_STORAGE_TOPIC: debezium-status
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
ports:
- 8083:8083
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4

View File

@@ -7,20 +7,12 @@ on:
description: 'Source tag' description: 'Source tag'
required: true required: true
type: string type: string
force:
description: 'Force the image to be pinned'
default: false
type: boolean
workflow_call: workflow_call:
inputs: inputs:
from-tag: from-tag:
description: 'Source tag' description: 'Source tag'
required: true required: true
type: string type: string
force:
description: 'Force the image to be pinned'
default: false
type: boolean
defaults: defaults:
run: run:
@@ -30,19 +22,16 @@ concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }} group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {} permissions: {}
jobs:
tag-image:
runs-on: ubuntu-22.04
env: env:
FROM_TAG: ${{ inputs.from-tag }} FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned TO_TAG: pinned
jobs:
check-manifests:
runs-on: ubuntu-22.04
outputs:
skip: ${{ steps.check-manifests.outputs.skip }}
steps: steps:
- name: Check if we really need to pin the image - name: Check if we really need to pin the image
id: check-manifests id: check-manifests
@@ -58,44 +47,27 @@ jobs:
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
tag-image:
needs: check-manifests
# use format(..) to catch both inputs.force = true AND inputs.force = 'true'
if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
runs-on: ubuntu-22.04
permissions:
id-token: write # for `azure/login`
steps:
- uses: docker/login-action@v3 - uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with: with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
- uses: docker/login-action@v3 - uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with: with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }} username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Azure login - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 if: steps.check-manifests.outputs.skip == 'false'
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
- name: Login to ACR
run: |
az acr login --name=neoneastus2
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
run: | run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG} neondatabase/build-tools:${FROM_TAG}

View File

@@ -13,6 +13,8 @@ defaults:
env: env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs: jobs:
cancel-previous-e2e-tests: cancel-previous-e2e-tests:
@@ -62,35 +64,19 @@ jobs:
needs: [ tag ] needs: [ tag ]
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
env: env:
EVENT_ACTION: ${{ github.event.action }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }} TAG: ${{ needs.tag.outputs.build-tag }}
steps: steps:
- name: Wait for `promote-images` job to finish - name: check if ecr image are present
# It's important to have a timeout here, the script in the step can run infinitely env:
timeout-minutes: 60 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: | run: |
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
exit 0 OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
fi if [ "$OUTPUT" == "" ]; then
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
exit 1 exit 1
;; fi
*)
echo "The 'promote-images' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
done done
- name: Set e2e-platforms - name: Set e2e-platforms

372
Cargo.lock generated
View File

@@ -484,7 +484,7 @@ dependencies = [
"http 0.2.9", "http 0.2.9",
"http 1.1.0", "http 1.1.0",
"once_cell", "once_cell",
"p256 0.11.1", "p256",
"percent-encoding", "percent-encoding",
"ring 0.17.6", "ring 0.17.6",
"sha2", "sha2",
@@ -848,12 +848,6 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
[[package]]
name = "base16ct"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.13.1" version = "0.13.1"
@@ -977,9 +971,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]] [[package]]
name = "bytemuck" name = "bytemuck"
version = "1.16.3" version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
@@ -1532,10 +1526,8 @@ version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
dependencies = [ dependencies = [
"generic-array",
"rand_core 0.6.4", "rand_core 0.6.4",
"subtle", "subtle",
"zeroize",
] ]
[[package]] [[package]]
@@ -1629,7 +1621,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
dependencies = [ dependencies = [
"const-oid", "const-oid",
"pem-rfc7468",
"zeroize", "zeroize",
] ]
@@ -1729,7 +1720,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [ dependencies = [
"block-buffer", "block-buffer",
"const-oid",
"crypto-common", "crypto-common",
"subtle", "subtle",
] ]
@@ -1781,25 +1771,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
dependencies = [ dependencies = [
"der 0.6.1", "der 0.6.1",
"elliptic-curve 0.12.3", "elliptic-curve",
"rfc6979 0.3.1", "rfc6979",
"signature 1.6.4", "signature 1.6.4",
] ]
[[package]]
name = "ecdsa"
version = "0.16.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
dependencies = [
"der 0.7.8",
"digest",
"elliptic-curve 0.13.8",
"rfc6979 0.4.0",
"signature 2.2.0",
"spki 0.7.3",
]
[[package]] [[package]]
name = "either" name = "either"
version = "1.8.1" version = "1.8.1"
@@ -1812,36 +1788,16 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
dependencies = [ dependencies = [
"base16ct 0.1.1", "base16ct",
"crypto-bigint 0.4.9", "crypto-bigint 0.4.9",
"der 0.6.1", "der 0.6.1",
"digest", "digest",
"ff 0.12.1", "ff",
"generic-array", "generic-array",
"group 0.12.1", "group",
"pkcs8 0.9.0", "pkcs8",
"rand_core 0.6.4", "rand_core 0.6.4",
"sec1 0.3.0", "sec1",
"subtle",
"zeroize",
]
[[package]]
name = "elliptic-curve"
version = "0.13.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
dependencies = [
"base16ct 0.2.0",
"crypto-bigint 0.5.5",
"digest",
"ff 0.13.0",
"generic-array",
"group 0.13.0",
"pem-rfc7468",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"sec1 0.7.3",
"subtle", "subtle",
"zeroize", "zeroize",
] ]
@@ -1995,16 +1951,6 @@ dependencies = [
"subtle", "subtle",
] ]
[[package]]
name = "ff"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449"
dependencies = [
"rand_core 0.6.4",
"subtle",
]
[[package]] [[package]]
name = "filetime" name = "filetime"
version = "0.2.22" version = "0.2.22"
@@ -2202,7 +2148,6 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [ dependencies = [
"typenum", "typenum",
"version_check", "version_check",
"zeroize",
] ]
[[package]] [[package]]
@@ -2269,18 +2214,7 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
dependencies = [ dependencies = [
"ff 0.12.1", "ff",
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "group"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
dependencies = [
"ff 0.13.0",
"rand_core 0.6.4", "rand_core 0.6.4",
"subtle", "subtle",
] ]
@@ -2842,42 +2776,6 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "jose-b64"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56"
dependencies = [
"base64ct",
"serde",
"subtle",
"zeroize",
]
[[package]]
name = "jose-jwa"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7"
dependencies = [
"serde",
]
[[package]]
name = "jose-jwk"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7"
dependencies = [
"jose-b64",
"jose-jwa",
"p256 0.13.2",
"p384",
"rsa",
"serde",
"zeroize",
]
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.69" version = "0.3.69"
@@ -2937,9 +2835,6 @@ name = "lazy_static"
version = "1.4.0" version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
dependencies = [
"spin 0.5.2",
]
[[package]] [[package]]
name = "lazycell" name = "lazycell"
@@ -3309,23 +3204,6 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "num-bigint-dig"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
dependencies = [
"byteorder",
"lazy_static",
"libm",
"num-integer",
"num-iter",
"num-traits",
"rand 0.8.5",
"smallvec",
"zeroize",
]
[[package]] [[package]]
name = "num-complex" name = "num-complex"
version = "0.4.4" version = "0.4.4"
@@ -3603,33 +3481,11 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
dependencies = [ dependencies = [
"ecdsa 0.14.8", "ecdsa",
"elliptic-curve 0.12.3", "elliptic-curve",
"sha2", "sha2",
] ]
[[package]]
name = "p256"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
dependencies = [
"ecdsa 0.16.9",
"elliptic-curve 0.13.8",
"primeorder",
"sha2",
]
[[package]]
name = "p384"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209"
dependencies = [
"elliptic-curve 0.13.8",
"primeorder",
]
[[package]] [[package]]
name = "pagebench" name = "pagebench"
version = "0.1.0" version = "0.1.0"
@@ -3991,15 +3847,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "pem-rfc7468"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
dependencies = [
"base64ct",
]
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.2.0" version = "2.2.0"
@@ -4016,29 +3863,6 @@ dependencies = [
"indexmap 1.9.3", "indexmap 1.9.3",
] ]
[[package]]
name = "pg_sni_router"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"git-version",
"itertools 0.10.5",
"pq_proto",
"proxy-core",
"proxy-sasl",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]] [[package]]
name = "phf" name = "phf"
version = "0.11.1" version = "0.11.1"
@@ -4089,17 +3913,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkcs1"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
dependencies = [
"der 0.7.8",
"pkcs8 0.10.2",
"spki 0.7.3",
]
[[package]] [[package]]
name = "pkcs8" name = "pkcs8"
version = "0.9.0" version = "0.9.0"
@@ -4110,16 +3923,6 @@ dependencies = [
"spki 0.6.0", "spki 0.6.0",
] ]
[[package]]
name = "pkcs8"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"spki 0.7.3",
]
[[package]] [[package]]
name = "pkg-config" name = "pkg-config"
version = "0.3.27" version = "0.3.27"
@@ -4157,7 +3960,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres" name = "postgres"
version = "0.19.4" version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
@@ -4170,7 +3973,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-protocol" name = "postgres-protocol"
version = "0.6.4" version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
dependencies = [ dependencies = [
"base64 0.20.0", "base64 0.20.0",
"byteorder", "byteorder",
@@ -4189,7 +3992,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-types" name = "postgres-types"
version = "0.2.4" version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
@@ -4313,15 +4116,6 @@ dependencies = [
"syn 2.0.52", "syn 2.0.52",
] ]
[[package]]
name = "primeorder"
version = "0.13.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
dependencies = [
"elliptic-curve 0.13.8",
]
[[package]] [[package]]
name = "proc-macro-hack" name = "proc-macro-hack"
version = "0.5.20+deprecated" version = "0.5.20+deprecated"
@@ -4436,38 +4230,9 @@ dependencies = [
[[package]] [[package]]
name = "proxy" name = "proxy"
version = "0.1.0" version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"clap",
"futures",
"git-version",
"humantime",
"itertools 0.10.5",
"metrics",
"pq_proto",
"proxy-core",
"proxy-sasl",
"remote_storage",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tikv-jemallocator",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "proxy-core"
version = "0.1.0"
dependencies = [ dependencies = [
"ahash", "ahash",
"anyhow", "anyhow",
"arc-swap",
"async-compression", "async-compression",
"async-trait", "async-trait",
"atomic-take", "atomic-take",
@@ -4485,11 +4250,11 @@ dependencies = [
"consumption_metrics", "consumption_metrics",
"crossbeam-deque", "crossbeam-deque",
"dashmap", "dashmap",
"ecdsa 0.16.9",
"env_logger", "env_logger",
"fallible-iterator", "fallible-iterator",
"framed-websockets", "framed-websockets",
"futures", "futures",
"git-version",
"hashbrown 0.14.5", "hashbrown 0.14.5",
"hashlink", "hashlink",
"hex", "hex",
@@ -4505,14 +4270,12 @@ dependencies = [
"indexmap 2.0.1", "indexmap 2.0.1",
"ipnet", "ipnet",
"itertools 0.10.5", "itertools 0.10.5",
"jose-jwa",
"jose-jwk",
"lasso", "lasso",
"md5", "md5",
"measured", "measured",
"metrics", "metrics",
"once_cell", "once_cell",
"p256 0.13.2", "opentelemetry",
"parking_lot 0.12.1", "parking_lot 0.12.1",
"parquet", "parquet",
"parquet_derive", "parquet_derive",
@@ -4521,7 +4284,7 @@ dependencies = [
"postgres-protocol", "postgres-protocol",
"postgres_backend", "postgres_backend",
"pq_proto", "pq_proto",
"proxy-sasl", "prometheus",
"rand 0.8.5", "rand 0.8.5",
"rand_distr", "rand_distr",
"rcgen", "rcgen",
@@ -4533,7 +4296,6 @@ dependencies = [
"reqwest-retry", "reqwest-retry",
"reqwest-tracing", "reqwest-tracing",
"routerify", "routerify",
"rsa",
"rstest", "rstest",
"rustc-hash", "rustc-hash",
"rustls 0.22.4", "rustls 0.22.4",
@@ -4543,7 +4305,6 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
"signature 2.2.0",
"smallvec", "smallvec",
"smol_str", "smol_str",
"socket2 0.5.5", "socket2 0.5.5",
@@ -4551,6 +4312,7 @@ dependencies = [
"task-local-extensions", "task-local-extensions",
"thiserror", "thiserror",
"tikv-jemalloc-ctl", "tikv-jemalloc-ctl",
"tikv-jemallocator",
"tokio", "tokio",
"tokio-postgres", "tokio-postgres",
"tokio-postgres-rustls", "tokio-postgres-rustls",
@@ -4562,7 +4324,6 @@ dependencies = [
"tracing-opentelemetry", "tracing-opentelemetry",
"tracing-subscriber", "tracing-subscriber",
"tracing-utils", "tracing-utils",
"try-lock",
"typed-json", "typed-json",
"url", "url",
"urlencoding", "urlencoding",
@@ -4573,35 +4334,6 @@ dependencies = [
"x509-parser", "x509-parser",
] ]
[[package]]
name = "proxy-sasl"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"bytes",
"crossbeam-deque",
"hmac",
"itertools 0.10.5",
"lasso",
"measured",
"parking_lot 0.12.1",
"pbkdf2",
"postgres-protocol",
"pq_proto",
"rand 0.8.5",
"rustls 0.22.4",
"sha2",
"subtle",
"thiserror",
"tokio",
"tracing",
"uuid",
"workspace_hack",
"x509-parser",
]
[[package]] [[package]]
name = "quick-xml" name = "quick-xml"
version = "0.31.0" version = "0.31.0"
@@ -5074,16 +4806,6 @@ dependencies = [
"zeroize", "zeroize",
] ]
[[package]]
name = "rfc6979"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
dependencies = [
"hmac",
"subtle",
]
[[package]] [[package]]
name = "ring" name = "ring"
version = "0.16.20" version = "0.16.20"
@@ -5144,26 +4866,6 @@ dependencies = [
"archery", "archery",
] ]
[[package]]
name = "rsa"
version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
dependencies = [
"const-oid",
"digest",
"num-bigint-dig",
"num-integer",
"num-traits",
"pkcs1",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"signature 2.2.0",
"spki 0.7.3",
"subtle",
"zeroize",
]
[[package]] [[package]]
name = "rstest" name = "rstest"
version = "0.18.2" version = "0.18.2"
@@ -5492,24 +5194,10 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
dependencies = [ dependencies = [
"base16ct 0.1.1", "base16ct",
"der 0.6.1", "der 0.6.1",
"generic-array", "generic-array",
"pkcs8 0.9.0", "pkcs8",
"subtle",
"zeroize",
]
[[package]]
name = "sec1"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
dependencies = [
"base16ct 0.2.0",
"der 0.7.8",
"generic-array",
"pkcs8 0.10.2",
"subtle", "subtle",
"zeroize", "zeroize",
] ]
@@ -5856,7 +5544,6 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [ dependencies = [
"digest",
"rand_core 0.6.4", "rand_core 0.6.4",
] ]
@@ -6016,7 +5703,6 @@ dependencies = [
"pageserver_client", "pageserver_client",
"postgres_connection", "postgres_connection",
"r2d2", "r2d2",
"rand 0.8.5",
"reqwest 0.12.4", "reqwest 0.12.4",
"routerify", "routerify",
"scopeguard", "scopeguard",
@@ -6499,7 +6185,7 @@ dependencies = [
[[package]] [[package]]
name = "tokio-postgres" name = "tokio-postgres"
version = "0.7.7" version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"byteorder", "byteorder",
@@ -6876,9 +6562,9 @@ dependencies = [
[[package]] [[package]]
name = "try-lock" name = "try-lock"
version = "0.2.5" version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
[[package]] [[package]]
name = "tungstenite" name = "tungstenite"
@@ -7691,17 +7377,13 @@ dependencies = [
"clap", "clap",
"clap_builder", "clap_builder",
"crossbeam-utils", "crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged", "deranged",
"digest",
"either", "either",
"fail", "fail",
"futures-channel", "futures-channel",
"futures-executor", "futures-executor",
"futures-io", "futures-io",
"futures-util", "futures-util",
"generic-array",
"getrandom 0.2.11", "getrandom 0.2.11",
"hashbrown 0.14.5", "hashbrown 0.14.5",
"hex", "hex",
@@ -7709,7 +7391,6 @@ dependencies = [
"hyper 0.14.26", "hyper 0.14.26",
"indexmap 1.9.3", "indexmap 1.9.3",
"itertools 0.10.5", "itertools 0.10.5",
"lazy_static",
"libc", "libc",
"log", "log",
"memchr", "memchr",
@@ -7733,9 +7414,7 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
"signature 2.2.0",
"smallvec", "smallvec",
"spki 0.7.3",
"subtle", "subtle",
"syn 1.0.109", "syn 1.0.109",
"syn 2.0.52", "syn 2.0.52",
@@ -7846,7 +7525,6 @@ version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
dependencies = [ dependencies = [
"serde",
"zeroize_derive", "zeroize_derive",
] ]

View File

@@ -9,10 +9,7 @@ members = [
"pageserver/ctl", "pageserver/ctl",
"pageserver/client", "pageserver/client",
"pageserver/pagebench", "pageserver/pagebench",
"proxy/core", "proxy",
"proxy/sasl",
"proxy/proxy",
"proxy/pg_sni_router",
"safekeeper", "safekeeper",
"storage_broker", "storage_broker",
"storage_controller", "storage_controller",
@@ -187,7 +184,6 @@ tracing = "0.1"
tracing-error = "0.2.0" tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0" tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false } twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1" typed-json = "0.1"
url = "2.2" url = "2.2"

View File

@@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
ENV BUILD_TYPE=release ENV BUILD_TYPE release
RUN set -e \ RUN set -e \
&& mold -run make -j $(nproc) -s neon-pg-ext \ && mold -run make -j $(nproc) -s neon-pg-ext \
&& rm -rf pg_install/build \ && rm -rf pg_install/build \
@@ -29,12 +29,24 @@ WORKDIR /home/nonroot
ARG GIT_VERSION=local ARG GIT_VERSION=local
ARG BUILD_TAG ARG BUILD_TAG
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --chown=nonroot . . COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \ RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \ --bin pg_sni_router \
@@ -46,7 +58,8 @@ RUN set -e \
--bin proxy \ --bin proxy \
--bin neon_local \ --bin neon_local \
--bin storage_scrubber \ --bin storage_scrubber \
--locked --release --locked --release \
&& cachepot -s
# Build final image # Build final image
# #
@@ -91,7 +104,7 @@ RUN mkdir -p /data/.neon/ && \
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries # When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default. # that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH=/usr/local/v16/lib ENV LD_LIBRARY_PATH /usr/local/v16/lib
VOLUME ["/data"] VOLUME ["/data"]
@@ -99,5 +112,5 @@ USER neon
EXPOSE 6400 EXPOSE 6400
EXPOSE 9898 EXPOSE 9898
CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] CMD /usr/local/bin/pageserver -D /data/.neon

View File

@@ -58,7 +58,7 @@ RUN set -e \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# protobuf-compiler (protoc) # protobuf-compiler (protoc)
ENV PROTOC_VERSION=25.1 ENV PROTOC_VERSION 25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
&& unzip -q protoc.zip -d protoc \ && unzip -q protoc.zip -d protoc \
&& mv protoc/bin/protoc /usr/local/bin/protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip && rm awscliv2.zip
# Mold: A Modern Linker # Mold: A Modern Linker
ENV MOLD_VERSION=v2.33.0 ENV MOLD_VERSION v2.31.0
RUN set -e \ RUN set -e \
&& git clone https://github.com/rui314/mold.git \ && git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \ && mkdir mold/build \
@@ -168,7 +168,7 @@ USER nonroot:nonroot
WORKDIR /home/nonroot WORKDIR /home/nonroot
# Python # Python
ENV PYTHON_VERSION=3.9.19 \ ENV PYTHON_VERSION=3.9.18 \
PYENV_ROOT=/home/nonroot/.pyenv \ PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \ RUN set -e \
@@ -192,14 +192,9 @@ WORKDIR /home/nonroot
# Rust # Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.80.1 ENV RUSTC_VERSION=1.80.0
ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.30
ARG CARGO_DENY_VERSION=0.16.1
ARG CARGO_HACK_VERSION=0.6.31
ARG CARGO_NEXTEST_VERSION=0.9.72
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \ chmod +x rustup-init && \
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -208,13 +203,15 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
. "$HOME/.cargo/env" && \ . "$HOME/.cargo/env" && \
cargo --version && rustup --version && \ cargo --version && rustup --version && \
rustup component add llvm-tools-preview rustfmt clippy && \ rustup component add llvm-tools-preview rustfmt clippy && \
cargo install rustfilt --version ${RUSTFILT_VERSION} && \ cargo install --git https://github.com/paritytech/cachepot && \
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ cargo install rustfilt && \
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hakari && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-deny --locked && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER=cachepot
# Show versions # Show versions
RUN whoami \ RUN whoami \

View File

@@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* / make clean && cp -R /sfcgal/* /
ENV PATH="/usr/local/pgsql/bin:$PATH" ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH" ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \ RUN case "${PG_VERSION}" in \
"v14" | "v15") \ "v14" | "v15") \
@@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH" ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \ RUN case "${PG_VERSION}" in \
"v14") \ "v14") \
@@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \
FROM build-deps AS pg-cron-pg-build FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -506,7 +506,7 @@ RUN apt-get update && \
libboost-system1.74-dev \ libboost-system1.74-dev \
libeigen3-dev libeigen3-dev
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
@@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
FROM build-deps AS pg-uuidv7-pg-build FROM build-deps AS pg-uuidv7-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
FROM build-deps AS pg-roaringbitmap-pg-build FROM build-deps AS pg-roaringbitmap-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
FROM build-deps AS pg-semver-pg-build FROM build-deps AS pg-semver-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
@@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \ RUN case "${PG_VERSION}" in \
"v14" | "v15") \ "v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_VERSION=0.3.5 \
@@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \
FROM build-deps AS pg-anon-pg-build FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
@@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
FROM build-deps AS wal2json-pg-build FROM build-deps AS wal2json-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
@@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
FROM build-deps AS pg-ivm-build FROM build-deps AS pg-ivm-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
FROM build-deps AS pg-partman-build FROM build-deps AS pg-partman-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH" ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -933,8 +933,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src #COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -946,7 +945,7 @@ COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -961,7 +960,6 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done || exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test # cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch RUN patch -p1 < /ext-src/pg_hintplan.patch
@@ -1034,6 +1032,6 @@ RUN apt update && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
ENV LANG=en_US.utf8 ENV LANG en_US.utf8
USER postgres USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"] ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -313,3 +313,5 @@ To get more familiar with this aspect, refer to:
- Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices. - Read [CONTRIBUTING.md](/CONTRIBUTING.md) to learn about project code style and practices.
- To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md). - To get familiar with a source tree layout, use [sourcetree.md](/docs/sourcetree.md).
- To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html - To learn more about PostgreSQL internals, check http://www.interdb.jp/pg/index.html
.

View File

@@ -4,11 +4,6 @@ version = "0.1.0"
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
[features]
default = []
# Enables test specific features.
testing = []
[dependencies] [dependencies]
anyhow.workspace = true anyhow.workspace = true
async-compression.workspace = true async-compression.workspace = true

View File

@@ -400,15 +400,7 @@ impl ComputeNode {
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let mut retry_period_ms = 500.0; let mut retry_period_ms = 500.0;
let mut attempts = 0; let mut attempts = 0;
const DEFAULT_ATTEMPTS: u16 = 10; let max_attempts = 10;
#[cfg(feature = "testing")]
let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
u16::from_str(&v).unwrap()
} else {
DEFAULT_ATTEMPTS
};
#[cfg(not(feature = "testing"))]
let max_attempts = DEFAULT_ATTEMPTS;
loop { loop {
let result = self.try_get_basebackup(compute_state, lsn); let result = self.try_get_basebackup(compute_state, lsn);
match result { match result {

View File

@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() { for (var, val) in std::env::vars() {
if var.starts_with("NEON_") { if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val); cmd = cmd.env(var, val);
} }
} }

View File

@@ -158,8 +158,6 @@ pub struct NeonStorageControllerConf {
/// Threshold for auto-splitting a tenant into shards /// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>, pub split_threshold: Option<u64>,
pub max_secondary_lag_bytes: Option<u64>,
} }
impl NeonStorageControllerConf { impl NeonStorageControllerConf {
@@ -175,7 +173,6 @@ impl Default for NeonStorageControllerConf {
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
split_threshold: None, split_threshold: None,
max_secondary_lag_bytes: None,
} }
} }
} }

View File

@@ -383,10 +383,6 @@ impl StorageController {
args.push(format!("--split-threshold={split_threshold}")) args.push(format!("--split-threshold={split_threshold}"))
} }
if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
args.push(format!("--max-secondary-lag-bytes={lag}"))
}
args.push(format!( args.push(format!(
"--neon-local-repo-dir={}", "--neon-local-repo-dir={}",
self.env.base_data_dir.display() self.env.base_data_dir.display()

View File

@@ -4,7 +4,6 @@
# to your expectations and requirements. # to your expectations and requirements.
# Root options # Root options
[graph]
targets = [ targets = [
{ triple = "x86_64-unknown-linux-gnu" }, { triple = "x86_64-unknown-linux-gnu" },
{ triple = "aarch64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" },
@@ -13,7 +12,6 @@ targets = [
] ]
all-features = false all-features = false
no-default-features = false no-default-features = false
[output]
feature-depth = 1 feature-depth = 1
# This section is considered when running `cargo deny check advisories` # This section is considered when running `cargo deny check advisories`
@@ -21,16 +19,17 @@ feature-depth = 1
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
[advisories] [advisories]
db-urls = ["https://github.com/rustsec/advisory-db"] db-urls = ["https://github.com/rustsec/advisory-db"]
vulnerability = "deny"
unmaintained = "warn"
yanked = "warn" yanked = "warn"
notice = "warn"
[[advisories.ignore]] ignore = []
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
# This section is considered when running `cargo deny check licenses` # This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here: # More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses] [licenses]
unlicensed = "deny"
allow = [ allow = [
"Apache-2.0", "Apache-2.0",
"Artistic-2.0", "Artistic-2.0",
@@ -43,6 +42,10 @@ allow = [
"OpenSSL", "OpenSSL",
"Unicode-DFS-2016", "Unicode-DFS-2016",
] ]
deny = []
copyleft = "warn"
allow-osi-fsf-free = "neither"
default = "deny"
confidence-threshold = 0.8 confidence-threshold = 0.8
exceptions = [ exceptions = [
# Zlib license has some restrictions if we decide to change sth # Zlib license has some restrictions if we decide to change sth

View File

@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
rm -rf $TMPDIR rm -rf $TMPDIR
# We are running tests now # We are running tests now
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
then then
cleanup cleanup

View File

@@ -1,15 +1,15 @@
#!/bin/bash #!/bin/bash
set -x set -x
cd /ext-src || exit 2 cd /ext-src
FAILED= FAILED=
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
for d in ${LIST} for d in ${LIST}
do do
[ -d "${d}" ] || continue [ -d ${d} ] || continue
psql -c "select 1" >/dev/null || break psql -c "select 1" >/dev/null || break
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" make -C ${d} installcheck || FAILED="${d} ${FAILED}"
done done
[ -z "${FAILED}" ] && exit 0 [ -z "${FAILED}" ] && exit 0
echo "${FAILED}" echo ${FAILED}
exit 1 exit 1

View File

@@ -1,18 +1,13 @@
# Summary # Summary
# Looking for `neon.tech` docs?
This page linkes to a selection of technical content about the open source code in this repository.
Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
in this repository.
# Architecture
[Introduction]() [Introduction]()
- [Separation of Compute and Storage](./separation-compute-storage.md) - [Separation of Compute and Storage](./separation-compute-storage.md)
# Architecture
- [Compute]() - [Compute]()
- [WAL proposer]()
- [WAL Backpressure]()
- [Postgres changes](./core_changes.md) - [Postgres changes](./core_changes.md)
- [Pageserver](./pageserver.md) - [Pageserver](./pageserver.md)
@@ -21,15 +16,33 @@ in this repository.
- [WAL Redo](./pageserver-walredo.md) - [WAL Redo](./pageserver-walredo.md)
- [Page cache](./pageserver-pagecache.md) - [Page cache](./pageserver-pagecache.md)
- [Storage](./pageserver-storage.md) - [Storage](./pageserver-storage.md)
- [Datadir mapping]()
- [Layer files]()
- [Branching]()
- [Garbage collection]()
- [Cloud Storage]()
- [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing a GetPage request](./pageserver-processing-getpage.md)
- [Processing WAL](./pageserver-processing-wal.md) - [Processing WAL](./pageserver-processing-wal.md)
- [Management API]()
- [Tenant Rebalancing]()
- [WAL Service](walservice.md) - [WAL Service](walservice.md)
- [Consensus protocol](safekeeper-protocol.md) - [Consensus protocol](safekeeper-protocol.md)
- [Management API]()
- [Rebalancing]()
- [Control Plane]()
- [Proxy]()
- [Source view](./sourcetree.md) - [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline. - [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging](./error-handling.md) - [Error handling and logging](./error-handling.md)
- [Testing]()
- [Unit testing]()
- [Integration testing]()
- [Benchmarks]()
- [Glossary](./glossary.md) - [Glossary](./glossary.md)
@@ -45,6 +58,28 @@ in this repository.
# RFCs # RFCs
Major changes are documented in RFCS: - [RFCs](./rfcs/README.md)
- See [RFCs](./rfcs/README.md) for more information
- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs - [002-storage](rfcs/002-storage.md)
- [003-laptop-cli](rfcs/003-laptop-cli.md)
- [004-durability](rfcs/004-durability.md)
- [005-zenith_local](rfcs/005-zenith_local.md)
- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
- [008-push-pull](rfcs/008-push-pull.md)
- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
- [010-storage_details](rfcs/010-storage_details.md)
- [011-retention-policy](rfcs/011-retention-policy.md)
- [012-background-tasks](rfcs/012-background-tasks.md)
- [013-term-history](rfcs/013-term-history.md)
- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
- [014-storage-lsm](rfcs/014-storage-lsm.md)
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -107,10 +107,7 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system), /// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16 /// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 { pub fn to_i128(&self) -> i128 {
assert!( assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
"invalid key: {self}",
);
(((self.field1 & 0x7F) as i128) << 120) (((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104) | (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72) | ((self.field3 as i128) << 72)

View File

@@ -637,13 +637,6 @@ pub struct TenantInfo {
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus, pub attachment_status: TenantAttachmentStatus,
pub generation: u32, pub generation: u32,
/// Opaque explanation if gc is being blocked.
///
/// Only looked up for the individual tenant detail, not the listing. This is purely for
/// debugging, not included in openapi.
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_blocking: Option<String>,
} }
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]
@@ -947,8 +940,6 @@ pub struct TopTenantShardsResponse {
} }
pub mod virtual_file { pub mod virtual_file {
use std::path::PathBuf;
#[derive( #[derive(
Copy, Copy,
Clone, Clone,
@@ -967,53 +958,6 @@ pub mod virtual_file {
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
TokioEpollUring, TokioEpollUring,
} }
/// Direct IO modes for a pageserver.
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum DirectIoMode {
/// Direct IO disabled (uses usual buffered IO).
#[default]
Disabled,
/// Direct IO disabled (performs checks and perf simulations).
Evaluate {
/// Alignment check level
alignment_check: DirectIoAlignmentCheckLevel,
/// Latency padded for performance simulation.
latency_padding: DirectIoLatencyPadding,
},
/// Direct IO enabled.
Enabled {
/// Actions to perform on alignment error.
on_alignment_error: DirectIoOnAlignmentErrorAction,
},
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum DirectIoAlignmentCheckLevel {
#[default]
Error,
Log,
None,
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum DirectIoOnAlignmentErrorAction {
Error,
#[default]
FallbackToBuffered,
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(tag = "type", rename_all = "kebab-case")]
pub enum DirectIoLatencyPadding {
/// Pad virtual file operations with IO to a fake file.
FakeFileRW { path: PathBuf },
#[default]
None,
}
} }
// Wrapped in libpq CopyData // Wrapped in libpq CopyData
@@ -1483,7 +1427,6 @@ mod tests {
current_physical_size: Some(42), current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached, attachment_status: TenantAttachmentStatus::Attached,
generation: 1, generation: 1,
gc_blocking: None,
}; };
let expected_active = json!({ let expected_active = json!({
"id": original_active.id.to_string(), "id": original_active.id.to_string(),
@@ -1506,7 +1449,6 @@ mod tests {
current_physical_size: Some(42), current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached, attachment_status: TenantAttachmentStatus::Attached,
generation: 1, generation: 1,
gc_blocking: None,
}; };
let expected_broken = json!({ let expected_broken = json!({
"id": original_broken.id.to_string(), "id": original_broken.id.to_string(),

View File

@@ -1,8 +1,6 @@
use std::collections::HashSet;
use utils::id::TimelineId; use utils::id::TimelineId;
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)] #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached { pub struct AncestorDetached {
pub reparented_timelines: HashSet<TimelineId>, pub reparented_timelines: Vec<TimelineId>,
} }

View File

@@ -144,20 +144,7 @@ impl PgConnectionConfig {
// implement and this function is hardly a bottleneck. The function is only called around // implement and this function is hardly a bottleneck. The function is only called around
// establishing a new connection. // establishing a new connection.
#[allow(unstable_name_collisions)] #[allow(unstable_name_collisions)]
config.options( config.options(&encode_options(&self.options));
&self
.options
.iter()
.map(|s| {
if s.contains(['\\', ' ']) {
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
} else {
Cow::Borrowed(s.as_str())
}
})
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
.collect::<String>(),
);
} }
config config
} }
@@ -178,6 +165,21 @@ impl PgConnectionConfig {
} }
} }
#[allow(unstable_name_collisions)]
fn encode_options(options: &[String]) -> String {
options
.iter()
.map(|s| {
if s.contains(['\\', ' ']) {
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
} else {
Cow::Borrowed(s.as_str())
}
})
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
.collect::<String>()
}
impl fmt::Display for PgConnectionConfig { impl fmt::Display for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// The password is intentionally hidden and not part of this display string. // The password is intentionally hidden and not part of this display string.
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig {
#[cfg(test)] #[cfg(test)]
mod tests_pg_connection_config { mod tests_pg_connection_config {
use crate::PgConnectionConfig; use crate::{encode_options, PgConnectionConfig};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use url::Host; use url::Host;
@@ -255,18 +257,12 @@ mod tests_pg_connection_config {
#[test] #[test]
fn test_with_options() { fn test_with_options() {
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ let options = encode_options(&[
"hello", "hello".to_owned(),
"world", "world".to_owned(),
"with space", "with space".to_owned(),
"and \\ backslashes", "and \\ backslashes".to_owned(),
]); ]);
assert_eq!(cfg.host(), &*STUB_HOST); assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
assert_eq!(cfg.port(), 123);
assert_eq!(cfg.raw_address(), "stub.host.example:123");
assert_eq!(
cfg.to_tokio_postgres_config().get_options(),
Some("hello world with\\ space and\\ \\\\\\ backslashes")
);
} }
} }

View File

@@ -128,7 +128,7 @@ pub mod circuit_breaker;
/// ///
/// ############################################################################################# /// #############################################################################################
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details. /// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
/// We used `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036> /// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
/// The problem needs further investigation and regular `const` declaration instead of a macro. /// The problem needs further investigation and regular `const` declaration instead of a macro.

View File

@@ -78,9 +78,8 @@ impl Drop for GateGuard {
} }
} }
#[derive(Debug, thiserror::Error)] #[derive(Debug)]
pub enum GateError { pub enum GateError {
#[error("gate is closed")]
GateClosed, GateClosed,
} }

View File

@@ -108,7 +108,3 @@ harness = false
[[bench]] [[bench]]
name = "bench_walredo" name = "bench_walredo"
harness = false harness = false
[[bench]]
name = "bench_ingest"
harness = false

View File

@@ -1,239 +0,0 @@
use std::{env, num::NonZeroUsize};
use bytes::Bytes;
use camino::Utf8PathBuf;
use criterion::{criterion_group, criterion_main, Criterion};
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
use pageserver_api::{key::Key, shard::TenantShardId};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
};
// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
h ^= h >> 16;
h = h.wrapping_mul(0x85ebca6b);
h ^= h >> 13;
h = h.wrapping_mul(0xc2b2ae35);
h ^= h >> 16;
h
}
enum KeyLayout {
/// Sequential unique keys
Sequential,
/// Random unique keys
Random,
/// Random keys, but only use the bits from the mask of them
RandomReuse(u32),
}
enum WriteDelta {
Yes,
No,
}
async fn ingest(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) -> anyhow::Result<()> {
let mut lsn = utils::lsn::Lsn(1000);
let mut key = Key::from_i128(0x0);
let timeline_id = TimelineId::generate();
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
for i in 0..put_count {
lsn += put_size as u64;
// Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
// usually care the most about write performance when they're blasting a huge batch of data into a huge table.
match key_layout {
KeyLayout::Sequential => {
// Use sequential order to illustrate the experience a user is likely to have
// when ingesting bulk data.
key.field6 = i as u32;
}
KeyLayout::Random => {
// Use random-order keys to avoid giving a false advantage to data structures that are
// faster when inserting on the end.
key.field6 = murmurhash32(i as u32);
}
KeyLayout::RandomReuse(mask) => {
// Use low bits only, to limit cardinality
key.field6 = murmurhash32(i as u32) & mask;
}
}
layer.put_value(key, lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;
if matches!(write_delta, WriteDelta::Yes) {
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
max_concurrency: NonZeroUsize::new(1).unwrap(),
});
let (_desc, path) = layer
.write_to_disk(&ctx, None, l0_flush_state.inner())
.await?
.unwrap();
tokio::fs::remove_file(path).await?;
}
Ok(())
}
/// Wrapper to instantiate a tokio runtime
fn ingest_main(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
runtime.block_on(async move {
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
if let Err(e) = r {
panic!("{e:?}");
}
});
}
/// Declare a series of benchmarks for the Pageserver's ingest write path.
///
/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
///
/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on
/// a fast disk, CPU is the bottleneck at time of writing.
fn criterion_benchmark(c: &mut Criterion) {
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
eprintln!("Data directory: {}", temp_dir.path());
let conf: &'static PageServerConf = Box::leak(Box::new(
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
));
virtual_file::init(16384, virtual_file::io_engine_for_bench());
page_cache::init(conf.page_cache_size);
{
let mut group = c.benchmark_group("ingest-small-values");
let put_size = 100usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/100b seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Random,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::RandomReuse(0x3ff),
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
{
let mut group = c.benchmark_group("ingest-big-values");
let put_size = 8192usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/8k seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -123,7 +123,6 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations // after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.get_impl, "starting with get page implementation"); info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

View File

@@ -300,9 +300,6 @@ pub struct PageServerConf {
/// This flag is temporary and will be removed after gradual rollout. /// This flag is temporary and will be removed after gradual rollout.
/// See <https://github.com/neondatabase/neon/issues/8184>. /// See <https://github.com/neondatabase/neon/issues/8184>.
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
/// Direct IO settings
pub virtual_file_direct_io: virtual_file::DirectIoMode,
} }
/// We do not want to store this in a PageServerConf because the latter may be logged /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -411,8 +408,6 @@ struct PageServerConfigBuilder {
l0_flush: BuilderValue<L0FlushConfig>, l0_flush: BuilderValue<L0FlushConfig>,
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>, compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
} }
impl PageServerConfigBuilder { impl PageServerConfigBuilder {
@@ -503,7 +498,6 @@ impl PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()), l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
} }
} }
} }
@@ -691,10 +685,6 @@ impl PageServerConfigBuilder {
self.compact_level0_phase1_value_access = BuilderValue::Set(value); self.compact_level0_phase1_value_access = BuilderValue::Set(value);
} }
pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
self.virtual_file_direct_io = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> { pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
let default = Self::default_values(); let default = Self::default_values();
@@ -753,7 +743,6 @@ impl PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb, ephemeral_bytes_per_memory_kb,
l0_flush, l0_flush,
compact_level0_phase1_value_access, compact_level0_phase1_value_access,
virtual_file_direct_io,
} }
CUSTOM LOGIC CUSTOM LOGIC
{ {
@@ -1029,9 +1018,6 @@ impl PageServerConf {
"compact_level0_phase1_value_access" => { "compact_level0_phase1_value_access" => {
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
} }
"virtual_file_direct_io" => {
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
}
_ => bail!("unrecognized pageserver option '{key}'"), _ => bail!("unrecognized pageserver option '{key}'"),
} }
} }
@@ -1117,7 +1103,6 @@ impl PageServerConf {
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(), l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
} }
} }
} }
@@ -1360,7 +1345,6 @@ background_task_maximum_delay = '334 s'
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(), l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
}, },
"Correct defaults should be used when no config values are provided" "Correct defaults should be used when no config values are provided"
); );
@@ -1436,7 +1420,6 @@ background_task_maximum_delay = '334 s'
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(), l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
}, },
"Should be able to parse all basic config values correctly" "Should be able to parse all basic config values correctly"
); );

View File

@@ -308,45 +308,6 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently add a gc blocking at the tenant level because of this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently remove a tenant level gc blocking for this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/location_config: /v1/tenant/{tenant_shard_id}/location_config:
parameters: parameters:
- name: tenant_shard_id - name: tenant_shard_id
@@ -932,7 +893,7 @@ components:
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest: ArchivalConfigRequest:
type: object type: object
required: required
- state - state
properties: properties:
state: state:

View File

@@ -935,7 +935,6 @@ async fn tenant_list_handler(
generation: (*gen) generation: (*gen)
.into() .into()
.expect("Tenants are always attached with a generation"), .expect("Tenants are always attached with a generation"),
gc_blocking: None,
}) })
.collect::<Vec<TenantInfo>>(); .collect::<Vec<TenantInfo>>();
@@ -987,7 +986,6 @@ async fn tenant_status(
.generation() .generation()
.into() .into()
.expect("Tenants are always attached with a generation"), .expect("Tenants are always attached with a generation"),
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
}, },
walredo: tenant.wal_redo_manager_status(), walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(), timelines: tenant.list_timeline_ids(),
@@ -1162,10 +1160,7 @@ async fn layer_map_info_handler(
let timeline = let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?; .await?;
let layer_map_info = timeline let layer_map_info = timeline.layer_map_info(reset).await;
.layer_map_info(reset)
.await
.map_err(|_shutdown| ApiError::ShuttingDown)?;
json_response(StatusCode::OK, layer_map_info) json_response(StatusCode::OK, layer_map_info)
} }
@@ -1231,72 +1226,6 @@ async fn evict_timeline_layer_handler(
} }
} }
async fn timeline_gc_blocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, true).await
}
async fn timeline_gc_unblocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, false).await
}
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
async fn block_or_unblock_gc(
request: Request<Body>,
block: bool,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::{
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
};
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, true)?;
let fut = async {
if block {
timeline.block_gc(&tenant).await.map(|_| ())
} else {
timeline.unblock_gc(&tenant).await
}
};
let span = tracing::info_span!(
"block_or_unblock_gc",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
timeline_id = %timeline_id,
block = block,
);
let res = fut.instrument(span).await;
res.map_err(|e| {
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
ApiError::ShuttingDown
} else {
ApiError::InternalServerError(e)
}
})?;
json_response(StatusCode::OK, ())
}
/// Get tenant_size SVG graph along with the JSON data. /// Get tenant_size SVG graph along with the JSON data.
fn synthetic_size_html_response( fn synthetic_size_html_response(
inputs: ModelInputs, inputs: ModelInputs,
@@ -2975,14 +2904,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler), |r| api_handler(r, evict_timeline_layer_handler),
) )
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|r| api_handler(r, timeline_gc_blocking_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|r| api_handler(r, timeline_gc_unblocking_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler) api_handler(r, secondary_upload_handler)
}) })

View File

@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
#[derive(Clone)] #[derive(Clone)]
pub struct L0FlushGlobalState(Arc<Inner>); pub struct L0FlushGlobalState(Arc<Inner>);
pub enum Inner { pub(crate) enum Inner {
PageCached, PageCached,
Direct { semaphore: tokio::sync::Semaphore }, Direct { semaphore: tokio::sync::Semaphore },
} }
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
} }
} }
pub fn inner(&self) -> &Arc<Inner> { pub(crate) fn inner(&self) -> &Arc<Inner> {
&self.0 &self.0
} }
} }

View File

@@ -122,15 +122,11 @@ impl Listener {
} }
} }
impl Connections { impl Connections {
pub(crate) async fn shutdown(self) { pub async fn shutdown(self) {
let Self { cancel, mut tasks } = self; let Self { cancel, mut tasks } = self;
cancel.cancel(); cancel.cancel();
while let Some(res) = tasks.join_next().await { while let Some(res) = tasks.join_next().await {
Self::handle_connection_completion(res); // the logging done here mimics what was formerly done by task_mgr
}
}
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res { match res {
Ok(Ok(())) => {} Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
@@ -138,6 +134,7 @@ impl Connections {
} }
} }
} }
}
/// ///
/// Main loop of the page service. /// Main loop of the page service.
@@ -158,19 +155,20 @@ pub async fn libpq_listener_main(
let connections_cancel = CancellationToken::new(); let connections_cancel = CancellationToken::new();
let mut connection_handler_tasks = tokio::task::JoinSet::default(); let mut connection_handler_tasks = tokio::task::JoinSet::default();
loop { // Wait for a new connection to arrive, or for server shutdown.
let accepted = tokio::select! { while let Some(res) = tokio::select! {
biased; biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we dont poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};
match accepted { _ = listener_cancel.cancelled() => {
// We were requested to shut down.
None
}
res = listener.accept() => {
Some(res)
}
} {
match res {
Ok((socket, peer_addr)) => { Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it. // Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr); debug!("accepted connection from {}", peer_addr);

View File

@@ -56,6 +56,7 @@ impl Statvfs {
} }
pub mod mock { pub mod mock {
use anyhow::Context;
use camino::Utf8Path; use camino::Utf8Path;
use regex::Regex; use regex::Regex;
use tracing::log::info; use tracing::log::info;
@@ -134,30 +135,14 @@ pub mod mock {
{ {
continue; continue;
} }
let m = match entry.metadata() { total += entry
Ok(m) => m, .metadata()
Err(e) if is_not_found(&e) => { .with_context(|| format!("get metadata of {:?}", entry.path()))?
// some temp file which got removed right as we are walking .len();
continue;
}
Err(e) => {
return Err(anyhow::Error::new(e)
.context(format!("get metadata of {:?}", entry.path())))
}
};
total += m.len();
} }
Ok(total) Ok(total)
} }
fn is_not_found(e: &walkdir::Error) -> bool {
let Some(io_error) = e.io_error() else {
return false;
};
let kind = io_error.kind();
matches!(kind, std::io::ErrorKind::NotFound)
}
pub struct Statvfs { pub struct Statvfs {
pub blocks: u64, pub blocks: u64,
pub blocks_available: u64, pub blocks_available: u64,

View File

@@ -148,7 +148,6 @@ pub(crate) mod timeline;
pub mod size; pub mod size;
mod gc_block;
pub(crate) mod throttle; pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -304,12 +303,6 @@ pub struct Tenant {
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>, ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
///
/// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
/// proceeding.
pub(crate) gc_block: gc_block::GcBlock,
l0_flush_global_state: L0FlushGlobalState, l0_flush_global_state: L0FlushGlobalState,
} }
@@ -601,12 +594,6 @@ impl From<PageReconstructError> for GcError {
} }
} }
impl From<timeline::layer_manager::Shutdown> for GcError {
fn from(_: timeline::layer_manager::Shutdown) -> Self {
GcError::TimelineCancelled
}
}
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError { pub(crate) enum LoadConfigError {
#[error("TOML deserialization error: '{0}'")] #[error("TOML deserialization error: '{0}'")]
@@ -716,7 +703,6 @@ impl Tenant {
.read() .read()
.await .await
.layer_map() .layer_map()
.expect("currently loading, layer manager cannot be shutdown already")
.iter_historic_layers() .iter_historic_layers()
.next() .next()
.is_some(), .is_some(),
@@ -1050,8 +1036,6 @@ impl Tenant {
} }
} }
let mut gc_blocks = HashMap::new();
// For every timeline, download the metadata file, scan the local directory, // For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local // and build a layer map that contains an entry for each remote and local
// layer file. // layer file.
@@ -1061,16 +1045,6 @@ impl Tenant {
.remove(&timeline_id) .remove(&timeline_id)
.expect("just put it in above"); .expect("just put it in above");
if let Some(blocking) = index_part.gc_blocking.as_ref() {
// could just filter these away, but it helps while testing
anyhow::ensure!(
!blocking.reasons.is_empty(),
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
);
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
assert!(prev.is_none());
}
// TODO again handle early failure // TODO again handle early failure
self.load_remote_timeline( self.load_remote_timeline(
timeline_id, timeline_id,
@@ -1115,8 +1089,6 @@ impl Tenant {
// IndexPart is the source of truth. // IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?; self.clean_up_timelines(&existent_timelines)?;
self.gc_block.set_scanned(gc_blocks);
fail::fail_point!("attach-before-activate", |_| { fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate"); anyhow::bail!("attach-before-activate");
}); });
@@ -1707,14 +1679,6 @@ impl Tenant {
} }
} }
let _guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(reasons) => {
info!("Skipping GC: {reasons}");
return Ok(GcResult::default());
}
};
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx) self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await .await
} }
@@ -2727,7 +2691,6 @@ impl Tenant {
)), )),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(), ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state, l0_flush_global_state,
} }
} }
@@ -3012,6 +2975,54 @@ impl Tenant {
// because that will stall branch creation. // because that will stall branch creation.
let gc_cs = self.gc_cs.lock().await; let gc_cs = self.gc_cs.lock().await;
// Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
// depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
// and fail out if it's inaccurate.
// (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
{
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
BTreeMap::new();
timelines.iter().for_each(|timeline| {
if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
let ancestor_children =
all_branchpoints.entry(*ancestor_timeline_id).or_default();
ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
}
});
for timeline in &timelines {
let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
.remove(&timeline.timeline_id)
.unwrap_or_default();
branchpoints.sort_by_key(|b| b.0);
let target = timeline.gc_info.read().unwrap();
// We require that retain_lsns contains everything in `branchpoints`, but not that
// they are exactly equal: timeline deletions can race with us, so retain_lsns
// may contain some extra stuff. It is safe to have extra timelines in there, because it
// just means that we retain slightly more data than we otherwise might.
let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
for b in &branchpoints {
if !have_branchpoints.contains(b) {
tracing::error!(
"Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}",
branchpoints,
target.retain_lsns
);
debug_assert!(false);
// Do not GC based on bad information!
// (ab-use an existing GcError type rather than adding a new one, since this is a
// "should never happen" check that will be removed soon).
return Err(GcError::Remote(anyhow::anyhow!(
"retain_lsns failed validation!"
)));
}
}
}
}
// Ok, we now know all the branch points. // Ok, we now know all the branch points.
// Update the GC information for each timeline. // Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timelines.len()); let mut gc_timelines = Vec::with_capacity(timelines.len());
@@ -4081,7 +4092,7 @@ pub(crate) mod harness {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::{BTreeMap, BTreeSet}; use std::collections::BTreeMap;
use super::*; use super::*;
use crate::keyspace::KeySpaceAccum; use crate::keyspace::KeySpaceAccum;
@@ -4633,10 +4644,10 @@ mod tests {
let layer_map = tline.layers.read().await; let layer_map = tline.layers.read().await;
let level0_deltas = layer_map let level0_deltas = layer_map
.layer_map()? .layer_map()
.level0_deltas() .get_level0_deltas()
.iter() .into_iter()
.map(|desc| layer_map.get_from_desc(desc)) .map(|desc| layer_map.get_from_desc(&desc))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
assert!(!level0_deltas.is_empty()); assert!(!level0_deltas.is_empty());
@@ -4756,7 +4767,7 @@ mod tests {
lsn: Lsn, lsn: Lsn,
repeat: usize, repeat: usize,
key_count: usize, key_count: usize,
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> { ) -> anyhow::Result<()> {
let compact = true; let compact = true;
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
} }
@@ -4769,9 +4780,7 @@ mod tests {
repeat: usize, repeat: usize,
key_count: usize, key_count: usize,
compact: bool, compact: bool,
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> { ) -> anyhow::Result<()> {
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0; let mut blknum = 0;
@@ -4792,7 +4801,6 @@ mod tests {
ctx, ctx,
) )
.await?; .await?;
inserted.entry(test_key).or_default().insert(lsn);
writer.finish_write(lsn); writer.finish_write(lsn);
drop(writer); drop(writer);
@@ -4817,7 +4825,7 @@ mod tests {
assert_eq!(res.layers_removed, 0, "this never removes anything"); assert_eq!(res.layers_removed, 0, "this never removes anything");
} }
Ok(inserted) Ok(())
} }
// //
@@ -4864,16 +4872,14 @@ mod tests {
.await?; .await?;
let lsn = Lsn(0x10); let lsn = Lsn(0x10);
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await; let guard = tline.layers.read().await;
let lm = guard.layer_map()?; guard.layer_map().dump(true, &ctx).await?;
lm.dump(true, &ctx).await?;
let mut reads = Vec::new(); let mut reads = Vec::new();
let mut prev = None; let mut prev = None;
lm.iter_historic_layers().for_each(|desc| { guard.layer_map().iter_historic_layers().for_each(|desc| {
if !desc.is_delta() { if !desc.is_delta() {
prev = Some(desc.clone()); prev = Some(desc.clone());
return; return;
@@ -4927,39 +4933,9 @@ mod tests {
&ctx, &ctx,
) )
.await; .await;
tline
let mut expected_lsns: HashMap<Key, Lsn> = Default::default(); .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
let mut expect_missing = false; .await;
let mut key = read.start().unwrap();
while key != read.end().unwrap() {
if let Some(lsns) = inserted.get(&key) {
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
match expected_lsn {
Some(lsn) => {
expected_lsns.insert(key, *lsn);
}
None => {
expect_missing = true;
break;
}
}
} else {
expect_missing = true;
break;
}
key = key.next();
}
if expect_missing {
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
} else {
for (key, image) in vectored_res? {
let expected_lsn = expected_lsns.get(&key).expect("determined above");
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
assert_eq!(image?, expected_image);
}
}
} }
Ok(()) Ok(())
@@ -5009,6 +4985,10 @@ mod tests {
) )
.await; .await;
child_timeline
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
.await;
let images = vectored_res?; let images = vectored_res?;
assert!(images.is_empty()); assert!(images.is_empty());
Ok(()) Ok(())
@@ -5879,12 +5859,23 @@ mod tests {
tline.freeze_and_flush().await?; // force create a delta layer tline.freeze_and_flush().await?; // force create a delta layer
} }
let before_num_l0_delta_files = let before_num_l0_delta_files = tline
tline.layers.read().await.layer_map()?.level0_deltas().len(); .layers
.read()
.await
.layer_map()
.get_level0_deltas()
.len();
tline.compact(&cancel, EnumSet::empty(), &ctx).await?; tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); let after_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()
.len();
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
@@ -6908,10 +6899,7 @@ mod tests {
} }
let cancel = CancellationToken::new(); let cancel = CancellationToken::new();
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for (idx, expected) in expected_result.iter().enumerate() { for (idx, expected) in expected_result.iter().enumerate() {
assert_eq!( assert_eq!(
@@ -7005,10 +6993,7 @@ mod tests {
guard.cutoffs.time = Lsn(0x40); guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40); guard.cutoffs.space = Lsn(0x40);
} }
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(()) Ok(())
} }
@@ -7342,10 +7327,7 @@ mod tests {
} }
let cancel = CancellationToken::new(); let cancel = CancellationToken::new();
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for idx in 0..10 { for idx in 0..10 {
assert_eq!( assert_eq!(
@@ -7371,10 +7353,7 @@ mod tests {
guard.cutoffs.time = Lsn(0x40); guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40); guard.cutoffs.space = Lsn(0x40);
} }
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(()) Ok(())
} }
@@ -7919,28 +7898,11 @@ mod tests {
verify_result().await; verify_result().await;
let cancel = CancellationToken::new(); let cancel = CancellationToken::new();
let mut dryrun_flags = EnumSet::new(); tline.compact_with_gc(&cancel, &ctx).await.unwrap();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; verify_result().await;
// compact again // compact again
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; verify_result().await;
// increase GC horizon and compact again // increase GC horizon and compact again
@@ -7950,17 +7912,11 @@ mod tests {
guard.cutoffs.time = Lsn(0x38); guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38); guard.cutoffs.space = Lsn(0x38);
} }
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
// not increasing the GC horizon and compact again // not increasing the GC horizon and compact again
tline tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; verify_result().await;
Ok(()) Ok(())
@@ -8141,10 +8097,7 @@ mod tests {
verify_result().await; verify_result().await;
let cancel = CancellationToken::new(); let cancel = CancellationToken::new();
branch_tline branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; verify_result().await;

View File

@@ -29,7 +29,6 @@ impl EphemeralFile {
conf: &PageServerConf, conf: &PageServerConf,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
timeline_id: TimelineId, timeline_id: TimelineId,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<EphemeralFile, io::Error> { ) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -52,12 +51,10 @@ impl EphemeralFile {
) )
.await?; .await?;
let prewarm = conf.l0_flush.prewarm_on_write();
Ok(EphemeralFile { Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id, _tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id, _timeline_id: timeline_id,
rw: page_caching::RW::new(file, prewarm, gate_guard), rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
}) })
} }
@@ -164,11 +161,7 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> { async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let gate = utils::sync::gate::Gate::default(); let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
let entered = gate.enter().unwrap();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!( assert_eq!(
@@ -222,38 +215,4 @@ mod tests {
Ok(()) Ok(())
} }
#[tokio::test]
async fn ephemeral_file_holds_gate_open() {
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
let (conf, tenant_id, timeline_id, ctx) =
harness("ephemeral_file_holds_gate_open").unwrap();
let gate = utils::sync::gate::Gate::default();
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let mut closing = tokio::task::spawn(async move {
gate.close().await;
});
// gate is entered until the ephemeral file is dropped
// do not start paused tokio-epoll-uring has a sleep loop
tokio::time::pause();
tokio::time::timeout(FOREVER, &mut closing)
.await
.expect_err("closing cannot complete before dropping");
// this is a requirement of the reset_tenant functionality: we have to be able to restart a
// tenant fast, and for that, we need all tenant_dir operations be guarded by entering a gate
drop(file);
tokio::time::timeout(FOREVER, &mut closing)
.await
.expect("closing completes right away")
.expect("closing does not panic");
}
} }

View File

@@ -18,8 +18,6 @@ use super::zero_padded_read_write;
pub struct RW { pub struct RW {
page_cache_file_id: page_cache::FileId, page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<PreWarmingWriter>, rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
_gate_guard: utils::sync::gate::GateGuard,
} }
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], /// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
@@ -31,11 +29,7 @@ pub enum PrewarmOnWrite {
} }
impl RW { impl RW {
pub fn new( pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
_gate_guard: utils::sync::gate::GateGuard,
) -> Self {
let page_cache_file_id = page_cache::next_file_id(); let page_cache_file_id = page_cache::next_file_id();
Self { Self {
page_cache_file_id, page_cache_file_id,
@@ -44,7 +38,6 @@ impl RW {
file, file,
prewarm_on_write, prewarm_on_write,
)), )),
_gate_guard,
} }
} }
@@ -152,7 +145,6 @@ impl Drop for RW {
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file // unlink the file
// we are clear to do this, because we have entered a gate
let res = std::fs::remove_file(&self.rw.as_writer().file.path); let res = std::fs::remove_file(&self.rw.as_writer().file.path);
if let Err(e) = res { if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound { if e.kind() != std::io::ErrorKind::NotFound {

View File

@@ -1,213 +0,0 @@
use std::collections::HashMap;
use utils::id::TimelineId;
use super::remote_timeline_client::index::GcBlockingReason;
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
///
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
/// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
reasons: std::sync::Mutex<Storage>,
blocking: tokio::sync::Mutex<()>,
}
impl GcBlock {
/// Start another gc iteration.
///
/// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
/// it's ending, or if not currently possible, a value describing the reasons why not.
///
/// Cancellation safe.
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
let reasons = {
let g = self.reasons.lock().unwrap();
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
// tests, we use everything. we should warn if the gc has been consecutively blocked
// for more than 1h (within single tenant session?).
BlockingReasons::clean_and_summarize(g)
};
if let Some(reasons) = reasons {
Err(reasons)
} else {
Ok(Guard {
_inner: self.blocking.lock().await,
})
}
}
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
let g = self.reasons.lock().unwrap();
BlockingReasons::summarize(&g)
}
/// Start blocking gc for this one timeline for the given reason.
///
/// This is not a guard based API but instead it mimics set API. The returned future will not
/// resolve until an existing gc round has completed.
///
/// Returns true if this block was new, false if gc was already blocked for this reason.
///
/// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
/// keep the gc blocking reason.
pub(crate) async fn insert(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
let uploaded = timeline
.remote_client
.schedule_insert_gc_block_reason(reason)?;
(added, uploaded)
};
uploaded.await?;
// ensure that any ongoing gc iteration has completed
drop(self.blocking.lock().await);
Ok(added)
}
/// Remove blocking gc for this one timeline and the given reason.
pub(crate) async fn remove(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<()> {
use std::collections::hash_map::Entry;
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
if set.is_empty() {
oe.remove();
}
}
Entry::Vacant(_) => {
// we must still do the index_part.json update regardless, in case we had earlier
// been cancelled
}
}
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
.remote_client
.schedule_remove_gc_block_reason(reason)?;
(remaining_blocks, uploaded)
};
uploaded.await?;
// no need to synchronize with gc iteration again
if remaining_blocks > 0 {
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
} else {
tracing::info!("gc is now unblocked for the tenant");
}
Ok(())
}
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.is_empty() {
return;
}
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
if unblocked {
tracing::info!("gc is now unblocked following deletion");
}
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
}
}
}
pub(super) struct Guard<'a> {
_inner: tokio::sync::MutexGuard<'a, ()>,
}
#[derive(Debug)]
pub(crate) struct BlockingReasons {
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
if !g.is_empty() {
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
} else {
None
}
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
if g.is_empty() {
None
} else {
let reasons = g
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
}
}
}

View File

@@ -846,8 +846,8 @@ impl LayerMap {
} }
/// Return all L0 delta layers /// Return all L0 delta layers
pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> { pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
&self.l0_delta_layers self.l0_delta_layers.to_vec()
} }
/// debugging function to print out the contents of the layer map /// debugging function to print out the contents of the layer map

View File

@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng}; use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow; use std::borrow::Cow;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet}; use std::collections::{BTreeMap, HashMap};
use std::ops::Deref; use std::ops::Deref;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
@@ -224,8 +224,21 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
} }
/// See [`Self::spawn`]. /// See [`Self::spawn`].
#[derive(Clone, Default)] #[derive(Clone)]
pub struct BackgroundPurges(tokio_util::task::TaskTracker); pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
enum BackgroundPurgesInner {
Open(tokio::task::JoinSet<()>),
// we use the async mutex for coalescing
ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
}
impl Default for BackgroundPurges {
fn default() -> Self {
Self(Arc::new(std::sync::Mutex::new(
BackgroundPurgesInner::Open(JoinSet::new()),
)))
}
}
impl BackgroundPurges { impl BackgroundPurges {
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -234,32 +247,24 @@ impl BackgroundPurges {
/// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
/// Thus the [`BackgroundPurges`] type to keep track of these tasks. /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
pub fn spawn(&self, tmp_path: Utf8PathBuf) { pub fn spawn(&self, tmp_path: Utf8PathBuf) {
// because on shutdown we close and wait, we are misusing TaskTracker a bit. let mut guard = self.0.lock().unwrap();
// let jset = match &mut *guard {
// so first acquire a token, then check if the tracker has been closed. the tracker might get closed BackgroundPurgesInner::Open(ref mut jset) => jset,
// right after, but at least the shutdown will wait for what we are spawning next. BackgroundPurgesInner::ShuttingDown(_) => {
let token = self.0.token(); warn!("trying to spawn background purge during shutdown, ignoring");
if self.0.is_closed() {
warn!(
%tmp_path,
"trying to spawn background purge during shutdown, ignoring"
);
return; return;
} }
let span = info_span!(parent: None, "background_purge", %tmp_path);
let task = move || {
let _token = token;
let _entered = span.entered();
if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
// should we fatal_io_error here?
warn!(%error, "failed to purge tenant directory");
}
}; };
jset.spawn_on(
BACKGROUND_RUNTIME.spawn_blocking(task); async move {
if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
// should we fatal_io_error here?
warn!(%error, path=%tmp_path, "failed to purge tenant directory");
}
}
.instrument(info_span!(parent: None, "background_purge")),
BACKGROUND_RUNTIME.handle(),
);
} }
/// When this future completes, all background purges have completed. /// When this future completes, all background purges have completed.
@@ -273,9 +278,42 @@ impl BackgroundPurges {
/// instances of this future will continue to be correct. /// instances of this future will continue to be correct.
#[instrument(skip_all)] #[instrument(skip_all)]
pub async fn shutdown(&self) { pub async fn shutdown(&self) {
// forbid new tasks (can be called many times) let jset = {
self.0.close(); let mut guard = self.0.lock().unwrap();
self.0.wait().await; match &mut *guard {
BackgroundPurgesInner::Open(jset) => {
*guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
std::mem::take(jset),
)))
}
BackgroundPurgesInner::ShuttingDown(_) => {
// calling shutdown multiple times is most likely a bug in pageserver shutdown code
warn!("already shutting down");
}
};
match &mut *guard {
BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
BackgroundPurgesInner::Open(_) => {
unreachable!("above code transitions into shut down state");
}
}
};
let mut jset = jset.lock().await; // concurrent callers coalesce here
while let Some(res) = jset.join_next().await {
match res {
Ok(()) => {}
Err(e) if e.is_panic() => {
// If it panicked, the error is already logged by the panic hook.
}
Err(e) if e.is_cancelled() => {
unreachable!("we don't cancel the joinset or runtime")
}
Err(e) => {
// No idea when this can happen, but let's log it.
warn!(%e, "background purge task failed or panicked");
}
}
}
} }
} }
@@ -1729,9 +1767,14 @@ impl TenantManager {
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>(); let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
for timeline in timelines.values() { for timeline in timelines.values() {
tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
let layers = timeline.layers.read().await; let timeline_layers = timeline
.layers
.read()
.await
.likely_resident_layers()
.collect::<Vec<_>>();
for layer in layers.likely_resident_layers() { for layer in timeline_layers {
let relative_path = layer let relative_path = layer
.local_path() .local_path()
.strip_prefix(&parent_path) .strip_prefix(&parent_path)
@@ -1928,8 +1971,7 @@ impl TenantManager {
timeline_id: TimelineId, timeline_id: TimelineId,
prepared: PreparedTimelineDetach, prepared: PreparedTimelineDetach,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<HashSet<TimelineId>, anyhow::Error> { ) -> Result<Vec<TimelineId>, anyhow::Error> {
// FIXME: this is unnecessary, slotguard already has these semantics
struct RevertOnDropSlot(Option<SlotGuard>); struct RevertOnDropSlot(Option<SlotGuard>);
impl Drop for RevertOnDropSlot { impl Drop for RevertOnDropSlot {

View File

@@ -800,123 +800,6 @@ impl RemoteTimelineClient {
.context("wait completion") .context("wait completion")
} }
/// Adds a gc blocking reason for this timeline if one does not exist already.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_insert_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
drop(guard);
panic!("cannot start detach ancestor if there is nothing to detach from");
}
}
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
// Usual case: !wanted(x) && !wanted(y)
//
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
// turn on and off some reason.
(x, y) => {
if !wanted(x) && wanted(y) {
// this could be avoided by having external in-memory synchronization, like
// timeline detach ancestor
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
}
// at this point, the metadata must always show that there is a parent
upload_queue.dirty.gc_blocking = current
.map(|x| x.with_reason(reason))
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Removes a gc blocking reason for this timeline if one exists.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_remove_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if !upload_queue
.clean
.0
.lineage
.is_detached_from_original_ancestor()
{
drop(guard);
panic!("cannot complete timeline_ancestor_detach while not detached");
}
}
let wanted = |x: Option<&index::GcBlocking>| {
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
};
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
(x, y) => {
if !wanted(x) && wanted(y) {
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
}
upload_queue.dirty.gc_blocking =
current.as_ref().and_then(|x| x.without_reason(reason));
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
// FIXME: bogus ?
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Launch an upload operation in the background; the file is added to be included in next /// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload. /// `index_part.json` upload.
pub(crate) fn schedule_layer_file_upload( pub(crate) fn schedule_layer_file_upload(

View File

@@ -60,9 +60,6 @@ pub struct IndexPart {
#[serde(default)] #[serde(default)]
pub(crate) lineage: Lineage, pub(crate) lineage: Lineage,
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) gc_blocking: Option<GcBlocking>,
/// Describes the kind of aux files stored in the timeline. /// Describes the kind of aux files stored in the timeline.
/// ///
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -88,11 +85,10 @@ impl IndexPart {
/// - 6: last_aux_file_policy is added. /// - 6: last_aux_file_policy is added.
/// - 7: metadata_bytes is no longer written, but still read /// - 7: metadata_bytes is no longer written, but still read
/// - 8: added `archived_at` /// - 8: added `archived_at`
/// - 9: +gc_blocking const LATEST_VERSION: usize = 8;
const LATEST_VERSION: usize = 9;
// Versions we may see when reading from a bucket. // Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
pub const FILE_NAME: &'static str = "index_part.json"; pub const FILE_NAME: &'static str = "index_part.json";
@@ -105,7 +101,6 @@ impl IndexPart {
deleted_at: None, deleted_at: None,
archived_at: None, archived_at: None,
lineage: Default::default(), lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
} }
} }
@@ -256,64 +251,6 @@ impl Lineage {
} }
} }
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct GcBlocking {
pub(crate) started_at: NaiveDateTime,
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
}
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
#[enumset(serialize_repr = "list")]
pub(crate) enum GcBlockingReason {
Manual,
DetachAncestor,
}
impl GcBlocking {
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
GcBlocking {
started_at: chrono::Utc::now().naive_utc(),
reasons: enumset::EnumSet::only(reason),
}
}
/// Returns true if the given reason is one of the reasons why the gc is blocked.
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
self.reasons.contains(reason)
}
/// Returns a version of self with the given reason.
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
assert!(!self.blocked_by(reason));
let mut reasons = self.reasons;
reasons.insert(reason);
Self {
started_at: self.started_at,
reasons,
}
}
/// Returns a version of self without the given reason. Assumption is that if
/// there are no more reasons, we can unblock the gc by returning `None`.
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
assert!(self.blocked_by(reason));
if self.reasons.len() == 1 {
None
} else {
let mut reasons = self.reasons;
assert!(reasons.remove(reason));
assert!(!reasons.is_empty());
Some(Self {
started_at: self.started_at,
reasons,
})
}
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@@ -355,7 +292,6 @@ mod tests {
deleted_at: None, deleted_at: None,
archived_at: None, archived_at: None,
lineage: Lineage::default(), lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
}; };
@@ -399,7 +335,6 @@ mod tests {
deleted_at: None, deleted_at: None,
archived_at: None, archived_at: None,
lineage: Lineage::default(), lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
}; };
@@ -444,7 +379,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None, archived_at: None,
lineage: Lineage::default(), lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
}; };
@@ -492,7 +426,6 @@ mod tests {
deleted_at: None, deleted_at: None,
archived_at: None, archived_at: None,
lineage: Lineage::default(), lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
}; };
@@ -535,7 +468,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None, archived_at: None,
lineage: Lineage::default(), lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
}; };
@@ -581,7 +513,6 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
}, },
gc_blocking: None,
last_aux_file_policy: None, last_aux_file_policy: None,
}; };
@@ -632,7 +563,6 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
}, },
gc_blocking: None,
last_aux_file_policy: Some(AuxFilePolicy::V2), last_aux_file_policy: Some(AuxFilePolicy::V2),
}; };
@@ -688,7 +618,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None, archived_at: None,
lineage: Default::default(), lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(), last_aux_file_policy: Default::default(),
}; };
@@ -745,7 +674,6 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
lineage: Default::default(), lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(), last_aux_file_policy: Default::default(),
}; };
@@ -753,68 +681,6 @@ mod tests {
assert_eq!(part, expected); assert_eq!(part, expected);
} }
#[test]
fn v9_indexpart_is_parsed() {
let example = r#"{
"version": 9,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"gc_blocking": {
"started_at": "2024-07-19T09:00:00.123",
"reasons": ["DetachAncestor"]
}
}"#;
let expected = IndexPart {
version: 9,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
gc_blocking: Some(GcBlocking {
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
}),
last_aux_file_policy: Default::default(),
archived_at: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime { fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
} }

View File

@@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument}; use tracing::{info_span, instrument, warn, Instrument};
use utils::{ use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId, pausable_failpoint, serde_system_time, id::TimelineId, serde_system_time,
}; };
use super::{ use super::{
@@ -1146,14 +1146,12 @@ impl<'a> TenantDownloader<'a> {
layer: HeatMapLayer, layer: HeatMapLayer,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<Option<HeatMapLayer>, UpdateError> { ) -> Result<Option<HeatMapLayer>, UpdateError> {
// Failpoints for simulating slow remote storage // Failpoint for simulating slow remote storage
failpoint_support::sleep_millis_async!( failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep", "secondary-layer-download-sleep",
&self.secondary_state.cancel &self.secondary_state.cancel
); );
pausable_failpoint!("secondary-layer-download-pausable");
let local_path = local_layer_path( let local_path = local_layer_path(
self.conf, self.conf,
tenant_shard_id, tenant_shard_id,

View File

@@ -8,9 +8,6 @@ mod layer_desc;
mod layer_name; mod layer_name;
pub mod merge_iterator; pub mod merge_iterator;
#[cfg(test)]
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext}; use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value; use crate::repository::Value;
use crate::walrecord::NeonWalRecord; use crate::walrecord::NeonWalRecord;
@@ -435,6 +432,21 @@ impl ReadableLayer {
} }
} }
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing,
}
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather /// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility /// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should /// of layers (for example when creating a branch that makes some previously covered layers visible). It should
@@ -539,25 +551,19 @@ impl LayerAccessStats {
self.record_residence_event_at(SystemTime::now()) self.record_residence_event_at(SystemTime::now())
} }
fn record_access_at(&self, now: SystemTime) -> bool { pub(crate) fn record_access_at(&self, now: SystemTime) {
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now); let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
// A layer which is accessed must be visible. // A layer which is accessed must be visible.
mask |= 0x1 << Self::VISIBILITY_SHIFT; mask |= 0x1 << Self::VISIBILITY_SHIFT;
value |= 0x1 << Self::VISIBILITY_SHIFT; value |= 0x1 << Self::VISIBILITY_SHIFT;
let old_bits = self.write_bits(mask, value); self.write_bits(mask, value);
!matches!(
self.decode_visibility(old_bits),
LayerVisibilityHint::Visible
)
} }
/// Returns true if we modified the layer's visibility to set it to Visible implicitly pub(crate) fn record_access(&self, ctx: &RequestContext) {
/// as a result of this access
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return false; return;
} }
self.record_access_at(SystemTime::now()) self.record_access_at(SystemTime::now())

View File

@@ -36,12 +36,13 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{ use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
}; };
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError; use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{ use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner, VectoredReadPlanner,
}; };
use crate::tenant::PageReconstructError; use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile}; use crate::virtual_file::{self, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -71,7 +72,10 @@ use utils::{
lsn::Lsn, lsn::Lsn,
}; };
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; use super::{
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
ValuesReconstructState,
};
/// ///
/// Header stored in the beginning of the file /// Header stored in the beginning of the file
@@ -196,6 +200,7 @@ impl DeltaKey {
pub struct DeltaLayer { pub struct DeltaLayer {
path: Utf8PathBuf, path: Utf8PathBuf,
pub desc: PersistentLayerDesc, pub desc: PersistentLayerDesc,
access_stats: LayerAccessStats,
inner: OnceCell<Arc<DeltaLayerInner>>, inner: OnceCell<Arc<DeltaLayerInner>>,
} }
@@ -294,6 +299,7 @@ impl DeltaLayer {
/// not loaded already. /// not loaded already.
/// ///
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> { async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(ctx);
// Quick exit if already loaded // Quick exit if already loaded
self.inner self.inner
.get_or_try_init(|| self.load_inner(ctx)) .get_or_try_init(|| self.load_inner(ctx))
@@ -344,6 +350,7 @@ impl DeltaLayer {
summary.lsn_range, summary.lsn_range,
metadata.len(), metadata.len(),
), ),
access_stats: Default::default(),
inner: OnceCell::new(), inner: OnceCell::new(),
}) })
} }
@@ -366,6 +373,7 @@ impl DeltaLayer {
/// 3. Call `finish`. /// 3. Call `finish`.
/// ///
struct DeltaLayerWriterInner { struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
pub path: Utf8PathBuf, pub path: Utf8PathBuf,
timeline_id: TimelineId, timeline_id: TimelineId,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
@@ -376,9 +384,6 @@ struct DeltaLayerWriterInner {
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>, tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
blob_writer: BlobWriter<true>, blob_writer: BlobWriter<true>,
// Number of key-lsns in the layer.
num_keys: usize,
} }
impl DeltaLayerWriterInner { impl DeltaLayerWriterInner {
@@ -412,6 +417,7 @@ impl DeltaLayerWriterInner {
let tree_builder = DiskBtreeBuilder::new(block_buf); let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(Self { Ok(Self {
conf,
path, path,
timeline_id, timeline_id,
tenant_shard_id, tenant_shard_id,
@@ -419,7 +425,6 @@ impl DeltaLayerWriterInner {
lsn_range, lsn_range,
tree: tree_builder, tree: tree_builder,
blob_writer, blob_writer,
num_keys: 0,
}) })
} }
@@ -470,9 +475,6 @@ impl DeltaLayerWriterInner {
let delta_key = DeltaKey::from_key_lsn(&key, lsn); let delta_key = DeltaKey::from_key_lsn(&key, lsn);
let res = self.tree.append(&delta_key.0, blob_ref.0); let res = self.tree.append(&delta_key.0, blob_ref.0);
self.num_keys += 1;
(val, res.map_err(|e| anyhow::anyhow!(e))) (val, res.map_err(|e| anyhow::anyhow!(e)))
} }
@@ -486,10 +488,11 @@ impl DeltaLayerWriterInner {
async fn finish( async fn finish(
self, self,
key_end: Key, key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { ) -> anyhow::Result<ResidentLayer> {
let temp_path = self.path.clone(); let temp_path = self.path.clone();
let result = self.finish0(key_end, ctx).await; let result = self.finish0(key_end, timeline, ctx).await;
if result.is_err() { if result.is_err() {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
if let Err(e) = std::fs::remove_file(&temp_path) { if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -502,8 +505,9 @@ impl DeltaLayerWriterInner {
async fn finish0( async fn finish0(
self, self,
key_end: Key, key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { ) -> anyhow::Result<ResidentLayer> {
let index_start_blk = let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -568,9 +572,11 @@ impl DeltaLayerWriterInner {
// fsync the file // fsync the file
file.sync_all().await?; file.sync_all().await?;
trace!("created delta layer {}", self.path); let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
Ok((desc, self.path)) trace!("created delta layer {}", layer.local_path());
Ok(layer)
} }
} }
@@ -671,20 +677,14 @@ impl DeltaLayerWriter {
pub(crate) async fn finish( pub(crate) async fn finish(
mut self, mut self,
key_end: Key, key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { ) -> anyhow::Result<ResidentLayer> {
self.inner.take().unwrap().finish(key_end, ctx).await self.inner
} .take()
.unwrap()
#[cfg(test)] .finish(key_end, timeline, ctx)
pub(crate) fn num_keys(&self) -> usize { .await
self.inner.as_ref().unwrap().num_keys
}
#[cfg(test)]
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
} }
} }
@@ -808,6 +808,95 @@ impl DeltaLayerInner {
}) })
} }
/// Look up a single `key` in this delta layer and collect whatever is needed
/// to reconstruct its page image within `lsn_range`.
///
/// Walks the on-disk b-tree backwards starting at `lsn_range.end - 1`,
/// collecting blob offsets for this key, then reads each blob and pushes WAL
/// records into `reconstruct_state` until a page image or a will-init record
/// is found (or the range start is passed).
///
/// Returns `Complete` when no older data is needed, `Continue` when the
/// caller must keep searching older layers for a base image.
pub(super) async fn get_value_reconstruct_data(
    &self,
    key: Key,
    lsn_range: Range<Lsn>,
    reconstruct_state: &mut ValueReconstructState,
    ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
    // Until we see an image or will-init record, an older layer is needed.
    let mut need_image = true;

    // Scan the page versions backwards, starting from `lsn`.
    let block_reader = FileBlockReader::new(&self.file, self.file_id);
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        self.index_start_blk,
        self.index_root_blk,
        &block_reader,
    );
    // `lsn_range.end` is exclusive, so start the backwards visit at end - 1.
    let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));

    // (lsn, blob position) pairs for this key, newest first.
    let mut offsets: Vec<(Lsn, u64)> = Vec::new();

    tree_reader
        .visit(
            &search_key.0,
            VisitDirection::Backwards,
            |key, value| {
                let blob_ref = BlobRef(value);
                // Stop as soon as the b-tree key no longer matches our page key.
                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
                    return false;
                }
                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
                // Stop once we walk past the start of the requested LSN range.
                if entry_lsn < lsn_range.start {
                    return false;
                }
                offsets.push((entry_lsn, blob_ref.pos()));

                // A will-init blob makes older entries irrelevant; returning
                // false ends the backwards visit early.
                !blob_ref.will_init()
            },
            &RequestContextBuilder::extend(ctx)
                .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
                .build(),
        )
        .await?;

    let ctx = &RequestContextBuilder::extend(ctx)
        .page_content_kind(PageContentKind::DeltaLayerValue)
        .build();

    // Ok, 'offsets' now contains the offsets of all the entries we need to read
    let cursor = block_reader.block_cursor();
    // Single reusable buffer for all blob reads in this lookup.
    let mut buf = Vec::new();
    for (entry_lsn, pos) in offsets {
        cursor
            .read_blob_into_buf(pos, &mut buf, ctx)
            .await
            .with_context(|| {
                format!("Failed to read blob from virtual file {}", self.file.path)
            })?;
        let val = Value::des(&buf).with_context(|| {
            format!(
                "Failed to deserialize file blob from virtual file {}",
                self.file.path
            )
        })?;
        match val {
            Value::Image(img) => {
                // Found a full page image: reconstruction is complete.
                reconstruct_state.img = Some((entry_lsn, img));
                need_image = false;
                break;
            }
            Value::WalRecord(rec) => {
                let will_init = rec.will_init();
                reconstruct_state.records.push((entry_lsn, rec));
                if will_init {
                    // This WAL record initializes the page, so no need to go further back
                    need_image = false;
                    break;
                }
            }
        }
    }

    // If an older page image is needed to reconstruct the page, let the
    // caller know.
    if need_image {
        Ok(ValueReconstructResult::Continue)
    } else {
        Ok(ValueReconstructResult::Complete)
    }
}
// Look up the keys in the provided keyspace and update // Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found. // the reconstruct state with whatever is found.
// //
@@ -1580,9 +1669,8 @@ pub(crate) mod test {
use super::*; use super::*;
use crate::repository::Value; use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID; use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::{Tenant, Timeline}; use crate::tenant::Tenant;
use crate::{ use crate::{
context::DownloadBehavior, context::DownloadBehavior,
task_mgr::TaskKind, task_mgr::TaskKind,
@@ -1876,8 +1964,9 @@ pub(crate) mod test {
res?; res?;
} }
let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?; let resident = writer
let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?; .finish(entries_meta.key_range.end, &timeline, &ctx)
.await?;
let inner = resident.get_as_delta(&ctx).await?; let inner = resident.get_as_delta(&ctx).await?;
@@ -1957,7 +2046,6 @@ pub(crate) mod test {
.await .await
.likely_resident_layers() .likely_resident_layers()
.next() .next()
.cloned()
.unwrap(); .unwrap();
{ {
@@ -2032,8 +2120,7 @@ pub(crate) mod test {
.read() .read()
.await .await
.likely_resident_layers() .likely_resident_layers()
.find(|&x| x != &initdb_layer) .find(|x| x != &initdb_layer)
.cloned()
.unwrap(); .unwrap();
// create a copy for the timeline, so we don't overwrite the file // create a copy for the timeline, so we don't overwrite the file
@@ -2068,8 +2155,7 @@ pub(crate) mod test {
.await .await
.unwrap(); .unwrap();
let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap(); let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
copied_layer.get_as_delta(ctx).await.unwrap(); copied_layer.get_as_delta(ctx).await.unwrap();
@@ -2197,9 +2283,7 @@ pub(crate) mod test {
for (key, lsn, value) in deltas { for (key, lsn, value) in deltas {
writer.put_value(key, lsn, value, ctx).await?; writer.put_value(key, lsn, value, ctx).await?;
} }
let delta_layer = writer.finish(key_end, tline, ctx).await?;
let (desc, path) = writer.finish(key_end, ctx).await?;
let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
Ok::<_, anyhow::Error>(delta_layer) Ok::<_, anyhow::Error>(delta_layer)
} }

View File

@@ -32,6 +32,9 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{ use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
}; };
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError; use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{ use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -134,6 +137,7 @@ pub struct ImageLayer {
pub desc: PersistentLayerDesc, pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn, pub lsn: Lsn,
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>, inner: OnceCell<ImageLayerInner>,
} }
@@ -251,6 +255,7 @@ impl ImageLayer {
/// not loaded already. /// not loaded already.
/// ///
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> { async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats.record_access(ctx);
self.inner self.inner
.get_or_try_init(|| self.load_inner(ctx)) .get_or_try_init(|| self.load_inner(ctx))
.await .await
@@ -301,6 +306,7 @@ impl ImageLayer {
metadata.len(), metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future. ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn, lsn: summary.lsn,
access_stats: Default::default(),
inner: OnceCell::new(), inner: OnceCell::new(),
}) })
} }
@@ -423,6 +429,46 @@ impl ImageLayerInner {
}) })
} }
/// Look up `key` in this image layer and, if present, store its page image
/// (at this layer's LSN) into `reconstruct_state`.
///
/// An image layer holds exactly one version per key, so the result is either
/// `Complete` (image found) or `Missing` (key not in this layer); `Continue`
/// is never returned here.
pub(super) async fn get_value_reconstruct_data(
    &self,
    key: Key,
    reconstruct_state: &mut ValueReconstructState,
    ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
    let block_reader = FileBlockReader::new(&self.file, self.file_id);
    let tree_reader =
        DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);

    // Serialize the key into the fixed-size byte form used by the b-tree index.
    let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
    key.write_to_byte_slice(&mut keybuf);
    if let Some(offset) = tree_reader
        .get(
            &keybuf,
            &RequestContextBuilder::extend(ctx)
                .page_content_kind(PageContentKind::ImageLayerBtreeNode)
                .build(),
        )
        .await?
    {
        // Index hit: read the image blob at the recorded file offset.
        let blob = block_reader
            .block_cursor()
            .read_blob(
                offset,
                &RequestContextBuilder::extend(ctx)
                    .page_content_kind(PageContentKind::ImageLayerValue)
                    .build(),
            )
            .await
            .with_context(|| format!("failed to read value from offset {}", offset))?;
        let value = Bytes::from(blob);
        // The image is stamped with the layer's LSN (all pages share it).
        reconstruct_state.img = Some((self.lsn, value));
        Ok(ValueReconstructResult::Complete)
    } else {
        Ok(ValueReconstructResult::Missing)
    }
}
// Look up the keys in the provided keyspace and update // Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found. // the reconstruct state with whatever is found.
pub(super) async fn get_values_reconstruct_data( pub(super) async fn get_values_reconstruct_data(
@@ -696,21 +742,11 @@ struct ImageLayerWriterInner {
// where we have chosen their compressed form // where we have chosen their compressed form
uncompressed_bytes_chosen: u64, uncompressed_bytes_chosen: u64,
// Number of keys in the layer.
num_keys: usize,
blob_writer: BlobWriter<false>, blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>, tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
#[cfg_attr(not(feature = "testing"), allow(dead_code))]
last_written_key: Key,
} }
impl ImageLayerWriterInner { impl ImageLayerWriterInner {
fn size(&self) -> u64 {
self.tree.borrow_writer().size() + self.blob_writer.size()
}
/// ///
/// Start building a new image layer. /// Start building a new image layer.
/// ///
@@ -764,8 +800,6 @@ impl ImageLayerWriterInner {
uncompressed_bytes: 0, uncompressed_bytes: 0,
uncompressed_bytes_eligible: 0, uncompressed_bytes_eligible: 0,
uncompressed_bytes_chosen: 0, uncompressed_bytes_chosen: 0,
num_keys: 0,
last_written_key: Key::MIN,
}; };
Ok(writer) Ok(writer)
@@ -786,7 +820,6 @@ impl ImageLayerWriterInner {
let compression = self.conf.image_compression; let compression = self.conf.image_compression;
let uncompressed_len = img.len() as u64; let uncompressed_len = img.len() as u64;
self.uncompressed_bytes += uncompressed_len; self.uncompressed_bytes += uncompressed_len;
self.num_keys += 1;
let (_img, res) = self let (_img, res) = self
.blob_writer .blob_writer
.write_blob_maybe_compressed(img, ctx, compression) .write_blob_maybe_compressed(img, ctx, compression)
@@ -806,11 +839,6 @@ impl ImageLayerWriterInner {
key.write_to_byte_slice(&mut keybuf); key.write_to_byte_slice(&mut keybuf);
self.tree.append(&keybuf, off)?; self.tree.append(&keybuf, off)?;
#[cfg(feature = "testing")]
{
self.last_written_key = key;
}
Ok(()) Ok(())
} }
@@ -821,7 +849,6 @@ impl ImageLayerWriterInner {
self, self,
timeline: &Arc<Timeline>, timeline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
end_key: Option<Key>,
) -> anyhow::Result<ResidentLayer> { ) -> anyhow::Result<ResidentLayer> {
let index_start_blk = let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -872,23 +899,11 @@ impl ImageLayerWriterInner {
let desc = PersistentLayerDesc::new_img( let desc = PersistentLayerDesc::new_img(
self.tenant_shard_id, self.tenant_shard_id,
self.timeline_id, self.timeline_id,
if let Some(end_key) = end_key { self.key_range.clone(),
self.key_range.start..end_key
} else {
self.key_range.clone()
},
self.lsn, self.lsn,
metadata.len(), metadata.len(),
); );
#[cfg(feature = "testing")]
if let Some(end_key) = end_key {
assert!(
self.last_written_key < end_key,
"written key violates end_key range"
);
}
// Note: Because we open the file in write-only mode, we cannot // Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't // reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it. // set inner.file here. The first read will have to re-open it.
@@ -965,18 +980,6 @@ impl ImageLayerWriter {
self.inner.as_mut().unwrap().put_image(key, img, ctx).await self.inner.as_mut().unwrap().put_image(key, img, ctx).await
} }
#[cfg(test)]
/// Estimated size of the image layer.
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
/// ///
/// Finish writing the image layer. /// Finish writing the image layer.
/// ///
@@ -985,26 +988,7 @@ impl ImageLayerWriter {
timeline: &Arc<Timeline>, timeline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> { ) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline, ctx, None).await self.inner.take().unwrap().finish(timeline, ctx).await
}
#[cfg(test)]
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
timeline: &Arc<Timeline>,
end_key: Key,
ctx: &RequestContext,
) -> anyhow::Result<super::ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(timeline, ctx, Some(end_key))
.await
}
pub(crate) fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
} }
} }

View File

@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value}; use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError; use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError; use crate::tenant::{PageReconstructError, Timeline};
use crate::{l0_flush, page_cache, walrecord}; use crate::{l0_flush, page_cache, walrecord};
use anyhow::{anyhow, Result}; use anyhow::{anyhow, ensure, Result};
use camino::Utf8PathBuf;
use pageserver_api::keyspace::KeySpace; use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId; use pageserver_api::shard::TenantShardId;
@@ -34,7 +34,8 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::{RwLock, RwLockWriteGuard}; use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{ use super::{
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
ValuesReconstructState,
}; };
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
@@ -54,6 +55,9 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`. /// Writes are only allowed when this is `None`.
pub(crate) end_lsn: OnceLock<Lsn>, pub(crate) end_lsn: OnceLock<Lsn>,
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
local_path_str: Arc<str>,
/// Used for traversal path. Cached representation of the in-memory layer after frozen. /// Used for traversal path. Cached representation of the in-memory layer after frozen.
frozen_local_path_str: OnceLock<Arc<str>>, frozen_local_path_str: OnceLock<Arc<str>>,
@@ -244,6 +248,12 @@ impl InMemoryLayer {
self.start_lsn..self.end_lsn_or_max() self.start_lsn..self.end_lsn_or_max()
} }
pub(crate) fn local_path_str(&self) -> &Arc<str> {
self.frozen_local_path_str
.get()
.unwrap_or(&self.local_path_str)
}
/// debugging function to print out the contents of the layer /// debugging function to print out the contents of the layer
/// ///
/// this is likely completely unused /// this is likely completely unused
@@ -293,6 +303,60 @@ impl InMemoryLayer {
Ok(()) Ok(())
} }
/// Look up given value in the layer.
///
/// Scans this in-memory layer's index for `key`, walking the buffered
/// versions within `lsn_range` from newest to oldest, pushing WAL records
/// into `reconstruct_state` until an image or will-init record is found.
///
/// Returns `Complete` when reconstruction needs nothing older, `Continue`
/// when the caller must consult older (on-disk) layers.
pub(crate) async fn get_value_reconstruct_data(
    &self,
    key: Key,
    lsn_range: Range<Lsn>,
    reconstruct_state: &mut ValueReconstructState,
    ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
    // This layer only covers LSNs >= start_lsn; a lower request is a caller bug.
    ensure!(lsn_range.start >= self.start_lsn);
    let mut need_image = true;

    let ctx = RequestContextBuilder::extend(ctx)
        .page_content_kind(PageContentKind::InMemoryLayer)
        .build();

    // Read lock: the layer may still be open for writes elsewhere.
    let inner = self.inner.read().await;

    let reader = inner.file.block_cursor();

    // Scan the page versions backwards, starting from `lsn`.
    if let Some(vec_map) = inner.index.get(&key) {
        let slice = vec_map.slice_range(lsn_range);
        // Iterate newest-to-oldest over the versions in range.
        for (entry_lsn, pos) in slice.iter().rev() {
            let buf = reader.read_blob(*pos, &ctx).await?;
            let value = Value::des(&buf)?;
            match value {
                Value::Image(img) => {
                    // Full page image: nothing older is needed.
                    reconstruct_state.img = Some((*entry_lsn, img));
                    return Ok(ValueReconstructResult::Complete);
                }
                Value::WalRecord(rec) => {
                    let will_init = rec.will_init();
                    reconstruct_state.records.push((*entry_lsn, rec));
                    if will_init {
                        // This WAL record initializes the page, so no need to go further back
                        need_image = false;
                        break;
                    }
                }
            }
        }
    }

    // release lock on 'inner'

    // If an older page image is needed to reconstruct the page, let the
    // caller know.
    if need_image {
        Ok(ValueReconstructResult::Continue)
    } else {
        Ok(ValueReconstructResult::Complete)
    }
}
// Look up the keys in the provided keyspace and update // Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found. // the reconstruct state with whatever is found.
// //
@@ -385,17 +449,20 @@ impl InMemoryLayer {
timeline_id: TimelineId, timeline_id: TimelineId,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
start_lsn: Lsn, start_lsn: Lsn,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<InMemoryLayer> { ) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id()); let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer { Ok(InMemoryLayer {
file_id: key, file_id: key,
local_path_str: {
let mut buf = String::new();
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
buf.into()
},
frozen_local_path_str: OnceLock::new(), frozen_local_path_str: OnceLock::new(),
conf, conf,
timeline_id, timeline_id,
@@ -415,7 +482,8 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree /// Adds the page version to the in-memory tree
pub async fn put_value(
pub(crate) async fn put_value(
&self, &self,
key: Key, key: Key,
lsn: Lsn, lsn: Lsn,
@@ -480,6 +548,8 @@ impl InMemoryLayer {
/// Records the end_lsn for non-dropped layers. /// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive /// `end_lsn` is exclusive
pub async fn freeze(&self, end_lsn: Lsn) { pub async fn freeze(&self, end_lsn: Lsn) {
let inner = self.inner.write().await;
assert!( assert!(
self.start_lsn < end_lsn, self.start_lsn < end_lsn,
"{} >= {}", "{} >= {}",
@@ -497,28 +567,24 @@ impl InMemoryLayer {
}) })
.expect("frozen_local_path_str set only once"); .expect("frozen_local_path_str set only once");
#[cfg(debug_assertions)]
{
let inner = self.inner.write().await;
for vec_map in inner.index.values() { for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() { for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn); assert!(*lsn < end_lsn);
} }
} }
} }
}
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
/// layer will only contain the key range the user specifies, and may return `None` /// layer will only contain the key range the user specifies, and may return `None`
/// if there are no matching keys. /// if there are no matching keys.
/// ///
/// Returns a new delta layer with all the same data as this in-memory layer /// Returns a new delta layer with all the same data as this in-memory layer
pub async fn write_to_disk( pub(crate) async fn write_to_disk(
&self, &self,
timeline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
key_range: Option<Range<Key>>, key_range: Option<Range<Key>>,
l0_flush_global_state: &l0_flush::Inner, ) -> Result<Option<ResidentLayer>> {
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this // Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the // layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception // write lock on it, so we shouldn't block anyone. There's one exception
@@ -530,8 +596,9 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now. // rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await; let inner = self.inner.read().await;
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
use l0_flush::Inner; use l0_flush::Inner;
let _concurrency_permit = match l0_flush_global_state { let _concurrency_permit = match &*l0_flush_global_state {
Inner::PageCached => None, Inner::PageCached => None,
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
}; };
@@ -561,7 +628,7 @@ impl InMemoryLayer {
) )
.await?; .await?;
match l0_flush_global_state { match &*l0_flush_global_state {
l0_flush::Inner::PageCached => { l0_flush::Inner::PageCached => {
let ctx = RequestContextBuilder::extend(ctx) let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer) .page_content_kind(PageContentKind::InMemoryLayer)
@@ -626,7 +693,7 @@ impl InMemoryLayer {
} }
// MAX is used here because we identify L0 layers by full key range // MAX is used here because we identify L0 layers by full key range
let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?; let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``. // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
// //
@@ -638,6 +705,6 @@ impl InMemoryLayer {
// we dirtied when writing to the filesystem have been flushed and marked !dirty. // we dirtied when writing to the filesystem have been flushed and marked !dirty.
drop(_concurrency_permit); drop(_concurrency_permit);
Ok(Some((desc, path))) Ok(Some(delta_layer))
} }
} }

View File

@@ -24,7 +24,8 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self}; use super::image_layer::{self};
use super::{ use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
ValuesReconstructState,
}; };
use utils::generation::Generation; use utils::generation::Generation;
@@ -300,6 +301,42 @@ impl Layer {
self.0.delete_on_drop(); self.0.delete_on_drop();
} }
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from the previous layer and
/// perform WAL redo, if necessary.
///
/// Ensures the layer is resident (downloading it if needed), records the
/// access for eviction/visibility accounting, validates that `key` and
/// `lsn_range` fall within this layer's advertised bounds, then delegates
/// to the concrete delta/image implementation.
///
/// # Cancellation-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_value_reconstruct_data(
    &self,
    key: Key,
    lsn_range: Range<Lsn>,
    reconstruct_data: &mut ValueReconstructState,
    ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
    use anyhow::ensure;

    // `true`: permit downloading the layer file if it is not resident.
    let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
    self.0.access_stats.record_access(ctx);

    if self.layer_desc().is_delta {
        // Delta layers: request must not start before the layer's LSN range.
        ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
        ensure!(self.layer_desc().key_range.contains(&key));
    } else {
        // Image layers: the single-image LSN must lie within the request.
        ensure!(self.layer_desc().key_range.contains(&key));
        ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
        ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
    }

    layer
        .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
        .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
        .await
        .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
pub(crate) async fn get_values_reconstruct_data( pub(crate) async fn get_values_reconstruct_data(
&self, &self,
keyspace: KeySpace, keyspace: KeySpace,
@@ -316,7 +353,7 @@ impl Layer {
other => GetVectoredError::Other(anyhow::anyhow!(other)), other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?; })?;
self.record_access(ctx); self.0.access_stats.record_access(ctx);
layer layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -396,18 +433,18 @@ impl Layer {
self.0.info(reset) self.0.info(reset)
} }
pub(crate) fn latest_activity(&self) -> SystemTime { pub(crate) fn access_stats(&self) -> &LayerAccessStats {
self.0.access_stats.latest_activity() &self.0.access_stats
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
self.0.access_stats.visibility()
} }
pub(crate) fn local_path(&self) -> &Utf8Path { pub(crate) fn local_path(&self) -> &Utf8Path {
&self.0.path &self.0.path
} }
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata { pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.0.metadata() self.0.metadata()
} }
@@ -451,31 +488,13 @@ impl Layer {
} }
} }
/// Record an access on this layer's stats; if that access flipped the layer
/// from covered to visible, log it and add the layer's size to the timeline's
/// visible-physical-size metric.
fn record_access(&self, ctx: &RequestContext) {
    // `record_access` returns true only when visibility changed to Visible.
    if !self.0.access_stats.record_access(ctx) {
        return;
    }

    tracing::info!(
        "Layer {} became visible as a result of access",
        self.0.desc.key()
    );

    // Account the newly-visible bytes on the owning timeline, if it's alive.
    if let Some(tl) = self.0.timeline.upgrade() {
        tl.metrics
            .visible_physical_size_gauge
            .add(self.0.desc.file_size)
    }
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) { pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let old_visibility = self.0.access_stats.set_visibility(visibility.clone()); let old_visibility = self.access_stats().set_visibility(visibility.clone());
use LayerVisibilityHint::*; use LayerVisibilityHint::*;
match (old_visibility, visibility) { match (old_visibility, visibility) {
(Visible, Covered) => { (Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric // Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() { if let Some(tl) = self.0.timeline.upgrade() {
debug_assert!(
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
);
tl.metrics tl.metrics
.visible_physical_size_gauge .visible_physical_size_gauge
.sub(self.0.desc.file_size) .sub(self.0.desc.file_size)
@@ -500,7 +519,7 @@ impl Layer {
/// ///
/// However when we want something evicted, we cannot evict it right away as there might be current /// However when we want something evicted, we cannot evict it right away as there might be current
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet /// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
/// read with [`Layer::get_values_reconstruct_data`]. /// read with [`Layer::get_value_reconstruct_data`].
/// ///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)] #[derive(Debug)]
@@ -581,6 +600,9 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore. /// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf, path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
desc: PersistentLayerDesc, desc: PersistentLayerDesc,
/// Timeline access is needed for remote timeline client and metrics. /// Timeline access is needed for remote timeline client and metrics.
@@ -693,9 +715,6 @@ impl Drop for LayerInner {
} }
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
debug_assert!(
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
);
timeline timeline
.metrics .metrics
.visible_physical_size_gauge .visible_physical_size_gauge
@@ -817,6 +836,9 @@ impl LayerInner {
LayerInner { LayerInner {
conf, conf,
debug_str: {
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
},
path: local_path, path: local_path,
desc, desc,
timeline: Arc::downgrade(timeline), timeline: Arc::downgrade(timeline),
@@ -1737,6 +1759,28 @@ impl DownloadedLayer {
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
} }
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use LayerKind::*;
match self.get(owner, ctx).await? {
Delta(d) => {
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
.await
}
Image(i) => {
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
.await
}
}
}
async fn get_values_reconstruct_data( async fn get_values_reconstruct_data(
&self, &self,
keyspace: KeySpace, keyspace: KeySpace,
@@ -1835,7 +1879,7 @@ impl ResidentLayer {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a // this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held. // while it's being held.
self.owner.record_access(ctx); owner.access_stats.record_access(ctx);
delta_layer::DeltaLayerInner::load_keys(d, ctx) delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await .await

View File

@@ -39,7 +39,7 @@ async fn smoke_test() {
let layer = { let layer = {
let mut layers = { let mut layers = {
let layers = timeline.layers.read().await; let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>() layers.likely_resident_layers().collect::<Vec<_>>()
}; };
assert_eq!(layers.len(), 1); assert_eq!(layers.len(), 1);
@@ -50,26 +50,13 @@ async fn smoke_test() {
// all layers created at pageserver are like `layer`, initialized with strong // all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>. // Arc<DownloadedLayer>.
let controlfile_keyspace = KeySpace {
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
};
let img_before = { let img_before = {
let mut data = ValuesReconstructState::default(); let mut data = ValueReconstructState::default();
layer layer
.get_values_reconstruct_data( .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.await .await
.unwrap(); .unwrap();
data.keys data.img
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take() .take()
.expect("tenant harness writes the control file") .expect("tenant harness writes the control file")
}; };
@@ -87,24 +74,13 @@ async fn smoke_test() {
// on accesses when the layer is evicted, it will automatically be downloaded. // on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = { let img_after = {
let mut data = ValuesReconstructState::default(); let mut data = ValueReconstructState::default();
layer layer
.get_values_reconstruct_data( .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.instrument(download_span.clone()) .instrument(download_span.clone())
.await .await
.unwrap(); .unwrap();
data.keys data.img.take().unwrap()
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take()
.expect("tenant harness writes the control file")
}; };
assert_eq!(img_before, img_after); assert_eq!(img_before, img_after);
@@ -176,7 +152,7 @@ async fn smoke_test() {
{ {
let layers = &[layer]; let layers = &[layer];
let mut g = timeline.layers.write().await; let mut g = timeline.layers.write().await;
g.open_mut().unwrap().finish_gc_timeline(layers); g.finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes // this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap(); rtc.schedule_gc_update(layers).unwrap();
} }
@@ -216,7 +192,7 @@ async fn evict_and_wait_on_wanted_deleted() {
let layer = { let layer = {
let mut layers = { let mut layers = {
let layers = timeline.layers.read().await; let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>() layers.likely_resident_layers().collect::<Vec<_>>()
}; };
assert_eq!(layers.len(), 1); assert_eq!(layers.len(), 1);
@@ -260,7 +236,7 @@ async fn evict_and_wait_on_wanted_deleted() {
// the deletion of the layer in remote_storage happens. // the deletion of the layer in remote_storage happens.
{ {
let mut layers = timeline.layers.write().await; let mut layers = timeline.layers.write().await;
layers.open_mut().unwrap().finish_gc_timeline(&[layer]); layers.finish_gc_timeline(&[layer]);
} }
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
@@ -301,7 +277,7 @@ fn read_wins_pending_eviction() {
let layer = { let layer = {
let mut layers = { let mut layers = {
let layers = timeline.layers.read().await; let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>() layers.likely_resident_layers().collect::<Vec<_>>()
}; };
assert_eq!(layers.len(), 1); assert_eq!(layers.len(), 1);
@@ -433,7 +409,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
let layer = { let layer = {
let mut layers = { let mut layers = {
let layers = timeline.layers.read().await; let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>() layers.likely_resident_layers().collect::<Vec<_>>()
}; };
assert_eq!(layers.len(), 1); assert_eq!(layers.len(), 1);
@@ -602,7 +578,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let layer = { let layer = {
let mut layers = { let mut layers = {
let layers = timeline.layers.read().await; let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>() layers.likely_resident_layers().collect::<Vec<_>>()
}; };
assert_eq!(layers.len(), 1); assert_eq!(layers.len(), 1);
@@ -682,7 +658,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
let layer = { let layer = {
let mut layers = { let mut layers = {
let layers = timeline.layers.read().await; let layers = timeline.layers.read().await;
layers.likely_resident_layers().cloned().collect::<Vec<_>>() layers.likely_resident_layers().collect::<Vec<_>>()
}; };
assert_eq!(layers.len(), 1); assert_eq!(layers.len(), 1);
@@ -801,9 +777,9 @@ async fn eviction_cancellation_on_drop() {
let (evicted_layer, not_evicted) = { let (evicted_layer, not_evicted) = {
let mut layers = { let mut layers = {
let mut guard = timeline.layers.write().await; let mut guard = timeline.layers.write().await;
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>(); let layers = guard.likely_resident_layers().collect::<Vec<_>>();
// remove the layers from layermap // remove the layers from layermap
guard.open_mut().unwrap().finish_gc_timeline(&layers); guard.finish_gc_timeline(&layers);
layers layers
}; };
@@ -854,7 +830,7 @@ async fn eviction_cancellation_on_drop() {
fn layer_size() { fn layer_size() {
assert_eq!(size_of::<LayerAccessStats>(), 8); assert_eq!(size_of::<LayerAccessStats>(), 8);
assert_eq!(size_of::<PersistentLayerDesc>(), 104); assert_eq!(size_of::<PersistentLayerDesc>(), 104);
assert_eq!(size_of::<LayerInner>(), 296); assert_eq!(size_of::<LayerInner>(), 312);
// it also has the utf8 path // it also has the utf8 path
} }

View File

@@ -1,454 +0,0 @@
use std::{ops::Range, sync::Arc};
use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
/// to be cleaned up)
#[must_use]
pub struct SplitImageLayerWriter {
    /// Writer for the image layer currently being filled; swapped for a fresh one in
    /// `put_image` when the size estimate crosses `target_layer_size`.
    inner: ImageLayerWriter,
    /// Soft upper bound (bytes) per produced layer. The estimate ignores compression,
    /// so finished layers may come out smaller.
    target_layer_size: u64,
    /// Layers already finished; handed back to the caller by `finish`/`take`.
    generated_layers: Vec<ResidentLayer>,
    // Construction parameters, retained so replacement `ImageLayerWriter`s can be
    // created whenever a split happens.
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    /// All images written through this writer are at this single LSN.
    lsn: Lsn,
}
impl SplitImageLayerWriter {
    /// Creates a split image writer covering keys from `start_key` up to `Key::MAX`,
    /// producing image layers at `lsn` that each aim to stay under `target_layer_size`.
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        start_key: Key,
        lsn: Lsn,
        target_layer_size: u64,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let first_writer = ImageLayerWriter::new(
            conf,
            timeline_id,
            tenant_shard_id,
            &(start_key..Key::MAX),
            lsn,
            ctx,
        )
        .await?;
        Ok(Self {
            inner: first_writer,
            target_layer_size,
            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn,
        })
    }

    /// Adds one image. If the current layer would grow past the target size, it is
    /// finished with end key `key` and a fresh layer starting at `key` takes over.
    pub async fn put_image(
        &mut self,
        key: Key,
        img: Bytes,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Upper-bound estimate of the extra space this key/image could take: compression
        // is not considered here, so the resulting layer may be smaller than the target.
        let estimated_addition = KEY_SIZE as u64 + img.len() as u64;
        let should_split = self.inner.num_keys() >= 1
            && self.inner.estimated_size() + estimated_addition >= self.target_layer_size;
        if should_split {
            let fresh_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
                &(key..Key::MAX),
                self.lsn,
                ctx,
            )
            .await?;
            let full_writer = std::mem::replace(&mut self.inner, fresh_writer);
            let finished = full_writer.finish_with_end_key(tline, key, ctx).await?;
            self.generated_layers.push(finished);
        }
        self.inner.put_image(key, img, ctx).await
    }

    /// Finishes the in-progress layer with `end_key` and returns every layer produced.
    pub(crate) async fn finish(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
    ) -> anyhow::Result<Vec<ResidentLayer>> {
        let Self {
            mut generated_layers,
            inner,
            ..
        } = self;
        let last_layer = inner.finish_with_end_key(tline, end_key, ctx).await?;
        generated_layers.push(last_layer);
        Ok(generated_layers)
    }

    /// When split writer fails, the caller should call this function and handle partially generated layers.
    #[allow(dead_code)]
    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
        Ok((self.generated_layers, self.inner))
    }
}
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
/// to be cleaned up).
#[must_use]
pub struct SplitDeltaLayerWriter {
    /// Writer for the delta layer currently being filled; swapped for a fresh one in
    /// `put_value` when the size estimate crosses `target_layer_size`.
    inner: DeltaLayerWriter,
    /// Soft upper bound (bytes) per produced layer. The per-entry estimate is rough,
    /// so real layers may end up slightly larger or smaller.
    target_layer_size: u64,
    /// Layers already finished; handed back to the caller by `finish`/`take`.
    generated_layers: Vec<ResidentLayer>,
    // Construction parameters, retained so replacement `DeltaLayerWriter`s can be
    // created whenever a split happens.
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    /// LSN range shared by every layer produced by this writer.
    lsn_range: Range<Lsn>,
}
impl SplitDeltaLayerWriter {
    /// Creates a split delta writer covering `lsn_range`, starting at `start_key`,
    /// producing delta layers that each aim to stay under `target_layer_size`.
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        start_key: Key,
        lsn_range: Range<Lsn>,
        target_layer_size: u64,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            target_layer_size,
            inner: DeltaLayerWriter::new(
                conf,
                timeline_id,
                tenant_shard_id,
                start_key,
                lsn_range.clone(),
                ctx,
            )
            .await?,
            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn_range,
        })
    }

    /// Adds one key-lsn-value. If the current layer would grow past the target size,
    /// it is finished at `key` and a fresh layer starting at `key` takes over.
    pub async fn put_value(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
        if self.inner.num_keys() >= 1
            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
        {
            let next_delta_writer = DeltaLayerWriter::new(
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
                key,
                self.lsn_range.clone(),
                ctx,
            )
            .await?;
            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
            self.generated_layers.push(delta_layer);
        }
        self.inner.put_value(key, lsn, val, ctx).await
    }

    /// Finishes the in-progress layer with `end_key` and returns every layer produced.
    pub(crate) async fn finish(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
    ) -> anyhow::Result<Vec<ResidentLayer>> {
        // Bind `conf` in the destructuring pattern: the original read `self.conf` after
        // destructuring `self`, which is a borrow-of-partially-moved-value error (E0382).
        let Self {
            mut generated_layers,
            inner,
            conf,
            ..
        } = self;
        let (desc, path) = inner.finish(end_key, ctx).await?;
        let delta_layer = Layer::finish_creating(conf, tline, desc, &path)?;
        generated_layers.push(delta_layer);
        Ok(generated_layers)
    }

    /// When split writer fails, the caller should call this function and handle partially generated layers.
    #[allow(dead_code)]
    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
        Ok((self.generated_layers, self.inner))
    }
}
#[cfg(test)]
mod tests {
    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
            storage_layer::AsLayerDesc,
        },
        DEFAULT_PG_VERSION,
    };

    use super::*;

    /// Builds a test key: fixed hex prefix with `field6` set to `id`, so consecutive
    /// ids produce consecutive keys.
    fn get_key(id: u32) -> Key {
        let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
        key.field6 = id;
        key
    }

    /// A small 64-byte image whose text encodes `id`.
    fn get_img(id: u32) -> Bytes {
        format!("{id:064}").into()
    }

    /// An 8 KiB zero-filled image, large relative to the split thresholds used below.
    fn get_large_img() -> Bytes {
        vec![0; 8192].into()
    }

    /// One small image/value each with a 4 MiB target: no split should happen, so
    /// exactly one image layer and one delta layer are produced.
    #[tokio::test]
    async fn write_one_image() {
        let harness = TenantHarness::create("split_writer_write_one_image")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        let mut image_writer = SplitImageLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18),
            4 * 1024 * 1024,
            &ctx,
        )
        .await
        .unwrap();

        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
            &ctx,
        )
        .await
        .unwrap();

        image_writer
            .put_image(get_key(0), get_img(0), &tline, &ctx)
            .await
            .unwrap();
        let layers = image_writer
            .finish(&tline, &ctx, get_key(10))
            .await
            .unwrap();
        assert_eq!(layers.len(), 1);

        delta_writer
            .put_value(
                get_key(0),
                Lsn(0x18),
                Value::Image(get_img(0)),
                &tline,
                &ctx,
            )
            .await
            .unwrap();
        let layers = delta_writer
            .finish(&tline, &ctx, get_key(10))
            .await
            .unwrap();
        assert_eq!(layers.len(), 1);
    }

    /// Writes 2000 large (8 KiB) entries with a 4 MiB target. Each image layer fits
    /// roughly 512 entries (4 MiB / 8 KiB), hence N/512 + 1 layers; the delta writer
    /// is asserted to split at the same rate. Adjacent produced layers must share
    /// exact key-range boundaries (no gaps, no MIN/MAX endpoints).
    #[tokio::test]
    async fn write_split() {
        let harness = TenantHarness::create("split_writer_write_split")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        let mut image_writer = SplitImageLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18),
            4 * 1024 * 1024,
            &ctx,
        )
        .await
        .unwrap();
        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
            &ctx,
        )
        .await
        .unwrap();
        const N: usize = 2000;
        for i in 0..N {
            let i = i as u32;
            image_writer
                .put_image(get_key(i), get_large_img(), &tline, &ctx)
                .await
                .unwrap();
            delta_writer
                .put_value(
                    get_key(i),
                    Lsn(0x20),
                    Value::Image(get_large_img()),
                    &tline,
                    &ctx,
                )
                .await
                .unwrap();
        }
        let image_layers = image_writer
            .finish(&tline, &ctx, get_key(N as u32))
            .await
            .unwrap();
        let delta_layers = delta_writer
            .finish(&tline, &ctx, get_key(N as u32))
            .await
            .unwrap();
        assert_eq!(image_layers.len(), N / 512 + 1);
        assert_eq!(delta_layers.len(), N / 512 + 1);
        // The produced layers must tile the written key space: no layer touches
        // Key::MIN/Key::MAX, and each layer starts where the previous one ended.
        for idx in 0..image_layers.len() {
            assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
            assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
            assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
            assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
            if idx > 0 {
                assert_eq!(
                    image_layers[idx - 1].layer_desc().key_range.end,
                    image_layers[idx].layer_desc().key_range.start
                );
                assert_eq!(
                    delta_layers[idx - 1].layer_desc().key_range.end,
                    delta_layers[idx].layer_desc().key_range.start
                );
            }
        }
    }

    /// Uses a tiny 4 KiB target: a small entry followed by an 8 KiB one overflows
    /// the target, so each writer is expected to produce exactly two layers.
    #[tokio::test]
    async fn write_large_img() {
        let harness = TenantHarness::create("split_writer_write_large_img")
            .await
            .unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        let mut image_writer = SplitImageLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18),
            4 * 1024,
            &ctx,
        )
        .await
        .unwrap();

        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024,
            &ctx,
        )
        .await
        .unwrap();

        image_writer
            .put_image(get_key(0), get_img(0), &tline, &ctx)
            .await
            .unwrap();
        image_writer
            .put_image(get_key(1), get_large_img(), &tline, &ctx)
            .await
            .unwrap();
        let layers = image_writer
            .finish(&tline, &ctx, get_key(10))
            .await
            .unwrap();
        assert_eq!(layers.len(), 2);

        delta_writer
            .put_value(
                get_key(0),
                Lsn(0x18),
                Value::Image(get_img(0)),
                &tline,
                &ctx,
            )
            .await
            .unwrap();
        delta_writer
            .put_value(
                get_key(1),
                Lsn(0x1A),
                Value::Image(get_large_img()),
                &tline,
                &ctx,
            )
            .await
            .unwrap();
        let layers = delta_writer
            .finish(&tline, &ctx, get_key(10))
            .await
            .unwrap();
        assert_eq!(layers.len(), 2);
    }
}

View File

@@ -407,16 +407,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run_count += 1; error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration); let wait_duration = Duration::from_secs_f64(wait_duration);
if matches!(e, crate::tenant::GcError::TimelineCancelled) { error!(
// Timeline was cancelled during gc. We might either be in an event "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
// that affects the entire tenant (tenant deletion, pageserver shutdown), );
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}
wait_duration wait_duration
} }
} }

File diff suppressed because it is too large Load Diff

View File

@@ -19,10 +19,8 @@ use bytes::Bytes;
use enumset::EnumSet; use enumset::EnumSet;
use fail::fail_point; use fail::fail_point;
use itertools::Itertools; use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
use pageserver_api::keyspace::ShardedRange; use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument}; use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId; use utils::id::TimelineId;
@@ -43,7 +41,6 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::keyspace::KeySpace; use crate::keyspace::KeySpace;
use crate::repository::{Key, Value}; use crate::repository::{Key, Value};
use crate::walrecord::NeonWalRecord;
use utils::lsn::Lsn; use utils::lsn::Lsn;
@@ -76,7 +73,6 @@ impl KeyHistoryRetention {
key: Key, key: Key,
delta_writer: &mut Vec<(Key, Lsn, Value)>, delta_writer: &mut Vec<(Key, Lsn, Value)>,
mut image_writer: Option<&mut ImageLayerWriter>, mut image_writer: Option<&mut ImageLayerWriter>,
stat: &mut CompactionStatistics,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut first_batch = true; let mut first_batch = true;
@@ -86,7 +82,6 @@ impl KeyHistoryRetention {
let Value::Image(img) = &logs[0].1 else { let Value::Image(img) = &logs[0].1 else {
unreachable!() unreachable!()
}; };
stat.produce_image_key(img);
if let Some(image_writer) = image_writer.as_mut() { if let Some(image_writer) = image_writer.as_mut() {
image_writer.put_image(key, img.clone(), ctx).await?; image_writer.put_image(key, img.clone(), ctx).await?;
} else { } else {
@@ -94,111 +89,24 @@ impl KeyHistoryRetention {
} }
} else { } else {
for (lsn, val) in logs { for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val)); delta_writer.push((key, lsn, val));
} }
} }
first_batch = false; first_batch = false;
} else { } else {
for (lsn, val) in logs { for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val)); delta_writer.push((key, lsn, val));
} }
} }
} }
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs { for (lsn, val) in above_horizon_logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val)); delta_writer.push((key, lsn, val));
} }
Ok(()) Ok(())
} }
} }
#[derive(Debug, Serialize, Default)]
struct CompactionStatisticsNumSize {
num: u64,
size: u64,
}
#[derive(Debug, Serialize, Default)]
pub struct CompactionStatistics {
delta_layer_visited: CompactionStatisticsNumSize,
image_layer_visited: CompactionStatisticsNumSize,
delta_layer_produced: CompactionStatisticsNumSize,
image_layer_produced: CompactionStatisticsNumSize,
num_delta_layer_discarded: usize,
num_image_layer_discarded: usize,
num_unique_keys_visited: usize,
wal_keys_visited: CompactionStatisticsNumSize,
image_keys_visited: CompactionStatisticsNumSize,
wal_produced: CompactionStatisticsNumSize,
image_produced: CompactionStatisticsNumSize,
}
impl CompactionStatistics {
fn estimated_size_of_value(val: &Value) -> usize {
match val {
Value::Image(img) => img.len(),
Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
_ => std::mem::size_of::<NeonWalRecord>(),
}
}
fn estimated_size_of_key() -> usize {
KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
}
fn visit_delta_layer(&mut self, size: u64) {
self.delta_layer_visited.num += 1;
self.delta_layer_visited.size += size;
}
fn visit_image_layer(&mut self, size: u64) {
self.image_layer_visited.num += 1;
self.image_layer_visited.size += size;
}
fn on_unique_key_visited(&mut self) {
self.num_unique_keys_visited += 1;
}
fn visit_wal_key(&mut self, val: &Value) {
self.wal_keys_visited.num += 1;
self.wal_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn visit_image_key(&mut self, val: &Value) {
self.image_keys_visited.num += 1;
self.image_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_key(&mut self, val: &Value) {
match val {
Value::Image(img) => self.produce_image_key(img),
Value::WalRecord(_) => self.produce_wal_key(val),
}
}
fn produce_wal_key(&mut self, val: &Value) {
self.wal_produced.num += 1;
self.wal_produced.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_image_key(&mut self, val: &Bytes) {
self.image_produced.num += 1;
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
}
fn discard_delta_layer(&mut self) {
self.num_delta_layer_discarded += 1;
}
fn discard_image_layer(&mut self) {
self.num_image_layer_discarded += 1;
}
fn produce_delta_layer(&mut self, size: u64) {
self.delta_layer_produced.num += 1;
self.delta_layer_produced.size += size;
}
fn produce_image_layer(&mut self, size: u64) {
self.image_layer_produced.num += 1;
self.image_layer_produced.size += size;
}
}
impl Timeline { impl Timeline {
/// TODO: cancellation /// TODO: cancellation
/// ///
@@ -210,18 +118,12 @@ impl Timeline {
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<bool, CompactionError> { ) -> Result<bool, CompactionError> {
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) { if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
self.compact_with_gc(cancel, flags, ctx) self.compact_with_gc(cancel, ctx)
.await .await
.map_err(CompactionError::Other)?; .map_err(CompactionError::Other)?;
return Ok(false); return Ok(false);
} }
if flags.contains(CompactFlags::DryRun) {
return Err(CompactionError::Other(anyhow!(
"dry-run mode is not supported for legacy compaction for now"
)));
}
// High level strategy for compaction / image creation: // High level strategy for compaction / image creation:
// //
// 1. First, calculate the desired "partitioning" of the // 1. First, calculate the desired "partitioning" of the
@@ -371,7 +273,7 @@ impl Timeline {
); );
let layers = self.layers.read().await; let layers = self.layers.read().await;
for layer_desc in layers.layer_map()?.iter_historic_layers() { for layer_desc in layers.layer_map().iter_historic_layers() {
let layer = layers.get_from_desc(&layer_desc); let layer = layers.get_from_desc(&layer_desc);
if layer.metadata().shard.shard_count == self.shard_identity.count { if layer.metadata().shard.shard_count == self.shard_identity.count {
// This layer does not belong to a historic ancestor, no need to re-image it. // This layer does not belong to a historic ancestor, no need to re-image it.
@@ -549,9 +451,7 @@ impl Timeline {
/// ///
/// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
/// that we know won't be needed for reads. /// that we know won't be needed for reads.
pub(super) async fn update_layer_visibility( pub(super) async fn update_layer_visibility(&self) {
&self,
) -> Result<(), super::layer_manager::Shutdown> {
let head_lsn = self.get_last_record_lsn(); let head_lsn = self.get_last_record_lsn();
// We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
@@ -559,7 +459,7 @@ impl Timeline {
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
// they will be subject to L0->L1 compaction in the near future. // they will be subject to L0->L1 compaction in the near future.
let layer_manager = self.layers.read().await; let layer_manager = self.layers.read().await;
let layer_map = layer_manager.layer_map()?; let layer_map = layer_manager.layer_map();
let readable_points = { let readable_points = {
let children = self.gc_info.read().unwrap().retain_lsns.clone(); let children = self.gc_info.read().unwrap().retain_lsns.clone();
@@ -582,7 +482,6 @@ impl Timeline {
// TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can // TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
// avoid assuming that everything at a branch point is visible. // avoid assuming that everything at a branch point is visible.
drop(covered); drop(covered);
Ok(())
} }
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
@@ -636,8 +535,12 @@ impl Timeline {
) -> Result<CompactLevel0Phase1Result, CompactionError> { ) -> Result<CompactLevel0Phase1Result, CompactionError> {
stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller stats.read_lock_acquisition_micros.till_now(); // set by caller
let layers = guard.layer_map()?; let layers = guard.layer_map();
let level0_deltas = layers.level0_deltas(); let level0_deltas = layers.get_level0_deltas();
let mut level0_deltas = level0_deltas
.into_iter()
.map(|x| guard.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len()); stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated. // Only compact if enough layers have accumulated.
@@ -650,11 +553,6 @@ impl Timeline {
return Ok(CompactLevel0Phase1Result::default()); return Ok(CompactLevel0Phase1Result::default());
} }
let mut level0_deltas = level0_deltas
.iter()
.map(|x| guard.get_from_desc(x))
.collect::<Vec<_>>();
// Gather the files to compact in this iteration. // Gather the files to compact in this iteration.
// //
// Start with the oldest Level 0 delta file, and collect any other // Start with the oldest Level 0 delta file, and collect any other
@@ -1108,16 +1006,14 @@ impl Timeline {
|| contains_hole || contains_hole
{ {
// ... if so, flush previous layer and prepare to write new one // ... if so, flush previous layer and prepare to write new one
let (desc, path) = writer new_layers.push(
writer
.take() .take()
.unwrap() .unwrap()
.finish(prev_key.unwrap().next(), ctx) .finish(prev_key.unwrap().next(), self, ctx)
.await .await
.map_err(CompactionError::Other)?; .map_err(CompactionError::Other)?,
let new_delta = Layer::finish_creating(self.conf, self, desc, &path) );
.map_err(CompactionError::Other)?;
new_layers.push(new_delta);
writer = None; writer = None;
if contains_hole { if contains_hole {
@@ -1180,13 +1076,12 @@ impl Timeline {
prev_key = Some(key); prev_key = Some(key);
} }
if let Some(writer) = writer { if let Some(writer) = writer {
let (desc, path) = writer new_layers.push(
.finish(prev_key.unwrap().next(), ctx) writer
.finish(prev_key.unwrap().next(), self, ctx)
.await .await
.map_err(CompactionError::Other)?; .map_err(CompactionError::Other)?,
let new_delta = Layer::finish_creating(self.conf, self, desc, &path) );
.map_err(CompactionError::Other)?;
new_layers.push(new_delta);
} }
// Sync layers // Sync layers
@@ -1411,9 +1306,10 @@ impl Timeline {
// Find the top of the historical layers // Find the top of the historical layers
let end_lsn = { let end_lsn = {
let guard = self.layers.read().await; let guard = self.layers.read().await;
let layers = guard.layer_map()?; let layers = guard.layer_map();
let l0_deltas = layers.level0_deltas(); let l0_deltas = layers.get_level0_deltas();
drop(guard);
// As an optimization, if we find that there are too few L0 layers, // As an optimization, if we find that there are too few L0 layers,
// bail out early. We know that the compaction algorithm would do // bail out early. We know that the compaction algorithm would do
@@ -1745,47 +1641,38 @@ impl Timeline {
pub(crate) async fn compact_with_gc( pub(crate) async fn compact_with_gc(
self: &Arc<Self>, self: &Arc<Self>,
cancel: &CancellationToken, cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
use std::collections::BTreeSet; use std::collections::BTreeSet;
// Block other compaction/GC tasks from running for now. GC-compaction could run along // Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // with legacy compaction tasks in the future.
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
let gc_lock = async { let _compaction_lock = tokio::select! {
tokio::select! { guard = self.compaction_lock.lock() => guard,
guard = self.gc_lock.lock() => Ok(guard),
// TODO: refactor to CompactionError to correctly pass cancelled error // TODO: refactor to CompactionError to correctly pass cancelled error
_ = cancel.cancelled() => Err(anyhow!("cancelled")), _ = cancel.cancelled() => return Err(anyhow!("cancelled")),
}
}; };
let gc_lock = crate::timed( let _gc = tokio::select! {
gc_lock, guard = self.gc_lock.lock() => guard,
"acquires gc lock", // TODO: refactor to CompactionError to correctly pass cancelled error
std::time::Duration::from_secs(5), _ = cancel.cancelled() => return Err(anyhow!("cancelled")),
) };
.await?;
let dry_run = flags.contains(CompactFlags::DryRun); info!("running enhanced gc bottom-most compaction");
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
scopeguard::defer! { scopeguard::defer! {
info!("done enhanced gc bottom-most compaction"); info!("done enhanced gc bottom-most compaction");
}; };
let mut stat = CompactionStatistics::default();
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon. // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
// The layer selection has the following properties: // The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection. // 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
let guard = self.layers.read().await; let guard = self.layers.read().await;
let layers = guard.layer_map()?; let layers = guard.layer_map();
let gc_info = self.gc_info.read().unwrap(); let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new(); let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = gc_info.cutoffs.select_min(); let gc_cutoff = gc_info.cutoffs.select_min();
@@ -1849,9 +1736,6 @@ impl Timeline {
let key_range = desc.get_key_range(); let key_range = desc.get_key_range();
delta_split_points.insert(key_range.start); delta_split_points.insert(key_range.start);
delta_split_points.insert(key_range.end); delta_split_points.insert(key_range.end);
stat.visit_delta_layer(desc.file_size());
} else {
stat.visit_image_layer(desc.file_size());
} }
} }
let mut delta_layers = Vec::new(); let mut delta_layers = Vec::new();
@@ -1887,8 +1771,6 @@ impl Timeline {
tline: &Arc<Timeline>, tline: &Arc<Timeline>,
lowest_retain_lsn: Lsn, lowest_retain_lsn: Lsn,
ctx: &RequestContext, ctx: &RequestContext,
stats: &mut CompactionStatistics,
dry_run: bool,
last_batch: bool, last_batch: bool,
) -> anyhow::Result<Option<FlushDeltaResult>> { ) -> anyhow::Result<Option<FlushDeltaResult>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1945,7 +1827,6 @@ impl Timeline {
let layer_generation = guard.get_from_key(&delta_key).metadata().generation; let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
drop(guard); drop(guard);
if layer_generation == tline.generation { if layer_generation == tline.generation {
stats.discard_delta_layer();
// TODO: depending on whether we design this compaction process to run along with // TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the // other compactions, there could be layer map modifications after we drop the
// layer guard, and in case it creates duplicated layer key, we will still error // layer guard, and in case it creates duplicated layer key, we will still error
@@ -1972,16 +1853,9 @@ impl Timeline {
for (key, lsn, val) in deltas { for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?; delta_layer_writer.put_value(key, lsn, val, ctx).await?;
} }
let delta_layer = delta_layer_writer
stats.produce_delta_layer(delta_layer_writer.size()); .finish(delta_key.key_range.end, tline, ctx)
if dry_run {
return Ok(None);
}
let (desc, path) = delta_layer_writer
.finish(delta_key.key_range.end, ctx)
.await?; .await?;
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer))) Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
} }
@@ -2073,13 +1947,6 @@ impl Timeline {
let mut current_delta_split_point = 0; let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new(); let mut delta_layers = Vec::new();
while let Some((key, lsn, val)) = merge_iter.next().await? { while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
}
match val {
Value::Image(_) => stat.visit_image_key(&val),
Value::WalRecord(_) => stat.visit_wal_key(&val),
}
if last_key.is_none() || last_key.as_ref() == Some(&key) { if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() { if last_key.is_none() {
last_key = Some(key); last_key = Some(key);
@@ -2087,7 +1954,6 @@ impl Timeline {
accumulated_values.push((key, lsn, val)); accumulated_values.push((key, lsn, val));
} else { } else {
let last_key = last_key.as_mut().unwrap(); let last_key = last_key.as_mut().unwrap();
stat.on_unique_key_visited();
let retention = self let retention = self
.generate_key_retention( .generate_key_retention(
*last_key, *last_key,
@@ -2104,7 +1970,6 @@ impl Timeline {
*last_key, *last_key,
&mut delta_values, &mut delta_values,
image_layer_writer.as_mut(), image_layer_writer.as_mut(),
&mut stat,
ctx, ctx,
) )
.await?; .await?;
@@ -2117,8 +1982,6 @@ impl Timeline {
self, self,
lowest_retain_lsn, lowest_retain_lsn,
ctx, ctx,
&mut stat,
dry_run,
false, false,
) )
.await?, .await?,
@@ -2131,7 +1994,6 @@ impl Timeline {
let last_key = last_key.expect("no keys produced during compaction"); let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body // TODO: move this part to the loop body
stat.on_unique_key_visited();
let retention = self let retention = self
.generate_key_retention( .generate_key_retention(
last_key, last_key,
@@ -2148,7 +2010,6 @@ impl Timeline {
last_key, last_key,
&mut delta_values, &mut delta_values,
image_layer_writer.as_mut(), image_layer_writer.as_mut(),
&mut stat,
ctx, ctx,
) )
.await?; .await?;
@@ -2161,8 +2022,6 @@ impl Timeline {
self, self,
lowest_retain_lsn, lowest_retain_lsn,
ctx, ctx,
&mut stat,
dry_run,
true, true,
) )
.await?, .await?,
@@ -2170,28 +2029,12 @@ impl Timeline {
assert!(delta_values.is_empty(), "unprocessed keys"); assert!(delta_values.is_empty(), "unprocessed keys");
let image_layer = if discard_image_layer { let image_layer = if discard_image_layer {
stat.discard_image_layer();
None None
} else if let Some(writer) = image_layer_writer { } else if let Some(writer) = image_layer_writer {
stat.produce_image_layer(writer.size());
if !dry_run {
Some(writer.finish(self, ctx).await?) Some(writer.finish(self, ctx).await?)
} else { } else {
None None
}
} else {
None
}; };
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
);
if dry_run {
return Ok(());
}
info!( info!(
"produced {} delta layers and {} image layers", "produced {} delta layers and {} image layers",
delta_layers.len(), delta_layers.len(),
@@ -2215,19 +2058,14 @@ impl Timeline {
let mut layer_selection = layer_selection; let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
compact_to.extend(image_layer); compact_to.extend(image_layer);
// Step 3: Place back to the layer map. // Step 3: Place back to the layer map.
{ {
let mut guard = self.layers.write().await; let mut guard = self.layers.write().await;
guard guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
.open_mut()?
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
}; };
self.remote_client self.remote_client
.schedule_compaction_update(&layer_selection, &compact_to)?; .schedule_compaction_update(&layer_selection, &compact_to)?;
drop(gc_lock);
Ok(()) Ok(())
} }
} }
@@ -2301,7 +2139,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
self.flush_updates().await?; self.flush_updates().await?;
let guard = self.timeline.layers.read().await; let guard = self.timeline.layers.read().await;
let layer_map = guard.layer_map()?; let layer_map = guard.layer_map();
let result = layer_map let result = layer_map
.iter_historic_layers() .iter_historic_layers()
@@ -2424,9 +2262,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
)) ))
}); });
let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?; let new_delta_layer = writer
let new_delta_layer = .finish(prev.unwrap().0.next(), &self.timeline, ctx)
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; .await?;
self.new_deltas.push(new_delta_layer); self.new_deltas.push(new_delta_layer);
Ok(()) Ok(())

View File

@@ -63,19 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
timeline: &Timeline, timeline: &Timeline,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
// Always ensure the lock order is compaction -> gc. let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
let compaction_lock = timeline.compaction_lock.lock(); let guards = crate::timed(
let compaction_lock = crate::timed( guards,
compaction_lock, "acquire gc and compaction locks",
"acquires compaction lock",
std::time::Duration::from_secs(5),
)
.await;
let gc_lock = timeline.gc_lock.lock();
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
std::time::Duration::from_secs(5), std::time::Duration::from_secs(5),
) )
.await; .await;
@@ -116,8 +107,7 @@ pub(super) async fn delete_local_timeline_directory(
.context("fsync_pre_mark_remove")?; .context("fsync_pre_mark_remove")?;
info!("finished deleting layer files, releasing locks"); info!("finished deleting layer files, releasing locks");
drop(gc_lock); drop(guards);
drop(compaction_lock);
fail::fail_point!("timeline-delete-after-rm", |_| { fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -230,8 +220,6 @@ impl DeleteTimelineFlow {
// Now that the Timeline is in Stopping state, request all the related tasks to shut down. // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await; timeline.shutdown(super::ShutdownMode::Hard).await;
tenant.gc_block.before_delete(&timeline);
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!( Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at" "failpoint: timeline-delete-before-index-deleted-at"

View File

@@ -1,4 +1,4 @@
use std::{collections::HashSet, sync::Arc}; use std::sync::Arc;
use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use crate::{ use crate::{
@@ -74,11 +74,6 @@ impl From<crate::tenant::upload_queue::NotInitialized> for Error {
Error::ShuttingDown Error::ShuttingDown
} }
} }
impl From<super::layer_manager::Shutdown> for Error {
fn from(_: super::layer_manager::Shutdown) -> Self {
Error::ShuttingDown
}
}
impl From<FlushLayerError> for Error { impl From<FlushLayerError> for Error {
fn from(value: FlushLayerError) -> Self { fn from(value: FlushLayerError) -> Self {
@@ -146,9 +141,50 @@ pub(super) async fn prepare(
} }
} }
let reparented_timelines = reparented_direct_children(detached, tenant)?; // detached has previously been detached; let's inspect each of the current timelines and
// report back the timelines which have been reparented by our detach
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early not to give inconsistent results in
// sharded enviroment.
any_shutdown = true;
true
}
},
);
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
//
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
return Ok(Progress::Done(AncestorDetached { return Ok(Progress::Done(AncestorDetached {
reparented_timelines, reparented_timelines: reparented
.into_iter()
.map(|(_, tl)| tl.timeline_id)
.collect(),
})); }));
}; };
@@ -241,7 +277,7 @@ pub(super) async fn prepare(
// between retries, these can change if compaction or gc ran in between. this will mean // between retries, these can change if compaction or gc ran in between. this will mean
// we have to redo work. // we have to redo work.
partition_work(ancestor_lsn, &layers)? partition_work(ancestor_lsn, &layers)
}; };
// TODO: layers are already sorted by something: use that to determine how much of remote // TODO: layers are already sorted by something: use that to determine how much of remote
@@ -345,67 +381,16 @@ pub(super) async fn prepare(
Ok(Progress::Prepared(guard, prepared)) Ok(Progress::Prepared(guard, prepared))
} }
fn reparented_direct_children(
detached: &Arc<Timeline>,
tenant: &Tenant,
) -> Result<HashSet<TimelineId>, Error> {
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
if is_direct_child {
Some(tl.clone())
} else {
if let Some(timeline) = tl.ancestor_timeline.as_ref() {
assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
}
None
}
})
// Collect to avoid lock taking order problem with Tenant::timelines and
// Timeline::remote_client
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early not to give inconsistent results in
// sharded enviroment.
any_shutdown = true;
true
}
});
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
Ok(all_direct_children
.into_iter()
.map(|tl| tl.timeline_id)
.collect())
}
fn partition_work( fn partition_work(
ancestor_lsn: Lsn, ancestor_lsn: Lsn,
source: &LayerManager, source_layermap: &LayerManager,
) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> { ) -> (usize, Vec<Layer>, Vec<Layer>) {
let mut straddling_branchpoint = vec![]; let mut straddling_branchpoint = vec![];
let mut rest_of_historic = vec![]; let mut rest_of_historic = vec![];
let mut later_by_lsn = 0; let mut later_by_lsn = 0;
for desc in source.layer_map()?.iter_historic_layers() { for desc in source_layermap.layer_map().iter_historic_layers() {
// off by one chances here: // off by one chances here:
// - start is inclusive // - start is inclusive
// - end is exclusive // - end is exclusive
@@ -424,10 +409,10 @@ fn partition_work(
&mut rest_of_historic &mut rest_of_historic
}; };
target.push(source.get_from_desc(&desc)); target.push(source_layermap.get_from_desc(&desc));
} }
Ok((later_by_lsn, straddling_branchpoint, rest_of_historic)) (later_by_lsn, straddling_branchpoint, rest_of_historic)
} }
async fn upload_rewritten_layer( async fn upload_rewritten_layer(
@@ -503,12 +488,10 @@ async fn copy_lsn_prefix(
// reuse the key instead of adding more holes between layers by using the real // reuse the key instead of adding more holes between layers by using the real
// highest key in the layer. // highest key in the layer.
let reused_highest_key = layer.layer_desc().key_range.end; let reused_highest_key = layer.layer_desc().key_range.end;
let (desc, path) = writer let copied = writer
.finish(reused_highest_key, ctx) .finish(reused_highest_key, target_timeline, ctx)
.await .await
.map_err(CopyDeltaPrefix)?; .map_err(CopyDeltaPrefix)?;
let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
.map_err(CopyDeltaPrefix)?;
tracing::debug!(%layer, %copied, "new layer produced"); tracing::debug!(%layer, %copied, "new layer produced");
@@ -554,12 +537,11 @@ pub(super) async fn complete(
tenant: &Tenant, tenant: &Tenant,
prepared: PreparedTimelineDetach, prepared: PreparedTimelineDetach,
_ctx: &RequestContext, _ctx: &RequestContext,
) -> Result<HashSet<TimelineId>, anyhow::Error> { ) -> Result<Vec<TimelineId>, anyhow::Error> {
let PreparedTimelineDetach { layers } = prepared; let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached let ancestor = detached
.ancestor_timeline .get_ancestor_timeline()
.as_ref()
.expect("must still have a ancestor"); .expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn(); let ancestor_lsn = detached.get_ancestor_lsn();
@@ -599,7 +581,7 @@ pub(super) async fn complete(
} }
let tl_ancestor = tl.ancestor_timeline.as_ref()?; let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(ancestor, tl_ancestor); let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl let is_deleting = tl
@@ -640,18 +622,13 @@ pub(super) async fn complete(
}); });
let reparenting_candidates = tasks.len(); let reparenting_candidates = tasks.len();
let mut reparented = HashSet::with_capacity(tasks.len()); let mut reparented = Vec::with_capacity(tasks.len());
while let Some(res) = tasks.join_next().await { while let Some(res) = tasks.join_next().await {
match res { match res {
Ok(Some(timeline)) => { Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
assert!(
reparented.insert(timeline.timeline_id),
"duplicate reparenting? timeline_id={}",
timeline.timeline_id
);
} }
Ok(None) => { Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had // lets just ignore this for now. one or all reparented timelines could had
@@ -673,5 +650,12 @@ pub(super) async fn complete(
tracing::info!("failed to reparent some candidates"); tracing::info!("failed to reparent some candidates");
} }
reparented.sort_unstable();
let reparented = reparented
.into_iter()
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok(reparented) Ok(reparented)
} }

View File

@@ -213,11 +213,19 @@ impl Timeline {
let mut js = tokio::task::JoinSet::new(); let mut js = tokio::task::JoinSet::new();
{ {
let guard = self.layers.read().await; let guard = self.layers.read().await;
let layers = guard.layer_map();
for layer in layers.iter_historic_layers() {
let layer = guard.get_from_desc(&layer);
guard // guard against eviction while we inspect it; it might be that eviction_task and
.likely_resident_layers() // disk_usage_eviction_task both select the same layers to be evicted, and
.filter(|layer| { // seemingly free up double the space. both succeeding is of no consequence.
let last_activity_ts = layer.latest_activity();
if !layer.is_likely_resident() {
continue;
}
let last_activity_ts = layer.access_stats().latest_activity();
let no_activity_for = match now.duration_since(last_activity_ts) { let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d, Ok(d) => d,
@@ -237,21 +245,19 @@ impl Timeline {
// they would be meaningless outside of the pageserver process. // they would be meaningless outside of the pageserver process.
// At the time of writing, the trade-off is that access stats are more // At the time of writing, the trade-off is that access stats are more
// valuable than detecting clock skew. // valuable than detecting clock skew.
return false; continue;
} }
}; };
no_activity_for > p.threshold if no_activity_for > p.threshold {
})
.cloned()
.for_each(|layer| {
js.spawn(async move { js.spawn(async move {
layer layer
.evict_and_wait(std::time::Duration::from_secs(5)) .evict_and_wait(std::time::Duration::from_secs(5))
.await .await
}); });
stats.candidates += 1; stats.candidates += 1;
}); }
}
}; };
let join_all = async move { let join_all = async move {

View File

@@ -1,4 +1,4 @@
use anyhow::{bail, ensure, Context}; use anyhow::{bail, ensure, Context, Result};
use itertools::Itertools; use itertools::Itertools;
use pageserver_api::shard::TenantShardId; use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc}; use std::{collections::HashMap, sync::Arc};
@@ -24,142 +24,39 @@ use crate::{
use super::TimelineWriterState; use super::TimelineWriterState;
/// Provides semantic APIs to manipulate the layer map. /// Provides semantic APIs to manipulate the layer map.
pub(crate) enum LayerManager { #[derive(Default)]
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate pub(crate) struct LayerManager {
/// the layers. layer_map: LayerMap,
Open(OpenLayerManager), layer_fmgr: LayerFileManager<Layer>,
/// Shutdown layer manager where there are no more in-memory layers and persistent layers are
/// read-only.
Closed {
layers: HashMap<PersistentLayerKey, Layer>,
},
}
impl Default for LayerManager {
fn default() -> Self {
LayerManager::Open(OpenLayerManager::default())
}
} }
impl LayerManager { impl LayerManager {
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
// The assumption for the `expect()` is that all code maintains the following invariant: self.layer_fmgr.get_from_desc(desc)
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.layers()
.get(key)
.with_context(|| format!("get layer from key: {key}"))
.expect("not found")
.clone()
} }
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
self.get_from_key(&desc.key()) self.layer_fmgr.get_from_key(desc)
} }
/// Get an immutable reference to the layer map. /// Get an immutable reference to the layer map.
/// ///
/// We expect users only to be able to get an immutable layer map. If users want to make modifications, /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state. /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> { pub(crate) fn layer_map(&self) -> &LayerMap {
use LayerManager::*; &self.layer_map
match self {
Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
Closed { .. } => Err(Shutdown),
}
} }
pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
use LayerManager::*;
match self {
Open(open) => Ok(open),
Closed { .. } => Err(Shutdown),
}
}
/// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
/// order to allow shutdown to complete.
///
/// If there was a want to flush in-memory layers, it must have happened earlier.
pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
use LayerManager::*;
match self {
Open(OpenLayerManager {
layer_map,
layer_fmgr: LayerFileManager(hashmap),
}) => {
let open = layer_map.open_layer.take();
let frozen = layer_map.frozen_layers.len();
let taken_writer_state = writer_state.take();
tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
let layers = std::mem::take(hashmap);
*self = Closed { layers };
assert_eq!(open.is_some(), taken_writer_state.is_some());
}
Closed { .. } => {
tracing::debug!("ignoring multiple shutdowns on layer manager")
}
}
}
/// Sum up the historic layer sizes
pub(crate) fn layer_size_sum(&self) -> u64 {
self.layers()
.values()
.map(|l| l.layer_desc().file_size)
.sum()
}
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
self.layers().values().filter(|l| l.is_likely_resident())
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.contains_key(&layer.layer_desc().key())
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layers().contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layers().keys().cloned().collect_vec()
}
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
use LayerManager::*;
match self {
Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
Closed { layers } => layers,
}
}
}
#[derive(Default)]
pub(crate) struct OpenLayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
}
impl std::fmt::Debug for OpenLayerManager {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("OpenLayerManager")
.field("layer_count", &self.layer_fmgr.0.len())
.finish()
}
}
#[derive(Debug, thiserror::Error)]
#[error("layer manager has been shutdown")]
pub(crate) struct Shutdown;
impl OpenLayerManager {
/// Called from `load_layer_map`. Initialize the layer manager with: /// Called from `load_layer_map`. Initialize the layer manager with:
/// 1. all on-disk layers /// 1. all on-disk layers
/// 2. next open layer (with disk disk_consistent_lsn LSN) /// 2. next open layer (with disk disk_consistent_lsn LSN)
pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) { pub(crate) fn initialize_local_layers(
&mut self,
on_disk_layers: Vec<Layer>,
next_open_layer_at: Lsn,
) {
let mut updates = self.layer_map.batch_update(); let mut updates = self.layer_map.batch_update();
for layer in layers { for layer in on_disk_layers {
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
} }
updates.flush(); updates.flush();
@@ -171,19 +68,26 @@ impl OpenLayerManager {
self.layer_map.next_open_layer_at = Some(next_open_layer_at); self.layer_map.next_open_layer_at = Some(next_open_layer_at);
} }
/// Open a new writable layer to append data if there is no open layer, otherwise return the /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// current open layer, called within `get_layer_for_write`. /// called within `get_layer_for_write`.
pub(crate) async fn get_layer_for_write( pub(crate) async fn get_layer_for_write(
&mut self, &mut self,
lsn: Lsn, lsn: Lsn,
last_record_lsn: Lsn,
conf: &'static PageServerConf, conf: &'static PageServerConf,
timeline_id: TimelineId, timeline_id: TimelineId,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> { ) -> Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned()); ensure!(lsn.is_aligned());
ensure!(
lsn > last_record_lsn,
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
lsn,
last_record_lsn,
);
// Do we have a layer open for writing already? // Do we have a layer open for writing already?
let layer = if let Some(open_layer) = &self.layer_map.open_layer { let layer = if let Some(open_layer) = &self.layer_map.open_layer {
if open_layer.get_lsn_range().start > lsn { if open_layer.get_lsn_range().start > lsn {
@@ -209,15 +113,8 @@ impl OpenLayerManager {
lsn lsn
); );
let new_layer = InMemoryLayer::create( let new_layer =
conf, InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
timeline_id,
tenant_shard_id,
start_lsn,
gate_guard,
ctx,
)
.await?;
let layer = Arc::new(new_layer); let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone()); self.layer_map.open_layer = Some(layer.clone());
@@ -271,7 +168,7 @@ impl OpenLayerManager {
froze froze
} }
/// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`]. /// Add image layers to the layer map, called from `create_image_layers`.
pub(crate) fn track_new_image_layers( pub(crate) fn track_new_image_layers(
&mut self, &mut self,
image_layers: &[ResidentLayer], image_layers: &[ResidentLayer],
@@ -344,7 +241,7 @@ impl OpenLayerManager {
self.finish_compact_l0(compact_from, compact_to, metrics) self.finish_compact_l0(compact_from, compact_to, metrics)
} }
/// Called post-compaction when some previous generation image layers were trimmed. /// Called when compaction is completed.
pub(crate) fn rewrite_layers( pub(crate) fn rewrite_layers(
&mut self, &mut self,
rewrite_layers: &[(Layer, ResidentLayer)], rewrite_layers: &[(Layer, ResidentLayer)],
@@ -362,10 +259,13 @@ impl OpenLayerManager {
new_layer.layer_desc().lsn_range new_layer.layer_desc().lsn_range
); );
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to // Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents // be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
// always marking rewritten layers as visible. // always marking rewritten layers as visible.
new_layer.as_ref().set_visibility(old_layer.visibility()); new_layer
.as_ref()
.access_stats()
.set_visibility(old_layer.access_stats().visibility());
// Safety: we may never rewrite the same file in-place. Callers are responsible // Safety: we may never rewrite the same file in-place. Callers are responsible
// for ensuring that they only rewrite layers after something changes the path, // for ensuring that they only rewrite layers after something changes the path,
@@ -433,6 +333,31 @@ impl OpenLayerManager {
mapping.remove(layer); mapping.remove(layer);
layer.delete_on_drop(); layer.delete_on_drop();
} }
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
// for small layer maps, we most likely have all resident, but for larger more are likely
// to be evicted assuming lots of layers correlated with longer lifespan.
self.layer_map().iter_historic_layers().filter_map(|desc| {
self.layer_fmgr
.0
.get(&desc.key())
.filter(|l| l.is_likely_resident())
.cloned()
})
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.layer_fmgr.contains(layer)
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layer_fmgr.contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layer_fmgr.0.keys().cloned().collect_vec()
}
} }
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>); pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
@@ -444,6 +369,24 @@ impl<T> Default for LayerFileManager<T> {
} }
impl<T: AsLayerDesc + Clone> LayerFileManager<T> { impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(key)
.with_context(|| format!("get layer from key: {}", key))
.expect("not found")
.clone()
}
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
self.get_from_key(&desc.key())
}
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.0.contains_key(key)
}
pub(crate) fn insert(&mut self, layer: T) { pub(crate) fn insert(&mut self, layer: T) {
let present = self.0.insert(layer.layer_desc().key(), layer.clone()); let present = self.0.insert(layer.layer_desc().key(), layer.clone());
if present.is_some() && cfg!(debug_assertions) { if present.is_some() && cfg!(debug_assertions) {
@@ -451,6 +394,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
} }
} }
pub(crate) fn contains(&self, layer: &T) -> bool {
self.0.contains_key(&layer.layer_desc().key())
}
pub(crate) fn remove(&mut self, layer: &T) { pub(crate) fn remove(&mut self, layer: &T) {
let present = self.0.remove(&layer.layer_desc().key()); let present = self.0.remove(&layer.layer_desc().key());
if present.is_none() && cfg!(debug_assertions) { if present.is_none() && cfg!(debug_assertions) {

View File

@@ -122,10 +122,6 @@ impl CurrentLogicalSize {
Self::Exact(_) => Accuracy::Exact, Self::Exact(_) => Accuracy::Exact,
} }
} }
pub(crate) fn is_exact(&self) -> bool {
matches!(self, Self::Exact(_))
}
} }
impl LogicalSize { impl LogicalSize {

View File

@@ -30,12 +30,10 @@ use tokio::time::Instant;
pub use pageserver_api::models::virtual_file as api; pub use pageserver_api::models::virtual_file as api;
pub(crate) mod io_engine; pub(crate) mod io_engine;
pub use io_engine::feature_test as io_engine_feature_test; pub use io_engine::feature_test as io_engine_feature_test;
pub use io_engine::io_engine_for_bench;
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
mod metadata; mod metadata;
mod open_options; mod open_options;
use self::owned_buffers_io::write::OwnedAsyncWriter; use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) use api::DirectIoMode;
pub(crate) use io_engine::IoEngineKind; pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata; pub(crate) use metadata::Metadata;
pub(crate) use open_options::*; pub(crate) use open_options::*;

View File

@@ -328,29 +328,3 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
.join() .join()
.unwrap() .unwrap()
} }
/// For use in benchmark binaries only.
///
/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
/// developer time trying to figure out why it's slow.
///
/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
pub fn io_engine_for_bench() -> IoEngineKind {
#[cfg(not(target_os = "linux"))]
{
panic!("This benchmark does I/O and can only give a representative result on Linux");
}
#[cfg(target_os = "linux")]
{
match feature_test().unwrap() {
FeatureTestResult::PlatformPreferred(engine) => engine,
FeatureTestResult::Worse {
engine: _engine,
remark,
} => {
panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
}
}
}
}

View File

@@ -45,7 +45,6 @@ static const char *jwt_token = NULL;
/* GUCs */ /* GUCs */
static char *ConsoleURL = NULL; static char *ConsoleURL = NULL;
static bool ForwardDDL = true; static bool ForwardDDL = true;
static bool RegressTestMode = false;
/* /*
* CURL docs say that this buffer must exist until we call curl_easy_cleanup * CURL docs say that this buffer must exist until we call curl_easy_cleanup
@@ -803,14 +802,6 @@ NeonProcessUtility(
case T_DropRoleStmt: case T_DropRoleStmt:
HandleDropRole(castNode(DropRoleStmt, parseTree)); HandleDropRole(castNode(DropRoleStmt, parseTree));
break; break;
case T_CreateTableSpaceStmt:
if (!RegressTestMode)
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("CREATE TABLESPACE is not supported on Neon")));
}
break;
default: default:
break; break;
} }
@@ -873,18 +864,6 @@ InitControlPlaneConnector()
NULL, NULL,
NULL); NULL);
DefineCustomBoolVariable(
"neon.regress_test_mode",
"Controls whether we are running in the regression test mode",
NULL,
&RegressTestMode,
false,
PGC_SUSET,
0,
NULL,
NULL,
NULL);
jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
if (!jwt_token) if (!jwt_token)
{ {

184
poetry.lock generated
View File

@@ -1,103 +1,91 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
version = "2.3.5"
description = "Happy Eyeballs for asyncio"
optional = false
python-versions = ">=3.8"
files = [
{file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
{file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
]
[[package]] [[package]]
name = "aiohttp" name = "aiohttp"
version = "3.10.2" version = "3.9.4"
description = "Async http client/server framework (asyncio)" description = "Async http client/server framework (asyncio)"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
{file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
{file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
{file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
{file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
{file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
{file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
{file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
{file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
{file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
{file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
{file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
{file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
{file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
{file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
{file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
{file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
{file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
{file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
{file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
{file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
{file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
{file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
{file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
{file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
{file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
] ]
[package.dependencies] [package.dependencies]
aiohappyeyeballs = ">=2.3.0"
aiosignal = ">=1.1.2" aiosignal = ">=1.1.2"
async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
attrs = ">=17.3.0" attrs = ">=17.3.0"
@@ -106,7 +94,7 @@ multidict = ">=4.5,<7.0"
yarl = ">=1.0,<2.0" yarl = ">=1.0,<2.0"
[package.extras] [package.extras]
speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] speedups = ["Brotli", "aiodns", "brotlicffi"]
[[package]] [[package]]
name = "aiopg" name = "aiopg"
@@ -1526,20 +1514,6 @@ files = [
[package.dependencies] [package.dependencies]
six = "*" six = "*"
[[package]]
name = "kafka-python"
version = "2.0.2"
description = "Pure Python client for Apache Kafka"
optional = false
python-versions = "*"
files = [
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
]
[package.extras]
crc32c = ["crc32c"]
[[package]] [[package]]
name = "lazy-object-proxy" name = "lazy-object-proxy"
version = "1.10.0" version = "1.10.0"
@@ -3383,4 +3357,4 @@ cffi = ["cffi (>=1.11)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"

View File

@@ -1,5 +1,5 @@
[package] [package]
name = "proxy-core" name = "proxy"
version = "0.1.0" version = "0.1.0"
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
@@ -9,11 +9,8 @@ default = []
testing = [] testing = []
[dependencies] [dependencies]
proxy-sasl = { version = "0.1", path = "../sasl" }
ahash.workspace = true ahash.workspace = true
anyhow.workspace = true anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true async-compression.workspace = true
async-trait.workspace = true async-trait.workspace = true
atomic-take.workspace = true atomic-take.workspace = true
@@ -33,6 +30,7 @@ dashmap.workspace = true
env_logger.workspace = true env_logger.workspace = true
framed-websockets.workspace = true framed-websockets.workspace = true
futures.workspace = true futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true hashbrown.workspace = true
hashlink.workspace = true hashlink.workspace = true
hex.workspace = true hex.workspace = true
@@ -53,15 +51,17 @@ md5.workspace = true
measured = { workspace = true, features = ["lasso"] } measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true metrics.workspace = true
once_cell.workspace = true once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true parking_lot.workspace = true
parquet.workspace = true parquet.workspace = true
parquet_derive.workspace = true parquet_derive.workspace = true
pin-project-lite.workspace = true pin-project-lite.workspace = true
postgres_backend.workspace = true postgres_backend.workspace = true
pq_proto.workspace = true pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true rand.workspace = true
regex.workspace = true regex.workspace = true
remote_storage = { version = "0.1", path = "../../libs/remote_storage/" } remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
reqwest.workspace = true reqwest.workspace = true
reqwest-middleware = { workspace = true, features = ["json"] } reqwest-middleware = { workspace = true, features = ["json"] }
reqwest-retry.workspace = true reqwest-retry.workspace = true
@@ -73,13 +73,14 @@ rustls.workspace = true
scopeguard.workspace = true scopeguard.workspace = true
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
sha2 = { workspace = true, features = ["asm", "oid"] } sha2 = { workspace = true, features = ["asm"] }
smol_str.workspace = true smol_str.workspace = true
smallvec.workspace = true smallvec.workspace = true
socket2.workspace = true socket2.workspace = true
subtle.workspace = true subtle.workspace = true
task-local-extensions.workspace = true task-local-extensions.workspace = true
thiserror.workspace = true thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true tokio-postgres-rustls.workspace = true
@@ -91,7 +92,6 @@ tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true tracing-subscriber.workspace = true
tracing-utils.workspace = true tracing-utils.workspace = true
tracing.workspace = true tracing.workspace = true
try-lock.workspace = true
typed-json.workspace = true typed-json.workspace = true
url.workspace = true url.workspace = true
urlencoding.workspace = true urlencoding.workspace = true
@@ -102,14 +102,6 @@ x509-parser.workspace = true
postgres-protocol.workspace = true postgres-protocol.workspace = true
redis.workspace = true redis.workspace = true
# jwt stuff
jose-jwa = "0.1.2"
jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
signature = "2"
ecdsa = "0.16"
p256 = "0.13"
rsa = "0.9"
workspace_hack.workspace = true workspace_hack.workspace = true
[dev-dependencies] [dev-dependencies]

View File

@@ -1,554 +0,0 @@
use std::{future::Future, sync::Arc, time::Duration};
use anyhow::{bail, ensure, Context};
use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use signature::Verifier;
use tokio::time::Instant;
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
// TODO(conrad): make these configurable.
// Minimum spacing between forced refreshes (presumably rate-limits fetches triggered
// by unknown key IDs — TODO confirm against callers; usage not visible here).
const MIN_RENEW: Duration = Duration::from_secs(30);
// Past this age, a background renewal is kicked off without blocking the caller.
const AUTO_RENEW: Duration = Duration::from_secs(300);
// Past this age, the cached JWKs are considered stale and callers wait for a renewal.
const MAX_RENEW: Duration = Duration::from_secs(3600);
// Upper bound on a single JWKS HTTP response body (64 KiB).
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
/// How to get the JWT auth rules.
///
/// Implementors asynchronously resolve the [`AuthRules`] (the set of JWKS URLs)
/// that should be scraped for JSON Web Keys.
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
}
/// [`FetchAuthRules`] implementation that is intended to ask the control plane
/// for an endpoint's auth rules. Currently a stub: it always returns an error.
#[derive(Clone)]
struct FetchAuthRulesFromCplane {
// Kept for the future cplane lookup; unused while this is a stub.
#[allow(dead_code)]
endpoint: EndpointIdInt,
}
impl FetchAuthRules for FetchAuthRulesFromCplane {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
// Stub: control-plane integration is not implemented yet.
Err(anyhow::anyhow!("not yet implemented"))
}
}
/// JWT auth rules fetched via [`FetchAuthRules`].
pub struct AuthRules {
// The control plane may return multiple JWKS URLs; all of them are scraped.
jwks_urls: Vec<url::Url>,
}
/// Cache of JSON Web Key sets, keyed by endpoint.
#[derive(Default)]
pub struct JwkCache {
// Shared HTTP client used to fetch the JWKS URLs.
client: reqwest::Client,
// One cache entry (with its own renewal lock) per endpoint.
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
}
/// Cached JWKs for one endpoint, plus the synchronization needed to renew them
/// without multiple tasks refetching at once.
pub struct JwkCacheEntryLock {
// Latest fetched snapshot; `None` until the first successful fetch.
cached: ArcSwapOption<JwkCacheEntry>,
// Single-permit semaphore: at most one renewal in flight per entry.
lookup: tokio::sync::Semaphore,
}
impl Default for JwkCacheEntryLock {
    /// Start with an empty cache and a single renewal permit available.
    fn default() -> Self {
        Self {
            lookup: tokio::sync::Semaphore::new(1),
            cached: ArcSwapOption::empty(),
        }
    }
}
/// A snapshot of the JWKs scraped for one endpoint at a point in time.
pub struct JwkCacheEntry {
/// Should refetch at least every hour to verify when old keys have been removed.
/// Should refetch when new key IDs are seen only every 5 minutes or so
last_retrieved: Instant,
/// cplane will return multiple JWKs urls that we need to scrape.
// Maps each JWKS URL to the key set it served on the last successful fetch.
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
}
impl JwkCacheEntryLock {
/// Wait until the renewal permit for this entry is available.
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
}
/// Take the renewal permit without waiting; `None` if a renewal is already in flight.
fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
JwkRenewalPermit::try_acquire_permit(self)
}
/// Fetch every JWKS URL returned by `auth_rules` and atomically replace the
/// cached entry with the merged result.
///
/// Holding `_permit` guarantees only one renewal runs per cache entry; the
/// freshness re-check below handles the case where another task finished a
/// renewal while we were waiting for the permit.
///
/// Individual URL failures (HTTP or decode) are logged and skipped, so the
/// resulting entry may contain fewer key sets than there are URLs; only the
/// `fetch_auth_rules` call itself can fail this function.
async fn renew_jwks<F: FetchAuthRules>(
&self,
_permit: JwkRenewalPermit<'_>,
client: &reqwest::Client,
auth_rules: &F,
) -> anyhow::Result<Arc<JwkCacheEntry>> {
// double check that no one beat us to updating the cache.
let now = Instant::now();
let guard = self.cached.load_full();
if let Some(cached) = guard {
let last_update = now.duration_since(cached.last_retrieved);
// NOTE(review): literal 300s — presumably meant to be AUTO_RENEW; confirm.
if last_update < Duration::from_secs(300) {
return Ok(cached);
}
}
let rules = auth_rules.fetch_auth_rules().await?;
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
rules.jwks_urls.len(),
ahash::RandomState::new(),
);
// TODO(conrad): run concurrently
for url in rules.jwks_urls {
let req = client.get(url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
match req.send().await.and_then(|r| r.error_for_status()) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
// Body size is capped at MAX_JWK_BODY_SIZE while decoding.
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
resp.into_body(),
MAX_JWK_BODY_SIZE,
)
.await
{
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
Ok(jwks) => {
key_sets.insert(url, jwks);
}
}
}
}
}
// Publish the new snapshot; readers observe it via `cached.load_full()`.
let entry = Arc::new(JwkCacheEntry {
last_retrieved: now,
key_sets,
});
self.cached.swap(Some(Arc::clone(&entry)));
Ok(entry)
}
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
self: &Arc<Self>,
client: &reqwest::Client,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
let now = Instant::now();
let guard = self.cached.load_full();
// if we have no cached JWKs, try and get some
let Some(cached) = guard else {
let permit = self.acquire_permit().await;
return self.renew_jwks(permit, client, fetch).await;
};
let last_update = now.duration_since(cached.last_retrieved);
// check if the cached JWKs need updating.
if last_update > MAX_RENEW {
let permit = self.acquire_permit().await;
// it's been too long since we checked the keys. wait for them to update.
return self.renew_jwks(permit, client, fetch).await;
}
// every 5 minutes we should spawn a job to eagerly update the token.
if last_update > AUTO_RENEW {
if let Some(permit) = self.try_acquire_permit() {
tracing::debug!("JWKs should be renewed. Renewal permit acquired");
let permit = permit.into_owned();
let entry = self.clone();
let client = client.clone();
let fetch = fetch.clone();
tokio::spawn(async move {
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
tracing::warn!(error=?e, "could not fetch JWKs in background job");
}
});
} else {
tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
}
}
Ok(cached)
}
async fn check_jwt<F: FetchAuthRules>(
self: &Arc<Self>,
jwt: String,
client: &reqwest::Client,
fetch: &F,
) -> Result<(), anyhow::Error> {
// JWT compact form is defined to be
// <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
let (header_payload, signature) = jwt
.rsplit_once(".")
.context("not a valid compact JWT encoding")?;
let (header, _payload) = header_payload
.split_once(".")
.context("not a valid compact JWT encoding")?;
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
.context("not a valid compact JWT encoding")?;
let header = serde_json::from_slice::<JWTHeader>(&header)
.context("not a valid compact JWT encoding")?;
ensure!(header.typ == "JWT");
let kid = header.kid.context("missing key id")?;
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
// get the key from the JWKs if possible. If not, wait for the keys to update.
let jwk = loop {
let jwk = guard
.key_sets
.values()
.flat_map(|jwks| &jwks.keys)
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
match jwk {
Some(jwk) => break jwk,
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
let permit = self.acquire_permit().await;
guard = self.renew_jwks(permit, client, fetch).await?;
}
_ => {
bail!("jwk not found");
}
}
};
ensure!(
jwk.is_supported(&header.alg),
"signature algorithm not supported"
);
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
.context("not a valid compact JWT encoding")?;
match &jwk.key {
jose_jwk::Key::Ec(key) => {
verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
}
jose_jwk::Key::Rsa(key) => {
verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
}
key => bail!("unsupported key type {key:?}"),
};
// TODO(conrad): verify iss, exp, nbf, etc...
Ok(())
}
}
impl JwkCache {
    /// Validate `jwt` for the given endpoint, lazily creating the
    /// endpoint's cache entry on first use.
    pub async fn check_jwt(
        &self,
        endpoint: EndpointIdInt,
        jwt: String,
    ) -> Result<(), anyhow::Error> {
        // Fast path: shared (read) lookup of an existing entry.
        let entry = match self.map.get(&endpoint).as_deref() {
            Some(existing) => Arc::clone(existing),
            // Slow path: take the write lock and insert a default entry.
            None => Arc::clone(&*self.map.entry(endpoint).or_default()),
        };
        let fetch = FetchAuthRulesFromCplane { endpoint };
        entry.check_jwt(jwt, &self.client, &fetch).await
    }
}
/// Verify an ECDSA signature over `data` using the given EC JWK.
/// Only the P-256 curve is supported.
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
    use ecdsa::Signature;
    use signature::Verifier;

    match key.crv {
        jose_jwk::EcCurves::P256 => {
            let pk =
                p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
            let verifier = p256::ecdsa::VerifyingKey::from(&pk);
            verifier.verify(data, &Signature::from_slice(sig)?)?;
            Ok(())
        }
        key => bail!("unsupported ec key type {key:?}"),
    }
}
fn verify_rsa_signature(
data: &[u8],
sig: &[u8],
key: &jose_jwk::Rsa,
alg: &Option<jose_jwa::Algorithm>,
) -> anyhow::Result<()> {
use jose_jwa::{Algorithm, Signing};
use rsa::{
pkcs1v15::{Signature, VerifyingKey},
RsaPublicKey,
};
let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
match alg {
Some(Algorithm::Signing(Signing::Rs256)) => {
let key = VerifyingKey::<sha2::Sha256>::new(key);
let sig = Signature::try_from(sig)?;
key.verify(data, &sig)?;
}
_ => bail!("invalid RSA signing algorithm"),
};
Ok(())
}
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
#[derive(serde::Deserialize, serde::Serialize)]
struct JWTHeader<'a> {
    /// must be "JWT"
    // NOTE(review): RFC 7515 marks "typ" optional; we require it and check
    // it case-sensitively in `check_jwt` — confirm all issuers send it.
    typ: &'a str,
    /// must be a supported alg
    alg: jose_jwa::Algorithm,
    /// key id, must be provided for our usecase
    kid: Option<&'a str>,
}
/// Token proving the holder may renew one cache entry's JWKs.
/// Dropping it hands the permit back to the entry's semaphore.
struct JwkRenewalPermit<'a> {
    // None once the permit has been moved out via `into_owned`.
    inner: Option<JwkRenewalPermitInner<'a>>,
}
enum JwkRenewalPermitInner<'a> {
    // Owns an Arc so the permit can outlive the borrow (background tasks).
    Owned(Arc<JwkCacheEntryLock>),
    Borrowed(&'a Arc<JwkCacheEntryLock>),
}
impl JwkRenewalPermit<'_> {
    /// Convert a borrowed permit into an owned (`'static`) one so it can be
    /// moved into a spawned task. `take()` empties `self.inner`, making the
    /// old wrapper's `Drop` a no-op — the permit is not double-released.
    fn into_owned(mut self) -> JwkRenewalPermit<'static> {
        JwkRenewalPermit {
            inner: self.inner.take().map(JwkRenewalPermitInner::into_owned),
        }
    }
    /// Wait for the entry's single renewal permit.
    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
        match from.lookup.acquire().await {
            Ok(permit) => {
                // Detach the permit from the semaphore guard; it is
                // re-added manually in `Drop`.
                permit.forget();
                JwkRenewalPermit {
                    inner: Some(JwkRenewalPermitInner::Borrowed(from)),
                }
            }
            // The semaphore is never closed by this module.
            Err(_) => panic!("semaphore should not be closed"),
        }
    }
    /// Non-blocking variant: `None` if the permit is currently held.
    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
        match from.lookup.try_acquire() {
            Ok(permit) => {
                // Same forget/re-add protocol as `acquire_permit`.
                permit.forget();
                Some(JwkRenewalPermit {
                    inner: Some(JwkRenewalPermitInner::Borrowed(from)),
                })
            }
            Err(tokio::sync::TryAcquireError::NoPermits) => None,
            Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"),
        }
    }
}
impl JwkRenewalPermitInner<'_> {
    /// Upgrade to a `'static` permit, cloning the `Arc` if it was borrowed.
    fn into_owned(self) -> JwkRenewalPermitInner<'static> {
        let arc = match self {
            JwkRenewalPermitInner::Owned(arc) => arc,
            JwkRenewalPermitInner::Borrowed(arc) => Arc::clone(arc),
        };
        JwkRenewalPermitInner::Owned(arc)
    }
}
impl Drop for JwkRenewalPermit<'_> {
fn drop(&mut self) {
let entry = match &self.inner {
None => return,
Some(JwkRenewalPermitInner::Owned(p)) => p,
Some(JwkRenewalPermitInner::Borrowed(p)) => *p,
};
entry.lookup.add_permits(1);
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
    use base64::URL_SAFE_NO_PAD;
    use bytes::Bytes;
    use http::Response;
    use http_body_util::Full;
    use hyper1::service::service_fn;
    use hyper_util::rt::TokioIo;
    use rand::rngs::OsRng;
    use signature::Signer;
    use tokio::net::TcpListener;
    /// Generate a fresh P-256 key pair and the matching public JWK (ES256)
    /// carrying the given key id.
    fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
        let sk = p256::SecretKey::random(&mut OsRng);
        let pk = sk.public_key().into();
        let jwk = jose_jwk::Jwk {
            key: jose_jwk::Key::Ec(pk),
            prm: jose_jwk::Parameters {
                kid: Some(kid),
                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
                ..Default::default()
            },
        };
        (sk, jwk)
    }
    /// Generate a fresh 2048-bit RSA key pair and the matching public JWK
    /// (RS256) carrying the given key id.
    fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
        let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
        let pk = sk.to_public_key().into();
        let jwk = jose_jwk::Jwk {
            key: jose_jwk::Key::Rsa(pk),
            prm: jose_jwk::Parameters {
                kid: Some(kid),
                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
                ..Default::default()
            },
        };
        (sk, jwk)
    }
    /// Build the unsigned `<B64(header)>.<B64(payload)>` part of a compact
    /// JWT, with an expiry one hour in the future.
    fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
        let header = JWTHeader {
            typ: "JWT",
            alg: jose_jwa::Algorithm::Signing(sig),
            kid: Some(&kid),
        };
        let body = typed_json::json! {{
            "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
        }};
        let header =
            base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
        let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
        format!("{header}.{body}")
    }
    /// Produce a complete ES256 compact JWT signed with `key`.
    fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
        use p256::ecdsa::{Signature, SigningKey};
        let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
        let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
        format!("{payload}.{sig}")
    }
    /// Produce a complete RS256 compact JWT signed with `key`.
    fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
        use rsa::pkcs1v15::SigningKey;
        use rsa::signature::SignatureEncoding;
        let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
        let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
        format!("{payload}.{sig}")
    }
    /// End-to-end: serve two JWKs documents from a local HTTP server and
    /// verify that tokens signed by keys from either document validate.
    #[tokio::test]
    async fn renew() {
        let (rs1, jwk1) = new_rsa_jwk("1".into());
        let (rs2, jwk2) = new_rsa_jwk("2".into());
        let (ec1, jwk3) = new_ec_jwk("3".into());
        let (ec2, jwk4) = new_ec_jwk("4".into());
        let jwt1 = new_rsa_jwt("1".into(), rs1);
        let jwt2 = new_rsa_jwt("2".into(), rs2);
        let jwt3 = new_ec_jwt("3".into(), ec1);
        let jwt4 = new_ec_jwt("4".into(), ec2);
        // Split the keys across two JWK sets to mimic cplane returning
        // multiple JWKs URLs.
        let foo_jwks = jose_jwk::JwkSet {
            keys: vec![jwk1, jwk3],
        };
        let bar_jwks = jose_jwk::JwkSet {
            keys: vec![jwk2, jwk4],
        };
        let service = service_fn(move |req| {
            let foo_jwks = foo_jwks.clone();
            let bar_jwks = bar_jwks.clone();
            async move {
                let jwks = match req.uri().path() {
                    "/foo" => &foo_jwks,
                    "/bar" => &bar_jwks,
                    _ => {
                        return Response::builder()
                            .status(404)
                            .body(Full::new(Bytes::new()));
                    }
                };
                let body = serde_json::to_vec(jwks).unwrap();
                Response::builder()
                    .status(200)
                    .body(Full::new(Bytes::from(body)))
            }
        });
        let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
        let server = hyper1::server::conn::http1::Builder::new();
        let addr = listener.local_addr().unwrap();
        // Accept loop for the throwaway HTTP server.
        tokio::spawn(async move {
            loop {
                let (s, _) = listener.accept().await.unwrap();
                let serve = server.serve_connection(TokioIo::new(s), service.clone());
                tokio::spawn(serve.into_future());
            }
        });
        let client = reqwest::Client::new();
        // Test FetchAuthRules impl pointing at both local JWKs paths.
        #[derive(Clone)]
        struct Fetch(SocketAddr);
        impl FetchAuthRules for Fetch {
            async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
                Ok(AuthRules {
                    jwks_urls: vec![
                        format!("http://{}/foo", self.0).parse().unwrap(),
                        format!("http://{}/bar", self.0).parse().unwrap(),
                    ],
                })
            }
        }
        let jwk_cache = Arc::new(JwkCacheEntryLock::default());
        // All four tokens must verify: RSA and EC keys, spread across both
        // JWKs documents.
        jwk_cache
            .check_jwt(jwt1, &client, &Fetch(addr))
            .await
            .unwrap();
        jwk_cache
            .check_jwt(jwt2, &client, &Fetch(addr))
            .await
            .unwrap();
        jwk_cache
            .check_jwt(jwt3, &client, &Fetch(addr))
            .await
            .unwrap();
        jwk_cache
            .check_jwt(jwt4, &client, &Fetch(addr))
            .await
            .unwrap();
    }
}

View File

@@ -1,29 +0,0 @@
[package]
name = "pg_sni_router"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
testing = []
[dependencies]
proxy-sasl = { version = "0.1", path = "../sasl" }
proxy-core = { version = "0.1", path = "../core" }
anyhow.workspace = true
clap.workspace = true
futures.workspace = true
git-version.workspace = true
itertools.workspace = true
pq_proto.workspace = true
rustls-pemfile.workspace = true
rustls.workspace = true
socket2.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-utils.workspace = true
tracing.workspace = true
utils.workspace = true
uuid.workspace = true

View File

@@ -1,34 +0,0 @@
[package]
name = "proxy"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
testing = []
[dependencies]
proxy-sasl = { version = "0.1", path = "../sasl" }
proxy-core = { version = "0.1", path = "../core" }
anyhow.workspace = true
aws-config.workspace = true
clap.workspace = true
futures.workspace = true
git-version.workspace = true
humantime.workspace = true
itertools.workspace = true
metrics.workspace = true
pq_proto.workspace = true
remote_storage = { version = "0.1", path = "../../libs/remote_storage/" }
rustls-pemfile.workspace = true
rustls.workspace = true
socket2.workspace = true
tikv-jemallocator.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-utils.workspace = true
tracing.workspace = true
utils.workspace = true
uuid.workspace = true

View File

@@ -1,37 +0,0 @@
[package]
name = "proxy-sasl"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
testing = []
[dependencies]
ahash.workspace = true
anyhow.workspace = true
base64.workspace = true
bytes = { workspace = true, features = ["serde"] }
crossbeam-deque.workspace = true
hmac.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }
measured = { workspace = true, features = ["lasso"] }
parking_lot.workspace = true
pq_proto.workspace = true
rand.workspace = true
rustls.workspace = true
sha2 = { workspace = true, features = ["asm", "oid"] }
subtle.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing.workspace = true
x509-parser.workspace = true
postgres-protocol.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
pbkdf2 = { workspace = true, features = ["simple", "std"] }
uuid.workspace = true

View File

@@ -1,3 +0,0 @@
mod parse;
pub mod sasl;
pub mod scram;

View File

@@ -1,43 +0,0 @@
//! Small parsing helpers.
use std::ffi::CStr;
/// Split `bytes` at the first NUL byte, returning the leading C string and
/// everything after the terminator. Returns `None` if no NUL is present.
pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
    let cstr = CStr::from_bytes_until_nul(bytes).ok()?;
    // Everything past the NUL terminator belongs to the remainder.
    let rest = &bytes[cstr.to_bytes_with_nul().len()..];
    Some((cstr, rest))
}
/// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.
///
/// Split off a fixed-size head of `N` bytes; `None` if `bytes` is shorter.
pub fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
    if bytes.len() < N {
        return None;
    }
    let (head, tail) = bytes.split_at(N);
    // `head` has exactly N bytes, so the array conversion cannot fail.
    Some((head.try_into().unwrap(), tail))
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_cstr() {
        // No NUL terminator anywhere -> no split possible.
        assert!(split_cstr(b"").is_none());
        assert!(split_cstr(b"foo").is_none());

        // A lone NUL yields an empty C string and an empty tail.
        let (cstr, rest) = split_cstr(b"\0").expect("uh-oh");
        assert_eq!((cstr.to_bytes(), rest), (&b""[..], &b""[..]));

        // Bytes after the first NUL come back untouched.
        let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh");
        assert_eq!((cstr.to_bytes(), rest), (&b"foo"[..], &b"bar"[..]));
    }

    #[test]
    fn test_split_at_const() {
        // N == 0 always succeeds; N larger than the input never does.
        assert!(split_at_const::<0>(b"").is_some());
        assert!(split_at_const::<1>(b"").is_none());
        assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k"))));
    }
}

View File

@@ -38,7 +38,7 @@ pub enum AuthErrorImpl {
/// SASL protocol errors (includes [SCRAM](crate::scram)). /// SASL protocol errors (includes [SCRAM](crate::scram)).
#[error(transparent)] #[error(transparent)]
Sasl(#[from] proxy_sasl::sasl::Error), Sasl(#[from] crate::sasl::Error),
#[error("Unsupported authentication method: {0}")] #[error("Unsupported authentication method: {0}")]
BadAuthMethod(Box<str>), BadAuthMethod(Box<str>),
@@ -148,28 +148,3 @@ impl ReportableError for AuthError {
} }
} }
} }
impl UserFacingError for proxy_sasl::sasl::Error {
fn to_string_client(&self) -> String {
match self {
proxy_sasl::sasl::Error::ChannelBindingFailed(m) => m.to_string(),
proxy_sasl::sasl::Error::ChannelBindingBadMethod(m) => {
format!("unsupported channel binding method {m}")
}
_ => "authentication protocol violation".to_string(),
}
}
}
impl ReportableError for proxy_sasl::sasl::Error {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
proxy_sasl::sasl::Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::BadClientMessage(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::MissingBinding => crate::error::ErrorKind::Service,
proxy_sasl::sasl::Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
proxy_sasl::sasl::Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
}
}
}

View File

@@ -1,6 +1,5 @@
mod classic; mod classic;
mod hacks; mod hacks;
pub mod jwt;
mod link; mod link;
use std::net::IpAddr; use std::net::IpAddr;
@@ -9,7 +8,6 @@ use std::time::Duration;
use ipnet::{Ipv4Net, Ipv6Net}; use ipnet::{Ipv4Net, Ipv6Net};
pub use link::LinkAuthError; pub use link::LinkAuthError;
use proxy_sasl::scram;
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::AuthKeys; use tokio_postgres::config::AuthKeys;
use tracing::{info, warn}; use tracing::{info, warn};
@@ -37,7 +35,7 @@ use crate::{
}, },
stream, url, stream, url,
}; };
use crate::{EndpointCacheKey, EndpointId, RoleName}; use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> { pub enum MaybeOwned<'a, T> {
@@ -220,7 +218,7 @@ impl RateBucketInfo {
impl AuthenticationConfig { impl AuthenticationConfig {
pub fn check_rate_limit( pub fn check_rate_limit(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
config: &AuthenticationConfig, config: &AuthenticationConfig,
secret: AuthSecret, secret: AuthSecret,
endpoint: &EndpointId, endpoint: &EndpointId,
@@ -245,7 +243,7 @@ impl AuthenticationConfig {
let limit_not_exceeded = self.rate_limiter.check( let limit_not_exceeded = self.rate_limiter.check(
( (
endpoint_int, endpoint_int,
MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
), ),
password_weight, password_weight,
); );
@@ -276,7 +274,7 @@ impl AuthenticationConfig {
/// ///
/// All authentication flows will emit an AuthenticationOk message if successful. /// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks( async fn auth_quirks(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
api: &impl console::Api, api: &impl console::Api,
user_info: ComputeUserInfoMaybeEndpoint, user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -305,8 +303,8 @@ async fn auth_quirks(
let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
// check allowed list // check allowed list
if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
} }
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
@@ -358,7 +356,7 @@ async fn auth_quirks(
} }
async fn authenticate_with_secret( async fn authenticate_with_secret(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
secret: AuthSecret, secret: AuthSecret,
info: ComputeUserInfo, info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -372,8 +370,8 @@ async fn authenticate_with_secret(
let auth_outcome = let auth_outcome =
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?; validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
let keys = match auth_outcome { let keys = match auth_outcome {
proxy_sasl::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Success(key) => key,
proxy_sasl::sasl::Outcome::Failure(reason) => { crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}"); info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user)); return Err(auth::AuthError::auth_failed(&*info.user));
} }
@@ -423,7 +421,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate( pub async fn authenticate(
self, self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool, allow_cleartext: bool,
config: &'static AuthenticationConfig, config: &'static AuthenticationConfig,
@@ -469,7 +467,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
impl BackendType<'_, ComputeUserInfo, &()> { impl BackendType<'_, ComputeUserInfo, &()> {
pub async fn get_role_secret( pub async fn get_role_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
) -> Result<CachedRoleSecret, GetAuthInfoError> { ) -> Result<CachedRoleSecret, GetAuthInfoError> {
use BackendType::*; use BackendType::*;
match self { match self {
@@ -480,7 +478,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
pub async fn get_allowed_ips_and_secret( pub async fn get_allowed_ips_and_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> { ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
use BackendType::*; use BackendType::*;
match self { match self {
@@ -494,7 +492,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> { ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*; use BackendType::*;
@@ -516,7 +514,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> { ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*; use BackendType::*;
@@ -559,9 +557,9 @@ mod tests {
context::RequestMonitoring, context::RequestMonitoring,
proxy::NeonOptions, proxy::NeonOptions,
rate_limiter::{EndpointRateLimiter, RateBucketInfo}, rate_limiter::{EndpointRateLimiter, RateBucketInfo},
scram::{threadpool::ThreadPool, ServerSecret},
stream::{PqStream, Stream}, stream::{PqStream, Stream},
}; };
use proxy_sasl::scram::{threadpool::ThreadPool, ServerSecret};
use super::{auth_quirks, AuthRateLimiter}; use super::{auth_quirks, AuthRateLimiter};
@@ -573,7 +571,7 @@ mod tests {
impl console::Api for Auth { impl console::Api for Auth {
async fn get_role_secret( async fn get_role_secret(
&self, &self,
_ctx: &RequestMonitoring, _ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo, _user_info: &super::ComputeUserInfo,
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> { ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -581,7 +579,7 @@ mod tests {
async fn get_allowed_ips_and_secret( async fn get_allowed_ips_and_secret(
&self, &self,
_ctx: &RequestMonitoring, _ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo, _user_info: &super::ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError> ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
{ {
@@ -593,7 +591,7 @@ mod tests {
async fn wake_compute( async fn wake_compute(
&self, &self,
_ctx: &RequestMonitoring, _ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo, _user_info: &super::ComputeUserInfo,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> { ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
unimplemented!() unimplemented!()
@@ -667,14 +665,10 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024); let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server)); let mut stream = PqStream::new(Stream::from_raw(server));
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let api = Auth { let api = Auth {
ips: vec![], ips: vec![],
secret: AuthSecret::Scram( secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
}; };
let user_info = ComputeUserInfoMaybeEndpoint { let user_info = ComputeUserInfoMaybeEndpoint {
@@ -729,7 +723,7 @@ mod tests {
)); ));
let _creds = auth_quirks( let _creds = auth_quirks(
&ctx, &mut ctx,
&api, &api,
user_info, user_info,
&mut stream, &mut stream,
@@ -748,14 +742,10 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024); let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server)); let mut stream = PqStream::new(Stream::from_raw(server));
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let api = Auth { let api = Auth {
ips: vec![], ips: vec![],
secret: AuthSecret::Scram( secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
}; };
let user_info = ComputeUserInfoMaybeEndpoint { let user_info = ComputeUserInfoMaybeEndpoint {
@@ -785,7 +775,7 @@ mod tests {
)); ));
let _creds = auth_quirks( let _creds = auth_quirks(
&ctx, &mut ctx,
&api, &api,
user_info, user_info,
&mut stream, &mut stream,
@@ -804,14 +794,10 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024); let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server)); let mut stream = PqStream::new(Stream::from_raw(server));
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let api = Auth { let api = Auth {
ips: vec![], ips: vec![],
secret: AuthSecret::Scram( secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
}; };
let user_info = ComputeUserInfoMaybeEndpoint { let user_info = ComputeUserInfoMaybeEndpoint {
@@ -842,7 +828,7 @@ mod tests {
)); ));
let creds = auth_quirks( let creds = auth_quirks(
&ctx, &mut ctx,
&api, &api,
user_info, user_info,
&mut stream, &mut stream,

View File

@@ -5,14 +5,14 @@ use crate::{
config::AuthenticationConfig, config::AuthenticationConfig,
console::AuthSecret, console::AuthSecret,
context::RequestMonitoring, context::RequestMonitoring,
sasl,
stream::{PqStream, Stream}, stream::{PqStream, Stream},
}; };
use proxy_sasl::sasl;
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn}; use tracing::{info, warn};
pub(super) async fn authenticate( pub(super) async fn authenticate(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
creds: ComputeUserInfo, creds: ComputeUserInfo,
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
config: &'static AuthenticationConfig, config: &'static AuthenticationConfig,
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
} }
AuthSecret::Scram(secret) => { AuthSecret::Scram(secret) => {
info!("auth endpoint chooses SCRAM"); info!("auth endpoint chooses SCRAM");
let scram = auth::Scram(&secret, ctx); let scram = auth::Scram(&secret, &mut *ctx);
let auth_outcome = tokio::time::timeout( let auth_outcome = tokio::time::timeout(
config.scram_protocol_timeout, config.scram_protocol_timeout,

View File

@@ -7,9 +7,9 @@ use crate::{
console::AuthSecret, console::AuthSecret,
context::RequestMonitoring, context::RequestMonitoring,
intern::EndpointIdInt, intern::EndpointIdInt,
sasl,
stream::{self, Stream}, stream::{self, Stream},
}; };
use proxy_sasl::sasl;
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn}; use tracing::{info, warn};
@@ -18,7 +18,7 @@ use tracing::{info, warn};
/// These properties are benefical for serverless JS workers, so we /// These properties are benefical for serverless JS workers, so we
/// use this mechanism for websocket connections. /// use this mechanism for websocket connections.
pub async fn authenticate_cleartext( pub async fn authenticate_cleartext(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
info: ComputeUserInfo, info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
secret: AuthSecret, secret: AuthSecret,
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
ctx.set_auth_method(crate::context::AuthMethod::Cleartext); ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client // pause the timer while we communicate with the client
let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let ep = EndpointIdInt::from(&info.endpoint); let ep = EndpointIdInt::from(&info.endpoint);
@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
/// Similar to [`authenticate_cleartext`], but there's a specific password format, /// Similar to [`authenticate_cleartext`], but there's a specific password format,
/// and passwords are not yet validated (we don't know how to validate them!) /// and passwords are not yet validated (we don't know how to validate them!)
pub async fn password_hack_no_authentication( pub async fn password_hack_no_authentication(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
info: ComputeUserInfoNoEndpoint, info: ComputeUserInfoNoEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<ComputeCredentials> { ) -> auth::Result<ComputeCredentials> {
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
ctx.set_auth_method(crate::context::AuthMethod::Cleartext); ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client // pause the timer while we communicate with the client
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let payload = AuthFlow::new(client) let payload = AuthFlow::new(client)
.begin(auth::PasswordHack) .begin(auth::PasswordHack)

View File

@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
} }
pub(super) async fn authenticate( pub(super) async fn authenticate(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
link_uri: &reqwest::Url, link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>, client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> { ) -> auth::Result<NodeInfo> {

View File

@@ -84,7 +84,7 @@ pub fn endpoint_sni(
impl ComputeUserInfoMaybeEndpoint { impl ComputeUserInfoMaybeEndpoint {
pub fn parse( pub fn parse(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
params: &StartupMessageParams, params: &StartupMessageParams,
sni: Option<&str>, sni: Option<&str>,
common_names: Option<&HashSet<String>>, common_names: Option<&HashSet<String>>,
@@ -249,8 +249,8 @@ mod tests {
fn parse_bare_minimum() -> anyhow::Result<()> { fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required. // According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]); let options = StartupMessageParams::new([("user", "john_doe")]);
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None); assert_eq!(user_info.endpoint_id, None);
@@ -264,8 +264,8 @@ mod tests {
("database", "world"), // should be ignored ("database", "world"), // should be ignored
("foo", "bar"), // should be ignored ("foo", "bar"), // should be ignored
]); ]);
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None); assert_eq!(user_info.endpoint_id, None);
@@ -279,9 +279,9 @@ mod tests {
let sni = Some("foo.localhost"); let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into()); let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
assert_eq!(user_info.options.get_cache_key("foo"), "foo"); assert_eq!(user_info.options.get_cache_key("foo"), "foo");
@@ -296,8 +296,8 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"), ("options", "-ckey=1 project=bar -c geqo=off"),
]); ]);
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -311,8 +311,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"), ("options", "-ckey=1 endpoint=bar -c geqo=off"),
]); ]);
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -329,8 +329,8 @@ mod tests {
), ),
]); ]);
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none()); assert!(user_info.endpoint_id.is_none());
@@ -344,8 +344,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]); ]);
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?; let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none()); assert!(user_info.endpoint_id.is_none());
@@ -359,9 +359,9 @@ mod tests {
let sni = Some("baz.localhost"); let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into()); let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe"); assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz")); assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
@@ -374,16 +374,16 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into()); let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com"); let sni = Some("p1.a.com");
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into()); let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com"); let sni = Some("p1.b.com");
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
Ok(()) Ok(())
@@ -397,8 +397,9 @@ mod tests {
let sni = Some("second.localhost"); let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into()); let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail"); .expect_err("should fail");
match err { match err {
InconsistentProjectNames { domain, option } => { InconsistentProjectNames { domain, option } => {
@@ -416,8 +417,9 @@ mod tests {
let sni = Some("project.localhost"); let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into()); let common_names = Some(["example.com".into()].into());
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail"); .expect_err("should fail");
match err { match err {
UnknownCommonName { cn } => { UnknownCommonName { cn } => {
@@ -436,9 +438,9 @@ mod tests {
let sni = Some("project.localhost"); let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into()); let common_names = Some(["localhost".into()].into());
let ctx = RequestMonitoring::test(); let mut ctx = RequestMonitoring::test();
let user_info = let user_info =
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?; ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
assert_eq!( assert_eq!(
user_info.options.get_cache_key("project"), user_info.options.get_cache_key("project"),

View File

@@ -2,17 +2,16 @@
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
use crate::{ use crate::{
config::TlsServerEndPoint,
console::AuthSecret, console::AuthSecret,
context::RequestMonitoring, context::RequestMonitoring,
intern::EndpointIdInt, intern::EndpointIdInt,
sasl,
scram::{self, threadpool::ThreadPool},
stream::{PqStream, Stream}, stream::{PqStream, Stream},
}; };
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
use proxy_sasl::{
sasl,
scram::{self, threadpool::ThreadPool, TlsServerEndPoint},
};
use std::{io, sync::Arc}; use std::{io, sync::Arc};
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info; use tracing::info;
@@ -28,7 +27,7 @@ pub trait AuthMethod {
pub struct Begin; pub struct Begin;
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring); pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
impl AuthMethod for Scram<'_> { impl AuthMethod for Scram<'_> {
#[inline(always)] #[inline(always)]
@@ -57,7 +56,7 @@ impl AuthMethod for PasswordHack {
/// Use clear-text password auth called `password` in docs /// Use clear-text password auth called `password` in docs
/// <https://www.postgresql.org/docs/current/auth-password.html> /// <https://www.postgresql.org/docs/current/auth-password.html>
pub struct CleartextPassword { pub struct CleartextPassword {
pub pool: Arc<ThreadPool<EndpointIdInt>>, pub pool: Arc<ThreadPool>,
pub endpoint: EndpointIdInt, pub endpoint: EndpointIdInt,
pub secret: AuthSecret, pub secret: AuthSecret,
} }
@@ -156,7 +155,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
let Scram(secret, ctx) = self.state; let Scram(secret, ctx) = self.state;
// pause the timer while we communicate with the client // pause the timer while we communicate with the client
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
// Initial client message contains the chosen auth method's name. // Initial client message contains the chosen auth method's name.
let msg = self.stream.read_password_message().await?; let msg = self.stream.read_password_message().await?;
@@ -169,13 +168,15 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
} }
match sasl.method { match sasl.method {
SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256), SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus), SCRAM_SHA_256_PLUS => {
ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
}
_ => {} _ => {}
} }
info!("client chooses {}", sasl.method); info!("client chooses {}", sasl.method);
let outcome = sasl::SaslStream::new(&mut self.stream.framed, sasl.message) let outcome = sasl::SaslStream::new(self.stream, sasl.message)
.authenticate(scram::Exchange::new( .authenticate(scram::Exchange::new(
secret, secret,
rand::random, rand::random,
@@ -192,7 +193,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
} }
pub(crate) async fn validate_password_and_exchange( pub(crate) async fn validate_password_and_exchange(
pool: &ThreadPool<EndpointIdInt>, pool: &ThreadPool,
endpoint: EndpointIdInt, endpoint: EndpointIdInt,
password: &[u8], password: &[u8],
secret: AuthSecret, secret: AuthSecret,
@@ -207,8 +208,7 @@ pub(crate) async fn validate_password_and_exchange(
} }
// perform scram authentication as both client and server to validate the keys // perform scram authentication as both client and server to validate the keys
AuthSecret::Scram(scram_secret) => { AuthSecret::Scram(scram_secret) => {
let outcome = let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
proxy_sasl::scram::exchange(pool, endpoint, &scram_secret, password).await?;
let client_key = match outcome { let client_key = match outcome {
sasl::Outcome::Success(client_key) => client_key, sasl::Outcome::Success(client_key) => client_key,

View File

@@ -7,18 +7,17 @@ use std::{net::SocketAddr, sync::Arc};
use futures::future::Either; use futures::future::Either;
use itertools::Itertools; use itertools::Itertools;
use proxy_core::context::RequestMonitoring; use proxy::config::TlsServerEndPoint;
use proxy_core::metrics::Metrics; use proxy::context::RequestMonitoring;
use proxy_core::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::metrics::{Metrics, ThreadPoolMetrics};
use proxy_sasl::scram::threadpool::ThreadPoolMetrics; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
use proxy_sasl::scram::TlsServerEndPoint;
use rustls::pki_types::PrivateKeyDer; use rustls::pki_types::PrivateKeyDer;
use tokio::net::TcpListener; use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{anyhow, bail, ensure, Context};
use clap::Arg; use clap::Arg;
use futures::TryFutureExt; use futures::TryFutureExt;
use proxy_core::stream::{PqStream, Stream}; use proxy::stream::{PqStream, Stream};
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
@@ -63,7 +62,7 @@ fn cli() -> clap::Command {
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy_core::logging::init().await?; let _logging_guard = proxy::logging::init().await?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
@@ -134,14 +133,14 @@ async fn main() -> anyhow::Result<()> {
proxy_listener, proxy_listener,
cancellation_token.clone(), cancellation_token.clone(),
)); ));
let signals_task = tokio::spawn(proxy_core::handle_signals(cancellation_token)); let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));
// the signal task cant ever succeed. // the signal task cant ever succeed.
// the main task can error, or can succeed on cancellation. // the main task can error, or can succeed on cancellation.
// we want to immediately exit on either of these cases // we want to immediately exit on either of these cases
let signal = match futures::future::select(signals_task, main).await { let signal = match futures::future::select(signals_task, main).await {
Either::Left((res, _)) => proxy_core::flatten_err(res)?, Either::Left((res, _)) => proxy::flatten_err(res)?,
Either::Right((res, _)) => return proxy_core::flatten_err(res), Either::Right((res, _)) => return proxy::flatten_err(res),
}; };
// maintenance tasks return `Infallible` success values, this is an impossible value // maintenance tasks return `Infallible` success values, this is an impossible value
@@ -181,7 +180,7 @@ async fn task_main(
let ctx = RequestMonitoring::new( let ctx = RequestMonitoring::new(
session_id, session_id,
peer_addr.ip(), peer_addr.ip(),
proxy_core::metrics::Protocol::SniRouter, proxy::metrics::Protocol::SniRouter,
"sni", "sni",
); );
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
@@ -206,7 +205,7 @@ async fn task_main(
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>( async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
raw_stream: S, raw_stream: S,
tls_config: Arc<rustls::ServerConfig>, tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint, tls_server_end_point: TlsServerEndPoint,
@@ -250,20 +249,20 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
"unexpected startup packet, rejecting connection" "unexpected startup packet, rejecting connection"
); );
stream stream
.throw_error_str(ERR_INSECURE_CONNECTION, proxy_core::error::ErrorKind::User) .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User)
.await? .await?
} }
} }
} }
async fn handle_client( async fn handle_client(
ctx: RequestMonitoring, mut ctx: RequestMonitoring,
dest_suffix: Arc<String>, dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>, tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint, tls_server_end_point: TlsServerEndPoint,
stream: impl AsyncRead + AsyncWrite + Unpin, stream: impl AsyncRead + AsyncWrite + Unpin,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;
// Cut off first part of the SNI domain // Cut off first part of the SNI domain
// We receive required destination details in the format of // We receive required destination details in the format of

View File

@@ -5,39 +5,38 @@ use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig; use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_config::Region;
use futures::future::Either; use futures::future::Either;
use proxy_core::auth; use proxy::auth;
use proxy_core::auth::backend::AuthRateLimiter; use proxy::auth::backend::AuthRateLimiter;
use proxy_core::auth::backend::MaybeOwned; use proxy::auth::backend::MaybeOwned;
use proxy_core::cancellation::CancelMap; use proxy::cancellation::CancelMap;
use proxy_core::cancellation::CancellationHandler; use proxy::cancellation::CancellationHandler;
use proxy_core::config::remote_storage_from_toml; use proxy::config::remote_storage_from_toml;
use proxy_core::config::AuthenticationConfig; use proxy::config::AuthenticationConfig;
use proxy_core::config::CacheOptions; use proxy::config::CacheOptions;
use proxy_core::config::HttpConfig; use proxy::config::HttpConfig;
use proxy_core::config::ProjectInfoCacheOptions; use proxy::config::ProjectInfoCacheOptions;
use proxy_core::console; use proxy::console;
use proxy_core::context::parquet::ParquetUploadArgs; use proxy::context::parquet::ParquetUploadArgs;
use proxy_core::http; use proxy::http;
use proxy_core::http::health_server::AppMetrics; use proxy::http::health_server::AppMetrics;
use proxy_core::metrics::Metrics; use proxy::metrics::Metrics;
use proxy_core::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::EndpointRateLimiter;
use proxy_core::rate_limiter::LeakyBucketConfig; use proxy::rate_limiter::LeakyBucketConfig;
use proxy_core::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateBucketInfo;
use proxy_core::rate_limiter::WakeComputeRateLimiter; use proxy::rate_limiter::WakeComputeRateLimiter;
use proxy_core::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::cancellation_publisher::RedisPublisherClient;
use proxy_core::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy_core::redis::elasticache; use proxy::redis::elasticache;
use proxy_core::redis::notifications; use proxy::redis::notifications;
use proxy_core::serverless::cancel_set::CancelSet; use proxy::scram::threadpool::ThreadPool;
use proxy_core::serverless::GlobalConnPoolOptions; use proxy::serverless::cancel_set::CancelSet;
use proxy_core::usage_metrics; use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
use anyhow::bail; use anyhow::bail;
use proxy_core::config::{self, ProxyConfig}; use proxy::config::{self, ProxyConfig};
use proxy_core::serverless; use proxy::serverless;
use proxy_sasl::scram::threadpool::ThreadPool;
use remote_storage::RemoteStorageConfig; use remote_storage::RemoteStorageConfig;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::pin::pin; use std::pin::pin;
@@ -268,7 +267,7 @@ struct SqlOverHttpArgs {
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy_core::logging::init().await?; let _logging_guard = proxy::logging::init().await?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
@@ -279,7 +278,7 @@ async fn main() -> anyhow::Result<()> {
build_tag: BUILD_TAG, build_tag: BUILD_TAG,
}); });
let jemalloc = match proxy_core::jemalloc::MetricRecorder::new() { let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
Ok(t) => Some(t), Ok(t) => Some(t),
Err(e) => { Err(e) => {
tracing::error!(error = ?e, "could not start jemalloc metrics loop"); tracing::error!(error = ?e, "could not start jemalloc metrics loop");
@@ -291,10 +290,9 @@ async fn main() -> anyhow::Result<()> {
let config = build_config(&args)?; let config = build_config(&args)?;
info!("Authentication backend: {}", config.auth_backend); info!("Authentication backend: {}", config.auth_backend);
info!("Using region: {}", args.aws_region); info!("Using region: {}", config.aws_region);
let region_provider = let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
let provider_conf = let provider_conf =
ProviderConfig::without_region().with_region(region_provider.region().await); ProviderConfig::without_region().with_region(region_provider.region().await);
let aws_credentials_provider = { let aws_credentials_provider = {
@@ -320,7 +318,7 @@ async fn main() -> anyhow::Result<()> {
}; };
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
elasticache::AWSIRSAConfig::new( elasticache::AWSIRSAConfig::new(
args.aws_region.clone(), config.aws_region.clone(),
args.redis_cluster_name, args.redis_cluster_name,
args.redis_user_id, args.redis_user_id,
), ),
@@ -378,14 +376,11 @@ async fn main() -> anyhow::Result<()> {
let cancel_map = CancelMap::default(); let cancel_map = CancelMap::default();
let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone());
RateBucketInfo::validate(redis_rps_limit)?;
let redis_publisher = match &regional_redis_client { let redis_publisher = match &regional_redis_client {
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
redis_publisher.clone(), redis_publisher.clone(),
args.region.clone(), args.region.clone(),
redis_rps_limit, &config.redis_rps_limit,
)?))), )?))),
None => None, None => None,
}; };
@@ -394,7 +389,7 @@ async fn main() -> anyhow::Result<()> {
>::new( >::new(
cancel_map.clone(), cancel_map.clone(),
redis_publisher, redis_publisher,
proxy_core::metrics::CancellationSource::FromClient, proxy::metrics::CancellationSource::FromClient,
)); ));
// bit of a hack - find the min rps and max rps supported and turn it into // bit of a hack - find the min rps and max rps supported and turn it into
@@ -419,7 +414,7 @@ async fn main() -> anyhow::Result<()> {
// client facing tasks. these will exit on error or on cancellation // client facing tasks. these will exit on error or on cancellation
// cancellation returns Ok(()) // cancellation returns Ok(())
let mut client_tasks = JoinSet::new(); let mut client_tasks = JoinSet::new();
client_tasks.spawn(proxy_core::proxy::task_main( client_tasks.spawn(proxy::proxy::task_main(
config, config,
proxy_listener, proxy_listener,
cancellation_token.clone(), cancellation_token.clone(),
@@ -443,20 +438,20 @@ async fn main() -> anyhow::Result<()> {
)); ));
} }
client_tasks.spawn(proxy_core::context::parquet::worker( client_tasks.spawn(proxy::context::parquet::worker(
cancellation_token.clone(), cancellation_token.clone(),
args.parquet_upload, args.parquet_upload,
)); ));
// maintenance tasks. these never return unless there's an error // maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new(); let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy_core::handle_signals(cancellation_token.clone())); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
maintenance_tasks.spawn(http::health_server::task_main( maintenance_tasks.spawn(http::health_server::task_main(
http_listener, http_listener,
AppMetrics { AppMetrics {
jemalloc, jemalloc,
neon_metrics, neon_metrics,
proxy: proxy_core::metrics::Metrics::get(), proxy: proxy::metrics::Metrics::get(),
}, },
)); ));
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
@@ -471,7 +466,7 @@ async fn main() -> anyhow::Result<()> {
} }
if let auth::BackendType::Console(api, _) = &config.auth_backend { if let auth::BackendType::Console(api, _) = &config.auth_backend {
if let proxy_core::console::provider::ConsoleBackend::Console(api) = &**api { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) { match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {} (None, None) => {}
(client1, client2) => { (client1, client2) => {
@@ -516,11 +511,11 @@ async fn main() -> anyhow::Result<()> {
.await .await
{ {
// exit immediately on maintenance task completion // exit immediately on maintenance task completion
Either::Left((Some(res), _)) => break proxy_core::flatten_err(res)?, Either::Left((Some(res), _)) => break proxy::flatten_err(res)?,
// exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
// exit immediately on client task error // exit immediately on client task error
Either::Right((Some(res), _)) => proxy_core::flatten_err(res)?, Either::Right((Some(res), _)) => proxy::flatten_err(res)?,
// exit if all our client tasks have shutdown gracefully // exit if all our client tasks have shutdown gracefully
Either::Right((None, _)) => return Ok(()), Either::Right((None, _)) => return Ok(()),
} }
@@ -607,7 +602,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
timeout, timeout,
epoch, epoch,
&Metrics::get().wake_compute_lock, &Metrics::get().wake_compute_lock,
))); )?));
tokio::spawn(locks.garbage_collect_worker()); tokio::spawn(locks.garbage_collect_worker());
let url = args.auth_endpoint.parse()?; let url = args.auth_endpoint.parse()?;
@@ -658,9 +653,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
timeout, timeout,
epoch, epoch,
&Metrics::get().proxy.connect_compute_lock, &Metrics::get().proxy.connect_compute_lock,
); )?;
let http_config = HttpConfig { let http_config = HttpConfig {
request_timeout: args.sql_over_http.sql_over_http_timeout,
pool_options: GlobalConnPoolOptions { pool_options: GlobalConnPoolOptions {
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
@@ -680,6 +676,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
}; };
let mut redis_rps_limit = args.redis_rps_limit.clone();
RateBucketInfo::validate(&mut redis_rps_limit)?;
let config = Box::leak(Box::new(ProxyConfig { let config = Box::leak(Box::new(ProxyConfig {
tls_config, tls_config,
auth_backend, auth_backend,
@@ -688,8 +687,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
http_config, http_config,
authentication_config, authentication_config,
require_client_ip: args.require_client_ip, require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
redis_rps_limit,
handshake_timeout: args.handshake_timeout, handshake_timeout: args.handshake_timeout,
region: args.region.clone(), region: args.region.clone(),
aws_region: args.aws_region.clone(),
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
connect_compute_locks, connect_compute_locks,
connect_to_compute_retry_config: config::RetryConfig::parse( connect_to_compute_retry_config: config::RetryConfig::parse(
@@ -707,7 +709,7 @@ mod tests {
use std::time::Duration; use std::time::Duration;
use clap::Parser; use clap::Parser;
use proxy_core::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateBucketInfo;
#[test] #[test]
fn parse_endpoint_rps_limit() { fn parse_endpoint_rps_limit() {

View File

@@ -68,7 +68,7 @@ impl EndpointsCache {
ready: AtomicBool::new(false), ready: AtomicBool::new(false),
} }
} }
pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) { if !self.ready.load(Ordering::Acquire) {
return true; return true;
} }

View File

@@ -371,8 +371,7 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::ProjectId; use crate::{scram::ServerSecret, ProjectId};
use proxy_sasl::scram::ServerSecret;
#[tokio::test] #[tokio::test]
async fn test_project_info_cache_settings() { async fn test_project_info_cache_settings() {

View File

@@ -103,12 +103,8 @@ impl ConnCfg {
/// Reuse password or auth keys from the other config. /// Reuse password or auth keys from the other config.
pub fn reuse_password(&mut self, other: Self) { pub fn reuse_password(&mut self, other: Self) {
if let Some(password) = other.get_password() { if let Some(password) = other.get_auth() {
self.password(password); self.auth(password);
}
if let Some(keys) = other.get_auth_keys() {
self.auth_keys(keys);
} }
} }
@@ -124,48 +120,64 @@ impl ConnCfg {
/// Apply startup message params to the connection config. /// Apply startup message params to the connection config.
pub fn set_startup_params(&mut self, params: &StartupMessageParams) { pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
let mut client_encoding = false;
for (k, v) in params.iter() {
match k {
"user" => {
// Only set `user` if it's not present in the config. // Only set `user` if it's not present in the config.
// Link auth flow takes username from the console's response. // Link auth flow takes username from the console's response.
if let (None, Some(user)) = (self.get_user(), params.get("user")) { if self.get_user().is_none() {
self.user(user); self.user(v);
} }
// Only set `dbname` if it's not present in the config.
// Link auth flow takes dbname from the console's response.
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
self.dbname(dbname);
}
// Don't add `options` if they were only used for specifying a project.
// Connection pools don't support `options`, because they affect backend startup.
if let Some(options) = filtered_options(params) {
self.options(&options);
}
if let Some(app_name) = params.get("application_name") {
self.application_name(app_name);
}
// TODO: This is especially ugly...
if let Some(replication) = params.get("replication") {
use tokio_postgres::config::ReplicationMode;
match replication {
"true" | "on" | "yes" | "1" => {
self.replication_mode(ReplicationMode::Physical);
} }
"database" => { "database" => {
self.replication_mode(ReplicationMode::Logical); // Only set `dbname` if it's not present in the config.
// Link auth flow takes dbname from the console's response.
if self.get_dbname().is_none() {
self.dbname(v);
} }
_other => {} }
"options" => {
// Don't add `options` if they were only used for specifying a project.
// Connection pools don't support `options`, because they affect backend startup.
if let Some(options) = filtered_options(v) {
self.options(&options);
} }
} }
// TODO: extend the list of the forwarded startup parameters. // the special ones in tokio-postgres that we don't want being set by the user
// Currently, tokio-postgres doesn't allow us to pass "dbname" => {}
// arbitrary parameters, but the ones above are a good start. "password" => {}
// "sslmode" => {}
// This and the reverse params problem can be better addressed "host" => {}
// in a bespoke connection machinery (a new library for that sake). "port" => {}
"connect_timeout" => {}
"keepalives" => {}
"keepalives_idle" => {}
"keepalives_interval" => {}
"keepalives_retries" => {}
"target_session_attrs" => {}
"channel_binding" => {}
"max_backend_message_size" => {}
"client_encoding" => {
client_encoding = true;
// only error should be from bad null bytes,
// but we've already checked for those.
_ = self.param("client_encoding", v);
}
_ => {
// only error should be from bad null bytes,
// but we've already checked for those.
_ = self.param(k, v);
}
}
}
if !client_encoding {
// for compatibility since we removed it from tokio-postgres
self.param("client_encoding", "UTF8").unwrap();
}
} }
} }
@@ -276,12 +288,12 @@ impl ConnCfg {
/// Connect to a corresponding compute node. /// Connect to a corresponding compute node.
pub async fn connect( pub async fn connect(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
allow_self_signed_compute: bool, allow_self_signed_compute: bool,
aux: MetricsAuxInfo, aux: MetricsAuxInfo,
timeout: Duration, timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> { ) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?; let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause); drop(pause);
@@ -304,14 +316,14 @@ impl ConnCfg {
)?; )?;
// connect_raw() will not use TLS if sslmode is "disable" // connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?; let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause); drop(pause);
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner(); let stream = connection.stream.into_inner();
info!( info!(
cold_start_info = ctx.cold_start_info().as_str(), cold_start_info = ctx.cold_start_info.as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}", "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode() self.0.get_ssl_mode()
); );
@@ -330,7 +342,7 @@ impl ConnCfg {
params, params,
cancel_closure, cancel_closure,
aux, aux,
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
}; };
Ok(connection) Ok(connection)
@@ -338,10 +350,9 @@ impl ConnCfg {
} }
/// Retrieve `options` from a startup message, dropping all proxy-secific flags. /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
fn filtered_options(params: &StartupMessageParams) -> Option<String> { fn filtered_options(options: &str) -> Option<String> {
#[allow(unstable_name_collisions)] #[allow(unstable_name_collisions)]
let options: String = params let options: String = StartupMessageParams::parse_options_raw(options)
.options_raw()?
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
.intersperse(" ") // TODO: use impl from std once it's stabilized .intersperse(" ") // TODO: use impl from std once it's stabilized
.collect(); .collect();
@@ -413,27 +424,23 @@ mod tests {
#[test] #[test]
fn test_filtered_options() { fn test_filtered_options() {
// Empty options is unlikely to be useful anyway. // Empty options is unlikely to be useful anyway.
let params = StartupMessageParams::new([("options", "")]); assert_eq!(filtered_options(""), None);
assert_eq!(filtered_options(&params), None);
// It's likely that clients will only use options to specify endpoint/project. // It's likely that clients will only use options to specify endpoint/project.
let params = StartupMessageParams::new([("options", "project=foo")]); let params = "project=foo";
assert_eq!(filtered_options(&params), None); assert_eq!(filtered_options(params), None);
// Same, because unescaped whitespaces are no-op. // Same, because unescaped whitespaces are no-op.
let params = StartupMessageParams::new([("options", " project=foo ")]); let params = " project=foo ";
assert_eq!(filtered_options(&params).as_deref(), None); assert_eq!(filtered_options(params), None);
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); let params = r"\ project=foo \ ";
assert_eq!(filtered_options(&params).as_deref(), Some(r"\ \ ")); assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ "));
let params = StartupMessageParams::new([("options", "project = foo")]); let params = "project = foo";
assert_eq!(filtered_options(&params).as_deref(), Some("project = foo")); assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
let params = StartupMessageParams::new([( let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2";
"options", assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
)]);
assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
} }
} }

View File

@@ -1,26 +1,27 @@
use crate::{ use crate::{
auth::{self, backend::AuthRateLimiter}, auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks, console::locks::ApiLocks,
intern::EndpointIdInt,
rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
scram::threadpool::ThreadPool,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host, Host,
}; };
use anyhow::{bail, ensure, Context, Ok}; use anyhow::{bail, ensure, Context, Ok};
use itertools::Itertools; use itertools::Itertools;
use proxy_sasl::scram::{threadpool::ThreadPool, TlsServerEndPoint};
use remote_storage::RemoteStorageConfig; use remote_storage::RemoteStorageConfig;
use rustls::{ use rustls::{
crypto::ring::sign, crypto::ring::sign,
pki_types::{CertificateDer, PrivateKeyDer}, pki_types::{CertificateDer, PrivateKeyDer},
}; };
use sha2::{Digest, Sha256};
use std::{ use std::{
collections::{HashMap, HashSet}, collections::{HashMap, HashSet},
str::FromStr, str::FromStr,
sync::Arc, sync::Arc,
time::Duration, time::Duration,
}; };
use tracing::{error, info};
use x509_parser::oid_registry;
pub struct ProxyConfig { pub struct ProxyConfig {
pub tls_config: Option<TlsConfig>, pub tls_config: Option<TlsConfig>,
@@ -30,8 +31,11 @@ pub struct ProxyConfig {
pub http_config: HttpConfig, pub http_config: HttpConfig,
pub authentication_config: AuthenticationConfig, pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool, pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub redis_rps_limit: Vec<RateBucketInfo>,
pub region: String, pub region: String,
pub handshake_timeout: Duration, pub handshake_timeout: Duration,
pub aws_region: String,
pub wake_compute_retry_config: RetryConfig, pub wake_compute_retry_config: RetryConfig,
pub connect_compute_locks: ApiLocks<Host>, pub connect_compute_locks: ApiLocks<Host>,
pub connect_to_compute_retry_config: RetryConfig, pub connect_to_compute_retry_config: RetryConfig,
@@ -51,13 +55,14 @@ pub struct TlsConfig {
} }
pub struct HttpConfig { pub struct HttpConfig {
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions, pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet, pub cancel_set: CancelSet,
pub client_conn_threshold: u64, pub client_conn_threshold: u64,
} }
pub struct AuthenticationConfig { pub struct AuthenticationConfig {
pub thread_pool: Arc<ThreadPool<EndpointIdInt>>, pub thread_pool: Arc<ThreadPool>,
pub scram_protocol_timeout: tokio::time::Duration, pub scram_protocol_timeout: tokio::time::Duration,
pub rate_limiter_enabled: bool, pub rate_limiter_enabled: bool,
pub rate_limiter: AuthRateLimiter, pub rate_limiter: AuthRateLimiter,
@@ -125,6 +130,66 @@ pub fn configure_tls(
}) })
} }
/// Channel binding parameter
///
/// <https://www.rfc-editor.org/rfc/rfc5929#section-4>
/// Description: The hash of the TLS server's certificate as it
/// appears, octet for octet, in the server's Certificate message. Note
/// that the Certificate message contains a certificate_list, in which
/// the first element is the server's certificate.
///
/// The hash function is to be selected as follows:
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function, and that hash function is either MD5 or SHA-1, then use SHA-256;
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function and that hash function neither MD5 nor SHA-1, then use
/// the hash function associated with the certificate's
/// signatureAlgorithm;
///
/// * if the certificate's signatureAlgorithm uses no hash functions or
/// uses multiple hash functions, then this channel binding type's
/// channel bindings are undefined at this time (updates to is channel
/// binding type may occur to address this issue if it ever arises).
#[derive(Debug, Clone, Copy)]
pub enum TlsServerEndPoint {
Sha256([u8; 32]),
Undefined,
}
impl TlsServerEndPoint {
pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
let sha256_oids = [
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
oid_registry::OID_PKCS1_SHA256WITHRSA,
];
let pem = x509_parser::parse_x509_certificate(cert)
.context("Failed to parse PEM object from cerficiate")?
.1;
info!(subject = %pem.subject, "parsing TLS certificate");
let reg = oid_registry::OidRegistry::default().with_all_crypto();
let oid = pem.signature_algorithm.oid();
let alg = reg.get(oid);
if sha256_oids.contains(oid) {
let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into();
info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
Ok(Self::Sha256(tls_server_end_point))
} else {
error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding");
Ok(Self::Undefined)
}
}
pub fn supported(&self) -> bool {
!matches!(self, TlsServerEndPoint::Undefined)
}
}
#[derive(Default, Debug)] #[derive(Default, Debug)]
pub struct CertResolver { pub struct CertResolver {
certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>, certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,

View File

@@ -16,10 +16,9 @@ use crate::{
intern::ProjectIdInt, intern::ProjectIdInt,
metrics::ApiLockMetrics, metrics::ApiLockMetrics,
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
EndpointCacheKey, scram, EndpointCacheKey,
}; };
use dashmap::DashMap; use dashmap::DashMap;
use proxy_sasl::scram;
use std::{hash::Hash, sync::Arc, time::Duration}; use std::{hash::Hash, sync::Arc, time::Duration};
use tokio::time::Instant; use tokio::time::Instant;
use tracing::info; use tracing::info;
@@ -293,7 +292,7 @@ pub struct NodeInfo {
impl NodeInfo { impl NodeInfo {
pub async fn connect( pub async fn connect(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
timeout: Duration, timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> { ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config self.config
@@ -331,20 +330,20 @@ pub(crate) trait Api {
/// We still have to mock the scram to avoid leaking information that user doesn't exist. /// We still have to mock the scram to avoid leaking information that user doesn't exist.
async fn get_role_secret( async fn get_role_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>; ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret( async fn get_allowed_ips_and_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>; ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info. /// Wake up the compute node and return the corresponding connection info.
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>; ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
} }
@@ -364,7 +363,7 @@ pub enum ConsoleBackend {
impl Api for ConsoleBackend { impl Api for ConsoleBackend {
async fn get_role_secret( async fn get_role_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> { ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
use ConsoleBackend::*; use ConsoleBackend::*;
@@ -379,7 +378,7 @@ impl Api for ConsoleBackend {
async fn get_allowed_ips_and_secret( async fn get_allowed_ips_and_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> { ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
use ConsoleBackend::*; use ConsoleBackend::*;
@@ -394,7 +393,7 @@ impl Api for ConsoleBackend {
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> { ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
use ConsoleBackend::*; use ConsoleBackend::*;
@@ -470,15 +469,15 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
timeout: Duration, timeout: Duration,
epoch: std::time::Duration, epoch: std::time::Duration,
metrics: &'static ApiLockMetrics, metrics: &'static ApiLockMetrics,
) -> Self { ) -> prometheus::Result<Self> {
Self { Ok(Self {
name, name,
node_locks: DashMap::with_shard_amount(shards), node_locks: DashMap::with_shard_amount(shards),
config, config,
timeout, timeout,
epoch, epoch,
metrics, metrics,
} })
} }
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> { pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {

View File

@@ -5,7 +5,7 @@ use super::{
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
}; };
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, url::ApiUrl}; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::IpPattern, cache::Cached}; use crate::{auth::IpPattern, cache::Cached};
use crate::{ use crate::{
console::{ console::{
@@ -15,7 +15,6 @@ use crate::{
BranchId, EndpointId, ProjectId, BranchId, EndpointId, ProjectId,
}; };
use futures::TryFutureExt; use futures::TryFutureExt;
use proxy_sasl::scram;
use std::{str::FromStr, sync::Arc}; use std::{str::FromStr, sync::Arc};
use thiserror::Error; use thiserror::Error;
use tokio_postgres::{config::SslMode, Client}; use tokio_postgres::{config::SslMode, Client};
@@ -159,7 +158,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn get_role_secret( async fn get_role_secret(
&self, &self,
_ctx: &RequestMonitoring, _ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> { ) -> Result<CachedRoleSecret, GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached( Ok(CachedRoleSecret::new_uncached(
@@ -169,7 +168,7 @@ impl super::Api for Api {
async fn get_allowed_ips_and_secret( async fn get_allowed_ips_and_secret(
&self, &self,
_ctx: &RequestMonitoring, _ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> { ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
Ok(( Ok((
@@ -183,7 +182,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn wake_compute( async fn wake_compute(
&self, &self,
_ctx: &RequestMonitoring, _ctx: &mut RequestMonitoring,
_user_info: &ComputeUserInfo, _user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> { ) -> Result<CachedNodeInfo, WakeComputeError> {
self.do_wake_compute().map_ok(Cached::new_uncached).await self.do_wake_compute().map_ok(Cached::new_uncached).await

View File

@@ -13,11 +13,10 @@ use crate::{
http, http,
metrics::{CacheOutcome, Metrics}, metrics::{CacheOutcome, Metrics},
rate_limiter::WakeComputeRateLimiter, rate_limiter::WakeComputeRateLimiter,
EndpointCacheKey, scram, EndpointCacheKey,
}; };
use crate::{cache::Cached, context::RequestMonitoring}; use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt; use futures::TryFutureExt;
use proxy_sasl::scram;
use std::{sync::Arc, time::Duration}; use std::{sync::Arc, time::Duration};
use tokio::time::Instant; use tokio::time::Instant;
use tokio_postgres::config::SslMode; use tokio_postgres::config::SslMode;
@@ -58,7 +57,7 @@ impl Api {
async fn do_get_auth_info( async fn do_get_auth_info(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> { ) -> Result<AuthInfo, GetAuthInfoError> {
if !self if !self
@@ -70,7 +69,7 @@ impl Api {
info!("endpoint is not valid, skipping the request"); info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default()); return Ok(AuthInfo::default());
} }
let request_id = ctx.session_id().to_string(); let request_id = ctx.session_id.to_string();
let application_name = ctx.console_application_name(); let application_name = ctx.console_application_name();
async { async {
let request = self let request = self
@@ -78,7 +77,7 @@ impl Api {
.get("proxy_get_role_secret") .get("proxy_get_role_secret")
.header("X-Request-ID", &request_id) .header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt)) .header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())]) .query(&[("session_id", ctx.session_id)])
.query(&[ .query(&[
("application_name", application_name.as_str()), ("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()), ("project", user_info.endpoint.as_str()),
@@ -88,7 +87,7 @@ impl Api {
info!(url = request.url().as_str(), "sending http request"); info!(url = request.url().as_str(), "sending http request");
let start = Instant::now(); let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?; let response = self.endpoint.execute(request).await?;
drop(pause); drop(pause);
info!(duration = ?start.elapsed(), "received http response"); info!(duration = ?start.elapsed(), "received http response");
@@ -131,10 +130,10 @@ impl Api {
async fn do_wake_compute( async fn do_wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> { ) -> Result<NodeInfo, WakeComputeError> {
let request_id = ctx.session_id().to_string(); let request_id = ctx.session_id.to_string();
let application_name = ctx.console_application_name(); let application_name = ctx.console_application_name();
async { async {
let mut request_builder = self let mut request_builder = self
@@ -142,7 +141,7 @@ impl Api {
.get("proxy_wake_compute") .get("proxy_wake_compute")
.header("X-Request-ID", &request_id) .header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt)) .header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())]) .query(&[("session_id", ctx.session_id)])
.query(&[ .query(&[
("application_name", application_name.as_str()), ("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()), ("project", user_info.endpoint.as_str()),
@@ -157,7 +156,7 @@ impl Api {
info!(url = request.url().as_str(), "sending http request"); info!(url = request.url().as_str(), "sending http request");
let start = Instant::now(); let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?; let response = self.endpoint.execute(request).await?;
drop(pause); drop(pause);
info!(duration = ?start.elapsed(), "received http response"); info!(duration = ?start.elapsed(), "received http response");
@@ -193,7 +192,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn get_role_secret( async fn get_role_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> { ) -> Result<CachedRoleSecret, GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize(); let normalized_ep = &user_info.endpoint.normalize();
@@ -227,7 +226,7 @@ impl super::Api for Api {
async fn get_allowed_ips_and_secret( async fn get_allowed_ips_and_secret(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> { ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize(); let normalized_ep = &user_info.endpoint.normalize();
@@ -269,7 +268,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> { ) -> Result<CachedNodeInfo, WakeComputeError> {
let key = user_info.endpoint_cache_key(); let key = user_info.endpoint_cache_key();

View File

@@ -7,14 +7,13 @@ use smol_str::SmolStr;
use std::net::IpAddr; use std::net::IpAddr;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use tracing::{field::display, info, info_span, Span}; use tracing::{field::display, info, info_span, Span};
use try_lock::TryLock;
use uuid::Uuid; use uuid::Uuid;
use crate::{ use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo}, console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind, error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt}, intern::{BranchIdInt, ProjectIdInt},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
DbName, EndpointId, RoleName, DbName, EndpointId, RoleName,
}; };
@@ -29,15 +28,7 @@ pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>>
/// ///
/// This data should **not** be used for connection logic, only for observability and limiting purposes. /// This data should **not** be used for connection logic, only for observability and limiting purposes.
/// All connection logic should instead use strongly typed state machines, not a bunch of Options. /// All connection logic should instead use strongly typed state machines, not a bunch of Options.
pub struct RequestMonitoring( pub struct RequestMonitoring {
/// To allow easier use of the ctx object, we have interior mutability.
/// I would typically use a RefCell but that would break the `Send` requirements
/// so we need something with thread-safety. `TryLock` is a cheap alternative
/// that offers similar semantics to a `RefCell` but with synchronisation.
TryLock<RequestMonitoringInner>,
);
struct RequestMonitoringInner {
pub peer_addr: IpAddr, pub peer_addr: IpAddr,
pub session_id: Uuid, pub session_id: Uuid,
pub protocol: Protocol, pub protocol: Protocol,
@@ -94,7 +85,7 @@ impl RequestMonitoring {
role = tracing::field::Empty, role = tracing::field::Empty,
); );
let inner = RequestMonitoringInner { Self {
peer_addr, peer_addr,
session_id, session_id,
protocol, protocol,
@@ -119,9 +110,7 @@ impl RequestMonitoring {
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol), latency_timer: LatencyTimer::new(protocol),
disconnect_timestamp: None, disconnect_timestamp: None,
}; }
Self(TryLock::new(inner))
} }
#[cfg(test)] #[cfg(test)]
@@ -130,177 +119,48 @@ impl RequestMonitoring {
} }
pub fn console_application_name(&self) -> String { pub fn console_application_name(&self) -> String {
let this = self.0.try_lock().expect("should not deadlock");
format!( format!(
"{}/{}", "{}/{}",
this.application.as_deref().unwrap_or_default(), self.application.as_deref().unwrap_or_default(),
this.protocol self.protocol
) )
} }
pub fn set_rejected(&self, rejected: bool) { pub fn set_rejected(&mut self, rejected: bool) {
let mut this = self.0.try_lock().expect("should not deadlock"); self.rejected = Some(rejected);
this.rejected = Some(rejected);
} }
pub fn set_cold_start_info(&self, info: ColdStartInfo) { pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.0
.try_lock()
.expect("should not deadlock")
.set_cold_start_info(info);
}
pub fn set_db_options(&self, options: StartupMessageParams) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
this.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
this.set_dbname(dbname.into());
}
this.pg_options = Some(options);
}
pub fn set_project(&self, x: MetricsAuxInfo) {
let mut this = self.0.try_lock().expect("should not deadlock");
if this.endpoint_id.is_none() {
this.set_endpoint_id(x.endpoint_id.as_str().into())
}
this.branch = Some(x.branch_id);
this.project = Some(x.project_id);
this.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&self, project_id: ProjectIdInt) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.project = Some(project_id);
}
pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
self.0
.try_lock()
.expect("should not deadlock")
.set_endpoint_id(endpoint_id);
}
pub fn set_dbname(&self, dbname: DbName) {
self.0
.try_lock()
.expect("should not deadlock")
.set_dbname(dbname);
}
pub fn set_user(&self, user: RoleName) {
self.0
.try_lock()
.expect("should not deadlock")
.set_user(user);
}
pub fn set_auth_method(&self, auth_method: AuthMethod) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.auth_method = Some(auth_method);
}
pub fn has_private_peer_addr(&self) -> bool {
self.0
.try_lock()
.expect("should not deadlock")
.has_private_peer_addr()
}
pub fn set_error_kind(&self, kind: ErrorKind) {
let mut this = self.0.try_lock().expect("should not deadlock");
// Do not record errors from the private address to metrics.
if !this.has_private_peer_addr() {
Metrics::get().proxy.errors_total.inc(kind);
}
if let Some(ep) = &this.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
this.error_kind = Some(kind);
}
/// Marks this request as having completed successfully.
pub fn set_success(&self) {
    self.0.try_lock().expect("should not deadlock").success = true;
}
/// Emits the connection outcome via the inner state's logging routine.
pub fn log_connect(&self) {
    let mut inner = self.0.try_lock().expect("should not deadlock");
    inner.log_connect();
}
/// Returns the protocol this request arrived over.
pub fn protocol(&self) -> Protocol {
    let inner = self.0.try_lock().expect("should not deadlock");
    inner.protocol
}
/// Returns a clone of the tracing span attached to this request.
pub fn span(&self) -> Span {
    let inner = self.0.try_lock().expect("should not deadlock");
    inner.span.clone()
}
/// Returns the session id of this request.
pub fn session_id(&self) -> Uuid {
    let inner = self.0.try_lock().expect("should not deadlock");
    inner.session_id
}
/// Returns the peer's IP address.
pub fn peer_addr(&self) -> IpAddr {
    let inner = self.0.try_lock().expect("should not deadlock");
    inner.peer_addr
}
/// Returns the recorded cold-start information for this request.
pub fn cold_start_info(&self) -> ColdStartInfo {
    let inner = self.0.try_lock().expect("should not deadlock");
    inner.cold_start_info
}
/// Starts a latency-timer pause; the returned guard resumes the timer
/// (crediting the paused interval) when dropped.
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
    let start = tokio::time::Instant::now();
    LatencyTimerPause { ctx: self, start, waiting_for }
}
/// Reports success to the latency timer.
pub fn success(&self) {
    let mut inner = self.0.try_lock().expect("should not deadlock");
    inner.latency_timer.success()
}
}
/// Guard that represents a paused interval of the request latency timer.
///
/// Created by `RequestMonitoring::latency_timer_pause`; its `Drop` impl
/// reports the interval since `start` back to the latency timer.
pub struct LatencyTimerPause<'a> {
    // Request context whose latency timer is paused.
    ctx: &'a RequestMonitoring,
    // Moment the pause began.
    start: tokio::time::Instant,
    // What the request is waiting for while paused.
    waiting_for: Waiting,
}
impl Drop for LatencyTimerPause<'_> {
    /// Resumes the latency timer, crediting the interval since `start`.
    fn drop(&mut self) {
        let mut inner = self.ctx.0.try_lock().expect("should not deadlock");
        inner.latency_timer.unpause(self.start, self.waiting_for);
    }
}
impl RequestMonitoringInner {
fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = info; self.cold_start_info = info;
self.latency_timer.cold_start_info(info); self.latency_timer.cold_start_info(info);
} }
fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { pub fn set_db_options(&mut self, options: StartupMessageParams) {
self.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
self.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
self.set_dbname(dbname.into());
}
self.pg_options = Some(options);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
}
self.branch = Some(x.branch_id);
self.project = Some(x.project_id);
self.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
self.project = Some(project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
if self.endpoint_id.is_none() { if self.endpoint_id.is_none() {
self.span.record("ep", display(&endpoint_id)); self.span.record("ep", display(&endpoint_id));
let metric = &Metrics::get().proxy.connecting_endpoints; let metric = &Metrics::get().proxy.connecting_endpoints;
@@ -316,23 +176,44 @@ impl RequestMonitoringInner {
} }
} }
fn set_dbname(&mut self, dbname: DbName) { pub fn set_dbname(&mut self, dbname: DbName) {
self.dbname = Some(dbname); self.dbname = Some(dbname);
} }
fn set_user(&mut self, user: RoleName) { pub fn set_user(&mut self, user: RoleName) {
self.span.record("role", display(&user)); self.span.record("role", display(&user));
self.user = Some(user); self.user = Some(user);
} }
fn has_private_peer_addr(&self) -> bool { pub fn set_auth_method(&mut self, auth_method: AuthMethod) {
self.auth_method = Some(auth_method);
}
pub fn has_private_peer_addr(&self) -> bool {
match self.peer_addr { match self.peer_addr {
IpAddr::V4(ip) => ip.is_private(), IpAddr::V4(ip) => ip.is_private(),
_ => false, _ => false,
} }
} }
fn log_connect(&mut self) { pub fn set_error_kind(&mut self, kind: ErrorKind) {
// Do not record errors from the private address to metrics.
if !self.has_private_peer_addr() {
Metrics::get().proxy.errors_total.inc(kind);
}
if let Some(ep) = &self.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
self.error_kind = Some(kind);
}
pub fn set_success(&mut self) {
self.success = true;
}
pub fn log_connect(&mut self) {
let outcome = if self.success { let outcome = if self.success {
ConnectOutcome::Success ConnectOutcome::Success
} else { } else {
@@ -375,7 +256,7 @@ impl RequestMonitoringInner {
} }
} }
impl Drop for RequestMonitoringInner { impl Drop for RequestMonitoring {
fn drop(&mut self) { fn drop(&mut self) {
if self.sender.is_some() { if self.sender.is_some() {
self.log_connect(); self.log_connect();

View File

@@ -23,7 +23,7 @@ use utils::backoff;
use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT};
use super::{RequestMonitoringInner, LOG_CHAN}; use super::{RequestMonitoring, LOG_CHAN};
#[derive(clap::Args, Clone, Debug)] #[derive(clap::Args, Clone, Debug)]
pub struct ParquetUploadArgs { pub struct ParquetUploadArgs {
@@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> {
} }
} }
impl From<&RequestMonitoringInner> for RequestData { impl From<&RequestMonitoring> for RequestData {
fn from(value: &RequestMonitoringInner) -> Self { fn from(value: &RequestMonitoring) -> Self {
Self { Self {
session_id: value.session_id, session_id: value.session_id,
peer_addr: value.peer_addr.to_string(), peer_addr: value.peer_addr.to_string(),

View File

@@ -6,12 +6,6 @@ pub mod health_server;
use std::time::Duration; use std::time::Duration;
use anyhow::bail;
use bytes::Bytes;
use http_body_util::BodyExt;
use hyper1::body::Body;
use serde::de::DeserializeOwned;
pub use reqwest::{Request, Response, StatusCode}; pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
@@ -102,33 +96,6 @@ impl Endpoint {
} }
} }
pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
limit: usize,
) -> anyhow::Result<D> {
// We could use `b.limited().collect().await.to_bytes()` here
// but this ends up being slightly more efficient as far as I can tell.
// check the lower bound of the size hint.
// in reqwest, this value is influenced by the Content-Length header.
let lower_bound = match usize::try_from(b.size_hint().lower()) {
Ok(bound) if bound <= limit => bound,
_ => bail!("content length exceeds limit"),
};
let mut bytes = Vec::with_capacity(lower_bound);
while let Some(frame) = b.frame().await.transpose()? {
if let Ok(data) = frame.into_data() {
if bytes.len() + data.len() > limit {
bail!("content length exceeds limit")
}
bytes.extend_from_slice(&data);
}
}
Ok(serde_json::from_slice::<D>(&bytes)?)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;

Some files were not shown because too many files have changed in this diff Show More