Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-18 02:42:56 +00:00)

Compare commits: release-62 ... split-prox (106 commits)
| SHA1 |
|---|
| 18303e4d68 |
| 3df6d368e3 |
| b62e7c0138 |
| a2968c6cf8 |
| bae1288671 |
| 1254d8f56e |
| 073508493c |
| 7cb2349296 |
| 87151f9efd |
| 96fe084c57 |
| 20fdf3e19f |
| c6b36d8171 |
| 0e8a848937 |
| db4085fe22 |
| 0d895ba002 |
| 103f34e954 |
| 262378e561 |
| 9f38ab39c6 |
| fa92328423 |
| f7a3380aec |
| 507f1a5bdd |
| 401dcd3551 |
| 4a53cd0fc3 |
| f5cef7bf7f |
| e6770d79fd |
| 201f56baf7 |
| a155914c1c |
| 7e08fbd1b9 |
| 2ca5ff26d7 |
| 8acce00953 |
| d28a6f2576 |
| 4431688dc6 |
| 953b7d4f7e |
| 8561b2c628 |
| 21638ee96c |
| cbe8c77997 |
| cf3eac785b |
| 542385e364 |
| 05dd1ae9e0 |
| 8468d51a14 |
| a81fab4826 |
| b3eea45277 |
| fc78774f39 |
| ad0988f278 |
| 4d7c0dac93 |
| 00c981576a |
| c3f2240fbd |
| ed5724d79d |
| ca5390a89d |
| 3727c6fbbe |
| 42229aacf6 |
| b7beaa0fd7 |
| 16c91ff5d3 |
| 078f941dc8 |
| 68bcbf8227 |
| a31c95cb40 |
| dc7eb5ae5a |
| 44fedfd6c3 |
| 138f008bab |
| 6a6f30e378 |
| 8f3bc5ae35 |
| e6e578821b |
| c32807ac19 |
| 50daff9655 |
| bd845c7587 |
| f63c8e5a8c |
| 200fa56b04 |
| 0f3dac265b |
| 1dc496a2c9 |
| 6814bdd30b |
| 0a667bc8ef |
| f3acfb2d80 |
| 8c828c586e |
| 2334fed762 |
| c53799044d |
| e7477855b7 |
| f4a668a27d |
| 970f2923b2 |
| 1678dea20f |
| 163f2eaf79 |
| 980d506bda |
| d6c79b77df |
| 3350daeb9a |
| 939d50a41c |
| 2f9ada13c4 |
| ff51b565d3 |
| 5e0409de95 |
| 4e3b70e308 |
| 61a65f61f3 |
| d21246c8bd |
| 4825b0fec3 |
| a4df3c8488 |
| d95b46f3f3 |
| 85bef9f05d |
| e374d6778e |
| 9ceaf9a986 |
| f72fe68626 |
| 9fabdda2dc |
| 1c7b06c988 |
| 52b02d95c8 |
| 4be58522fb |
| d09dad0ea2 |
| 5775662276 |
| bdfc9ca7e9 |
| 1d8cf5b3a9 |
| 859f019185 |
.github/actionlint.yml (vendored, 3 lines changed)

@@ -8,6 +8,9 @@ self-hosted-runner:
- small-arm64
- us-east-2
config-variables:
- BENCHMARK_PROJECT_ID_PUB
- BENCHMARK_PROJECT_ID_SUB
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN
.github/workflows/benchmarking.yml (vendored, 97 lines changed)

@@ -56,6 +56,10 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy:
fail-fast: false
matrix:
@@ -63,9 +67,13 @@ jobs:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -76,14 +84,21 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}

runs-on: [ self-hosted, us-east-2, x64 ]
runs-on: ${{ matrix.RUNNER }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: ${{ matrix.IMAGE }}
options: --init

steps:
- uses: actions/checkout@v4

- name: Configure AWS credentials # necessary on Azure runners
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours

- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -147,7 +162,7 @@ jobs:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -161,6 +176,7 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -168,7 +184,7 @@ jobs:
path: /tmp/neon/
prefix: latest

- name: Run benchmark
- name: Run Logical Replication benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
@@ -176,12 +192,15 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}

- name: Run benchmark
- name: Run Physical Replication benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
@@ -234,6 +253,9 @@ jobs:
id: pgbench-compare-matrix
run: |
region_id_default=${{ env.DEFAULT_REGION_ID }}
runner_default='["self-hosted", "us-east-2", "x64"]'
runner_azure='["self-hosted", "eastus2", "x64"]'
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
matrix='{
"pg_version" : [
16
@@ -247,16 +269,19 @@ jobs:
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"runner": ['"$runner_default"'],
"image": [ "'"$image_default"'" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
}'

if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
fi

echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -299,6 +324,10 @@ jobs:
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ]
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners

strategy:
fail-fast: false
@@ -314,9 +343,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}

runs-on: [ self-hosted, us-east-2, x64 ]
runs-on: ${{ matrix.runner }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: ${{ matrix.image }}
options: --init

# Increase timeout to 8h, default timeout is 6h
@@ -325,6 +354,13 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Configure AWS credentials # necessary on Azure runners
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours

- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -432,12 +468,20 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

pgbench-pgvector:
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy:
fail-fast: false
matrix:
include:
- PLATFORM: "neonvm-captest-pgvector"
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- PLATFORM: "azure-captest-pgvector"
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned

env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -450,9 +494,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}

runs-on: [ self-hosted, us-east-2, x64 ]
runs-on: ${{ matrix.RUNNER }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: ${{ matrix.IMAGE }}
options: --init

steps:
@@ -463,12 +507,12 @@ jobs:
- name: Install postgresql-16 where pytest expects it
run: |
cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
@@ -493,6 +537,13 @@ jobs:
esac

echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours

- name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set
@@ -521,7 +572,7 @@ jobs:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
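The hunks above replace the hard-coded `runs-on` labels and ECR container image with values taken from the job matrix, so the same benchmarking jobs can run on both the AWS (`us-east-2`) and Azure (`eastus2`) self-hosted pools. A minimal sketch of the pattern, with illustrative entries rather than the full workflow:

```yaml
jobs:
  bench:
    strategy:
      matrix:
        include:
          # AWS pool: build-tools image comes from the private ECR registry
          - RUNNER: [ self-hosted, us-east-2, x64 ]
            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          # Azure pool: the same image, pulled from Docker Hub instead
          - RUNNER: [ self-hosted, eastus2, x64 ]
            IMAGE: neondatabase/build-tools:pinned
    runs-on: ${{ matrix.RUNNER }}   # resolved per matrix entry
    container:
      image: ${{ matrix.IMAGE }}
      options: --init
    steps:
      - uses: actions/checkout@v4
```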
.github/workflows/neon_extra_builds.yml (vendored, 2 lines changed)

@@ -149,8 +149,6 @@ jobs:

env:
BUILD_TYPE: release
# remove the cachepot wrapper and build without crate caches
RUSTC_WRAPPER: ""
# build with incremental compilation produce partial results
# so do not attempt to cache this build, also disable the incremental compilation
CARGO_INCREMENTAL: 0
.github/workflows/pg-clients.yml (vendored, 26 lines changed)

@@ -66,7 +66,31 @@ jobs:
ports:
- 9000:9000
- 8123:8123

zookeeper:
image: quay.io/debezium/zookeeper:2.7
ports:
- 2181:2181
kafka:
image: quay.io/debezium/kafka:2.7
env:
ZOOKEEPER_CONNECT: "zookeeper:2181"
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
KAFKA_BROKER_ID: 1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9991
ports:
- 9092:9092
debezium:
image: quay.io/debezium/connect:2.7
env:
BOOTSTRAP_SERVERS: kafka:9092
GROUP_ID: 1
CONFIG_STORAGE_TOPIC: debezium-config
OFFSET_STORAGE_TOPIC: debezium-offset
STATUS_STORAGE_TOPIC: debezium-status
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
ports:
- 8083:8083
steps:
- uses: actions/checkout@v4
.github/workflows/pin-build-tools-image.yml (vendored, 60 lines changed)

@@ -7,12 +7,20 @@ on:
description: 'Source tag'
required: true
type: string
force:
description: 'Force the image to be pinned'
default: false
type: boolean
workflow_call:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
force:
description: 'Force the image to be pinned'
default: false
type: boolean

defaults:
run:
@@ -22,15 +30,18 @@ concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false

# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}

jobs:
tag-image:
runs-on: ubuntu-22.04
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned

env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
jobs:
check-manifests:
runs-on: ubuntu-22.04
outputs:
skip: ${{ steps.check-manifests.outputs.skip }}

steps:
- name: Check if we really need to pin the image
@@ -47,27 +58,44 @@ jobs:

echo "skip=${skip}" | tee -a $GITHUB_OUTPUT

tag-image:
needs: check-manifests

# use format(..) to catch both inputs.force = true AND inputs.force = 'true'
if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'

runs-on: ubuntu-22.04

permissions:
id-token: write # for `azure/login`

steps:
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'

with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}

- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}

- name: Login to ACR
run: |
az acr login --name=neoneastus2

- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
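The `format('{0}', inputs.force)` guard above exists because this workflow can now be started via `workflow_dispatch` or `workflow_call`, and, as the in-line comment notes, `inputs.force` may then surface either as a boolean `true` or as the string `'true'`. Wrapping it in `format()` stringifies the value so one comparison covers both cases. A reduced, illustrative sketch of the idea (not the exact workflow):

```yaml
on:
  workflow_dispatch:
    inputs:
      force: { description: 'Force the image to be pinned', default: false, type: boolean }
  workflow_call:
    inputs:
      force: { description: 'Force the image to be pinned', default: false, type: boolean }

jobs:
  tag-image:
    runs-on: ubuntu-22.04
    # format('{0}', ...) normalizes boolean true and string 'true' to the same string
    if: ${{ format('{0}', inputs.force) == 'true' }}
    steps:
      - run: echo "force requested, pinning anyway"
```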
.github/workflows/trigger-e2e-tests.yml (vendored, 38 lines changed)

@@ -13,8 +13,6 @@ defaults:
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

jobs:
cancel-previous-e2e-tests:
@@ -64,19 +62,35 @@ jobs:
needs: [ tag ]
runs-on: ubuntu-22.04
env:
EVENT_ACTION: ${{ github.event.action }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Wait for `promote-images` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
timeout-minutes: 60
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
if [ "$OUTPUT" == "" ]; then
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
exit 1
fi
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
exit 0
fi

# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
exit 1
;;
*)
echo "The 'promote-images' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
done

- name: Set e2e-platforms
Cargo.lock (generated, 371 lines changed)

@@ -484,7 +484,7 @@ dependencies = [
"http 0.2.9",
"http 1.1.0",
"once_cell",
"p256",
"p256 0.11.1",
"percent-encoding",
"ring 0.17.6",
"sha2",
@@ -848,6 +848,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"

[[package]]
name = "base16ct"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"

[[package]]
name = "base64"
version = "0.13.1"
@@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"

[[package]]
name = "bytemuck"
version = "1.16.0"
version = "1.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83"

[[package]]
name = "byteorder"
@@ -1526,8 +1532,10 @@ version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
dependencies = [
"generic-array",
"rand_core 0.6.4",
"subtle",
"zeroize",
]

[[package]]
@@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
dependencies = [
"const-oid",
"pem-rfc7468",
"zeroize",
]

@@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"const-oid",
"crypto-common",
"subtle",
]
@@ -1771,11 +1781,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
dependencies = [
"der 0.6.1",
"elliptic-curve",
"rfc6979",
"elliptic-curve 0.12.3",
"rfc6979 0.3.1",
"signature 1.6.4",
]

[[package]]
name = "ecdsa"
version = "0.16.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
dependencies = [
"der 0.7.8",
"digest",
"elliptic-curve 0.13.8",
"rfc6979 0.4.0",
"signature 2.2.0",
"spki 0.7.3",
]

[[package]]
name = "either"
version = "1.8.1"
@@ -1788,16 +1812,36 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
dependencies = [
"base16ct",
"base16ct 0.1.1",
"crypto-bigint 0.4.9",
"der 0.6.1",
"digest",
"ff",
"ff 0.12.1",
"generic-array",
"group",
"pkcs8",
"group 0.12.1",
"pkcs8 0.9.0",
"rand_core 0.6.4",
"sec1",
"sec1 0.3.0",
"subtle",
"zeroize",
]

[[package]]
name = "elliptic-curve"
version = "0.13.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
dependencies = [
"base16ct 0.2.0",
"crypto-bigint 0.5.5",
"digest",
"ff 0.13.0",
"generic-array",
"group 0.13.0",
"pem-rfc7468",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"sec1 0.7.3",
"subtle",
"zeroize",
]
@@ -1951,6 +1995,16 @@ dependencies = [
"subtle",
]

[[package]]
name = "ff"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449"
dependencies = [
"rand_core 0.6.4",
"subtle",
]

[[package]]
name = "filetime"
version = "0.2.22"
@@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
"zeroize",
]

[[package]]
@@ -2214,7 +2269,18 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
dependencies = [
"ff",
"ff 0.12.1",
"rand_core 0.6.4",
"subtle",
]

[[package]]
name = "group"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
dependencies = [
"ff 0.13.0",
"rand_core 0.6.4",
"subtle",
]
@@ -2776,6 +2842,42 @@ dependencies = [
"libc",
]

[[package]]
name = "jose-b64"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56"
dependencies = [
"base64ct",
"serde",
"subtle",
"zeroize",
]

[[package]]
name = "jose-jwa"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7"
dependencies = [
"serde",
]

[[package]]
name = "jose-jwk"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7"
dependencies = [
"jose-b64",
"jose-jwa",
"p256 0.13.2",
"p384",
"rsa",
"serde",
"zeroize",
]

[[package]]
name = "js-sys"
version = "0.3.69"
@@ -2835,6 +2937,9 @@ name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
dependencies = [
"spin 0.5.2",
]

[[package]]
name = "lazycell"
@@ -3204,6 +3309,23 @@ dependencies = [
"num-traits",
]

[[package]]
name = "num-bigint-dig"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
dependencies = [
"byteorder",
"lazy_static",
"libm",
"num-integer",
"num-iter",
"num-traits",
"rand 0.8.5",
"smallvec",
"zeroize",
]

[[package]]
name = "num-complex"
version = "0.4.4"
@@ -3481,11 +3603,33 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
dependencies = [
"ecdsa",
"elliptic-curve",
"ecdsa 0.14.8",
"elliptic-curve 0.12.3",
"sha2",
]

[[package]]
name = "p256"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
dependencies = [
"ecdsa 0.16.9",
"elliptic-curve 0.13.8",
"primeorder",
"sha2",
]

[[package]]
name = "p384"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209"
dependencies = [
"elliptic-curve 0.13.8",
"primeorder",
]

[[package]]
name = "pagebench"
version = "0.1.0"
@@ -3847,6 +3991,15 @@ dependencies = [
"serde",
]

[[package]]
name = "pem-rfc7468"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
dependencies = [
"base64ct",
]

[[package]]
name = "percent-encoding"
version = "2.2.0"
@@ -3863,6 +4016,29 @@ dependencies = [
"indexmap 1.9.3",
]

[[package]]
name = "pg_sni_router"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"git-version",
"itertools 0.10.5",
"pq_proto",
"proxy-core",
"proxy-sasl",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]

[[package]]
name = "phf"
version = "0.11.1"
@@ -3913,6 +4089,17 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

[[package]]
name = "pkcs1"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
dependencies = [
"der 0.7.8",
"pkcs8 0.10.2",
"spki 0.7.3",
]

[[package]]
name = "pkcs8"
version = "0.9.0"
@@ -3923,6 +4110,16 @@ dependencies = [
"spki 0.6.0",
]

[[package]]
name = "pkcs8"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"spki 0.7.3",
]

[[package]]
name = "pkg-config"
version = "0.3.27"
@@ -3960,7 +4157,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3973,7 +4170,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3992,7 +4189,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4116,6 +4313,15 @@ dependencies = [
"syn 2.0.52",
]

[[package]]
name = "primeorder"
version = "0.13.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
dependencies = [
"elliptic-curve 0.13.8",
]

[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
@@ -4230,9 +4436,38 @@ dependencies = [
[[package]]
name = "proxy"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"clap",
"futures",
"git-version",
"humantime",
"itertools 0.10.5",
"metrics",
"pq_proto",
"proxy-core",
"proxy-sasl",
"remote_storage",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tikv-jemallocator",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]

[[package]]
name = "proxy-core"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"atomic-take",
@@ -4250,11 +4485,11 @@ dependencies = [
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"ecdsa 0.16.9",
"env_logger",
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.14.5",
"hashlink",
"hex",
@@ -4270,12 +4505,14 @@ dependencies = [
"indexmap 2.0.1",
"ipnet",
"itertools 0.10.5",
"jose-jwa",
"jose-jwk",
"lasso",
"md5",
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
@@ -4284,7 +4521,7 @@ dependencies = [
"postgres-protocol",
"postgres_backend",
"pq_proto",
"prometheus",
"proxy-sasl",
"rand 0.8.5",
"rand_distr",
"rcgen",
@@ -4296,6 +4533,7 @@ dependencies = [
"reqwest-retry",
"reqwest-tracing",
"routerify",
"rsa",
"rstest",
"rustc-hash",
"rustls 0.22.4",
@@ -4305,6 +4543,7 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"signature 2.2.0",
"smallvec",
"smol_str",
"socket2 0.5.5",
@@ -4312,7 +4551,6 @@ dependencies = [
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -4324,6 +4562,7 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"try-lock",
"typed-json",
"url",
"urlencoding",
@@ -4334,6 +4573,35 @@ dependencies = [
"x509-parser",
]

[[package]]
name = "proxy-sasl"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"bytes",
"crossbeam-deque",
"hmac",
"itertools 0.10.5",
"lasso",
"measured",
"parking_lot 0.12.1",
"pbkdf2",
"postgres-protocol",
"pq_proto",
"rand 0.8.5",
"rustls 0.22.4",
"sha2",
"subtle",
"thiserror",
"tokio",
"tracing",
"uuid",
"workspace_hack",
"x509-parser",
]

[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -4806,6 +5074,16 @@ dependencies = [
"zeroize",
]

[[package]]
name = "rfc6979"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
dependencies = [
"hmac",
"subtle",
]

[[package]]
name = "ring"
version = "0.16.20"
@@ -4866,6 +5144,26 @@ dependencies = [
"archery",
]

[[package]]
name = "rsa"
version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
dependencies = [
"const-oid",
"digest",
"num-bigint-dig",
"num-integer",
"num-traits",
"pkcs1",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"signature 2.2.0",
"spki 0.7.3",
"subtle",
"zeroize",
]

[[package]]
name = "rstest"
version = "0.18.2"
@@ -5194,10 +5492,24 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
dependencies = [
"base16ct",
"base16ct 0.1.1",
"der 0.6.1",
"generic-array",
"pkcs8",
"pkcs8 0.9.0",
"subtle",
"zeroize",
]

[[package]]
name = "sec1"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
dependencies = [
"base16ct 0.2.0",
"der 0.7.8",
"generic-array",
"pkcs8 0.10.2",
"subtle",
"zeroize",
]
@@ -5544,6 +5856,7 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [
"digest",
"rand_core 0.6.4",
]

@@ -6186,7 +6499,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"async-trait",
"byteorder",
@@ -6563,9 +6876,9 @@ dependencies = [

[[package]]
name = "try-lock"
version = "0.2.4"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"

[[package]]
name = "tungstenite"
@@ -7378,13 +7691,17 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
"digest",
"either",
"fail",
"futures-channel",
"futures-executor",
"futures-io",
"futures-util",
"generic-array",
"getrandom 0.2.11",
"hashbrown 0.14.5",
"hex",
@@ -7392,6 +7709,7 @@ dependencies = [
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools 0.10.5",
"lazy_static",
"libc",
"log",
"memchr",
@@ -7415,7 +7733,9 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"signature 2.2.0",
"smallvec",
"spki 0.7.3",
"subtle",
"syn 1.0.109",
"syn 2.0.52",
@@ -7526,6 +7846,7 @@ version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
dependencies = [
"serde",
"zeroize_derive",
]
@@ -9,7 +9,10 @@ members = [
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"proxy/core",
"proxy/sasl",
"proxy/proxy",
"proxy/pg_sni_router",
"safekeeper",
"storage_broker",
"storage_controller",
@@ -184,6 +187,7 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
Dockerfile (21 lines changed)

@@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

ENV BUILD_TYPE release
ENV BUILD_TYPE=release
RUN set -e \
&& mold -run make -j $(nproc) -s neon-pg-ext \
&& rm -rf pg_install/build \
@@ -29,24 +29,12 @@ WORKDIR /home/nonroot
ARG GIT_VERSION=local
ARG BUILD_TAG

# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY

COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --chown=nonroot . .

# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
@@ -58,8 +46,7 @@ RUN set -e \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--locked --release \
&& cachepot -s
--locked --release

# Build final image
#
@@ -104,7 +91,7 @@ RUN mkdir -p /data/.neon/ && \

# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH /usr/local/v16/lib
ENV LD_LIBRARY_PATH=/usr/local/v16/lib

VOLUME ["/data"]
@@ -112,5 +99,5 @@ USER neon
EXPOSE 6400
EXPOSE 9898

CMD /usr/local/bin/pageserver -D /data/.neon
CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
@@ -58,7 +58,7 @@ RUN set -e \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# protobuf-compiler (protoc)
ENV PROTOC_VERSION 25.1
ENV PROTOC_VERSION=25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
&& unzip -q protoc.zip -d protoc \
&& mv protoc/bin/protoc /usr/local/bin/protoc \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip

# Mold: A Modern Linker
ENV MOLD_VERSION v2.31.0
ENV MOLD_VERSION=v2.33.0
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \
@@ -168,7 +168,7 @@ USER nonroot:nonroot
WORKDIR /home/nonroot

# Python
ENV PYTHON_VERSION=3.9.18 \
ENV PYTHON_VERSION=3.9.19 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
@@ -192,9 +192,14 @@ WORKDIR /home/nonroot

# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.80.0
ENV RUSTC_VERSION=1.80.1
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.30
ARG CARGO_DENY_VERSION=0.16.1
ARG CARGO_HACK_VERSION=0.6.31
ARG CARGO_NEXTEST_VERSION=0.9.72
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -203,15 +208,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
. "$HOME/.cargo/env" && \
cargo --version && rustup --version && \
rustup component add llvm-tools-preview rustfmt clippy && \
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny --locked && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER=cachepot

# Show versions
RUN whoami \
@@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /

ENV PATH "/usr/local/pgsql/bin:$PATH"
ENV PATH="/usr/local/pgsql/bin:$PATH"

RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin:$PATH"
ENV PATH="/usr/local/pgsql/bin:$PATH"

RUN case "${PG_VERSION}" in \
"v14" | "v15") \
@@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin:$PATH"
ENV PATH="/usr/local/pgsql/bin:$PATH"

RUN case "${PG_VERSION}" in \
"v14") \
@@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \
FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -506,7 +506,7 @@ RUN apt-get update && \
libboost-system1.74-dev \
libeigen3-dev

ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
@@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
FROM build-deps AS pg-uuidv7-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
FROM build-deps AS pg-roaringbitmap-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
FROM build-deps AS pg-semver-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
@@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
@@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
@@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
FROM build-deps AS wal2json-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
@@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
FROM build-deps AS pg-ivm-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
FROM build-deps AS pg-partman-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch
@@ -1032,6 +1034,6 @@ RUN apt update && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

ENV LANG en_US.utf8
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
@@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf {
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
|
||||
pub max_secondary_lag_bytes: Option<u64>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf {
|
||||
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
||||
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
||||
split_threshold: None,
|
||||
max_secondary_lag_bytes: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -383,6 +383,10 @@ impl StorageController {
|
||||
args.push(format!("--split-threshold={split_threshold}"))
|
||||
}
|
||||
|
||||
if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
|
||||
args.push(format!("--max-secondary-lag-bytes={lag}"))
|
||||
}
|
||||
|
||||
args.push(format!(
|
||||
"--neon-local-repo-dir={}",
|
||||
self.env.base_data_dir.display()
|
||||
|
||||
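As a rough illustration of how the two new storage controller settings above end up on the command line: the flag names and --neon-local-repo-dir come from the hunk itself, while the binary name and the concrete values below are only assumptions for the sake of the sketch.

# Hypothetical invocation; only the flag names are taken from the diff above.
storage_controller \
    --split-threshold=68719476736 \
    --max-secondary-lag-bytes=10737418240 \
    --neon-local-repo-dir=/path/to/.neon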
15 deny.toml
@@ -4,6 +4,7 @@
|
||||
# to your expectations and requirements.
|
||||
|
||||
# Root options
|
||||
[graph]
|
||||
targets = [
|
||||
{ triple = "x86_64-unknown-linux-gnu" },
|
||||
{ triple = "aarch64-unknown-linux-gnu" },
|
||||
@@ -12,6 +13,7 @@ targets = [
|
||||
]
|
||||
all-features = false
|
||||
no-default-features = false
|
||||
[output]
|
||||
feature-depth = 1
|
||||
|
||||
# This section is considered when running `cargo deny check advisories`
|
||||
@@ -19,17 +21,16 @@ feature-depth = 1
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
|
||||
[advisories]
|
||||
db-urls = ["https://github.com/rustsec/advisory-db"]
|
||||
vulnerability = "deny"
|
||||
unmaintained = "warn"
|
||||
yanked = "warn"
|
||||
notice = "warn"
|
||||
ignore = []
|
||||
|
||||
[[advisories.ignore]]
|
||||
id = "RUSTSEC-2023-0071"
|
||||
reason = "the marvin attack only affects private key decryption, not public key signature verification"
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
|
||||
[licenses]
|
||||
unlicensed = "deny"
|
||||
allow = [
|
||||
"Apache-2.0",
|
||||
"Artistic-2.0",
|
||||
@@ -42,10 +43,6 @@ allow = [
|
||||
"OpenSSL",
|
||||
"Unicode-DFS-2016",
|
||||
]
|
||||
deny = []
|
||||
copyleft = "warn"
|
||||
allow-osi-fsf-free = "neither"
|
||||
default = "deny"
|
||||
confidence-threshold = 0.8
|
||||
exceptions = [
|
||||
# Zlib license has some restrictions if we decide to change sth
|
||||
|
||||
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
|
||||
@@ -1,15 +1,15 @@
#!/bin/bash
set -x

cd /ext-src
cd /ext-src || exit 2
FAILED=
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d ${d} ] || continue
[ -d "${d}" ] || continue
psql -c "select 1" >/dev/null || break
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
done
[ -z "${FAILED}" ] && exit 0
echo ${FAILED}
echo "${FAILED}"
exit 1
@@ -1,13 +1,18 @@
# Summary

# Looking for `neon.tech` docs?

This page links to a selection of technical content about the open source code in this repository.

Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
in this repository.

# Architecture

[Introduction]()
- [Separation of Compute and Storage](./separation-compute-storage.md)

# Architecture

- [Compute]()
- [WAL proposer]()
- [WAL Backpressure]()
- [Postgres changes](./core_changes.md)

- [Pageserver](./pageserver.md)
@@ -16,33 +21,15 @@
- [WAL Redo](./pageserver-walredo.md)
- [Page cache](./pageserver-pagecache.md)
- [Storage](./pageserver-storage.md)
- [Datadir mapping]()
- [Layer files]()
- [Branching]()
- [Garbage collection]()
- [Cloud Storage]()
- [Processing a GetPage request](./pageserver-processing-getpage.md)
- [Processing WAL](./pageserver-processing-wal.md)
- [Management API]()
- [Tenant Rebalancing]()

- [WAL Service](walservice.md)
- [Consensus protocol](safekeeper-protocol.md)
- [Management API]()
- [Rebalancing]()

- [Control Plane]()

- [Proxy]()

- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging](./error-handling.md)
- [Testing]()
- [Unit testing]()
- [Integration testing]()
- [Benchmarks]()

- [Glossary](./glossary.md)
@@ -58,28 +45,6 @@

# RFCs

- [RFCs](./rfcs/README.md)

- [002-storage](rfcs/002-storage.md)
- [003-laptop-cli](rfcs/003-laptop-cli.md)
- [004-durability](rfcs/004-durability.md)
- [005-zenith_local](rfcs/005-zenith_local.md)
- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
- [008-push-pull](rfcs/008-push-pull.md)
- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
- [010-storage_details](rfcs/010-storage_details.md)
- [011-retention-policy](rfcs/011-retention-policy.md)
- [012-background-tasks](rfcs/012-background-tasks.md)
- [013-term-history](rfcs/013-term-history.md)
- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
- [014-storage-lsm](rfcs/014-storage-lsm.md)
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)
Major changes are documented in RFCs:
- See [RFCs](./rfcs/README.md) for more information
- View the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs

@@ -107,7 +107,10 @@ impl Key {
|
||||
/// As long as Neon does not support tablespaces (because of the lack of access to the local file system),
/// we can assume that only some predefined namespace OIDs are used, which can fit in u16
|
||||
pub fn to_i128(&self) -> i128 {
|
||||
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||
assert!(
|
||||
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
|
||||
"invalid key: {self}",
|
||||
);
|
||||
(((self.field1 & 0x7F) as i128) << 120)
|
||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||
| ((self.field3 as i128) << 72)
|
||||
|
||||
@@ -637,6 +637,13 @@ pub struct TenantInfo {
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
pub generation: u32,
|
||||
|
||||
/// Opaque explanation if gc is being blocked.
|
||||
///
|
||||
/// Only looked up for the individual tenant detail, not the listing. This is purely for
|
||||
/// debugging, not included in openapi.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_blocking: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -940,6 +947,8 @@ pub struct TopTenantShardsResponse {
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(
|
||||
Copy,
|
||||
Clone,
|
||||
@@ -958,6 +967,53 @@ pub mod virtual_file {
|
||||
#[cfg(target_os = "linux")]
|
||||
TokioEpollUring,
|
||||
}
|
||||
|
||||
/// Direct IO modes for a pageserver.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum DirectIoMode {
|
||||
/// Direct IO disabled (uses usual buffered IO).
|
||||
#[default]
|
||||
Disabled,
|
||||
/// Direct IO is still disabled, but alignment checks and latency padding are performed to evaluate readiness.
|
||||
Evaluate {
|
||||
/// Alignment check level
|
||||
alignment_check: DirectIoAlignmentCheckLevel,
|
||||
/// Latency padding for performance simulation.
|
||||
latency_padding: DirectIoLatencyPadding,
|
||||
},
|
||||
/// Direct IO enabled.
|
||||
Enabled {
|
||||
/// Actions to perform on alignment error.
|
||||
on_alignment_error: DirectIoOnAlignmentErrorAction,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DirectIoAlignmentCheckLevel {
|
||||
#[default]
|
||||
Error,
|
||||
Log,
|
||||
None,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DirectIoOnAlignmentErrorAction {
|
||||
Error,
|
||||
#[default]
|
||||
FallbackToBuffered,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(tag = "type", rename_all = "kebab-case")]
|
||||
pub enum DirectIoLatencyPadding {
|
||||
/// Pad virtual file operations with IO to a fake file.
|
||||
FakeFileRW { path: PathBuf },
|
||||
#[default]
|
||||
None,
|
||||
}
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -1427,6 +1483,7 @@ mod tests {
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
gc_blocking: None,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1449,6 +1506,7 @@ mod tests {
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
gc_blocking: None,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct AncestorDetached {
|
||||
pub reparented_timelines: Vec<TimelineId>,
|
||||
pub reparented_timelines: HashSet<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -144,7 +144,20 @@ impl PgConnectionConfig {
|
||||
// implement and this function is hardly a bottleneck. The function is only called around
|
||||
// establishing a new connection.
|
||||
#[allow(unstable_name_collisions)]
|
||||
config.options(&encode_options(&self.options));
|
||||
config.options(
|
||||
&self
|
||||
.options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>(),
|
||||
);
|
||||
}
|
||||
config
|
||||
}
|
||||
@@ -165,21 +178,6 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unstable_name_collisions)]
|
||||
fn encode_options(options: &[String]) -> String {
|
||||
options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_pg_connection_config {
|
||||
use crate::{encode_options, PgConnectionConfig};
|
||||
use crate::PgConnectionConfig;
|
||||
use once_cell::sync::Lazy;
|
||||
use url::Host;
|
||||
|
||||
@@ -257,12 +255,18 @@ mod tests_pg_connection_config {
|
||||
|
||||
#[test]
|
||||
fn test_with_options() {
|
||||
let options = encode_options(&[
|
||||
"hello".to_owned(),
|
||||
"world".to_owned(),
|
||||
"with space".to_owned(),
|
||||
"and \\ backslashes".to_owned(),
|
||||
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
|
||||
"hello",
|
||||
"world",
|
||||
"with space",
|
||||
"and \\ backslashes",
|
||||
]);
|
||||
assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
|
||||
assert_eq!(cfg.host(), &*STUB_HOST);
|
||||
assert_eq!(cfg.port(), 123);
|
||||
assert_eq!(cfg.raw_address(), "stub.host.example:123");
|
||||
assert_eq!(
|
||||
cfg.to_tokio_postgres_config().get_options(),
|
||||
Some("hello world with\\ space and\\ \\\\\\ backslashes")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ pub mod circuit_breaker;
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
|
||||
/// We used `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
|
||||
@@ -78,8 +78,9 @@ impl Drop for GateGuard {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum GateError {
|
||||
#[error("gate is closed")]
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
|
||||
@@ -108,3 +108,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_walredo"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
239 pageserver/benches/bench_ingest.rs Normal file
@@ -0,0 +1,239 @@
|
||||
use std::{env, num::NonZeroUsize};
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
// A very cheap hash for generating non-sequential keys.
|
||||
fn murmurhash32(mut h: u32) -> u32 {
|
||||
h ^= h >> 16;
|
||||
h = h.wrapping_mul(0x85ebca6b);
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(0xc2b2ae35);
|
||||
h ^= h >> 16;
|
||||
h
|
||||
}
|
||||
|
||||
enum KeyLayout {
|
||||
/// Sequential unique keys
|
||||
Sequential,
|
||||
/// Random unique keys
|
||||
Random,
|
||||
/// Random keys, but only the bits selected by the mask are used (limits key cardinality)
|
||||
RandomReuse(u32),
|
||||
}
|
||||
|
||||
enum WriteDelta {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
async fn ingest(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = utils::lsn::Lsn(1000);
|
||||
let mut key = Key::from_i128(0x0);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||
let ctx = RequestContext::new(
|
||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
for i in 0..put_count {
|
||||
lsn += put_size as u64;
|
||||
|
||||
// Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
|
||||
// usually care the most about write performance when they're blasting a huge batch of data into a huge table.
|
||||
match key_layout {
|
||||
KeyLayout::Sequential => {
|
||||
// Use sequential order to illustrate the experience a user is likely to have
|
||||
// when ingesting bulk data.
|
||||
key.field6 = i as u32;
|
||||
}
|
||||
KeyLayout::Random => {
|
||||
// Use random-order keys to avoid giving a false advantage to data structures that are
|
||||
// faster when inserting on the end.
|
||||
key.field6 = murmurhash32(i as u32);
|
||||
}
|
||||
KeyLayout::RandomReuse(mask) => {
|
||||
// Use low bits only, to limit cardinality
|
||||
key.field6 = murmurhash32(i as u32) & mask;
|
||||
}
|
||||
}
|
||||
|
||||
layer.put_value(key, lsn, &data, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
if matches!(write_delta, WriteDelta::Yes) {
|
||||
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
|
||||
max_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
});
|
||||
let (_desc, path) = layer
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner())
|
||||
.await?
|
||||
.unwrap();
|
||||
tokio::fs::remove_file(path).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wrapper to instantiate a tokio runtime
|
||||
fn ingest_main(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
) {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
runtime.block_on(async move {
|
||||
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Declare a series of benchmarks for the Pageserver's ingest write path.
|
||||
///
|
||||
/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
|
||||
/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
|
||||
///
|
||||
/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on
|
||||
/// a fast disk, CPU is the bottleneck at time of writing.
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
|
||||
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
|
||||
eprintln!("Data directory: {}", temp_dir.path());
|
||||
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||
));
|
||||
virtual_file::init(16384, virtual_file::io_engine_for_bench());
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-small-values");
|
||||
let put_size = 100usize;
|
||||
let put_count = 128 * 1024 * 1024 / put_size;
|
||||
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function("ingest 128MB/100b seq", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b rand", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Random,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::RandomReuse(0x3ff),
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::No,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-big-values");
|
||||
let put_size = 8192usize;
|
||||
let put_count = 128 * 1024 * 1024 / put_size;
|
||||
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function("ingest 128MB/8k seq", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::No,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_main!(benches);
|
||||
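A minimal sketch of how the new ingest benchmark above can be run with Criterion, assuming the [[bench]] entry lives in the pageserver crate and a local Rust toolchain is available:

# Run the whole bench_ingest suite (package name "pageserver" is assumed).
cargo bench --package pageserver --bench bench_ingest
# Or filter down to one group, e.g. the small-value ingest cases.
cargo bench --package pageserver --bench bench_ingest -- ingest-small-values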
@@ -123,6 +123,7 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.get_impl, "starting with get page implementation");
|
||||
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
|
||||
@@ -300,6 +300,9 @@ pub struct PageServerConf {
|
||||
/// This flag is temporary and will be removed after gradual rollout.
|
||||
/// See <https://github.com/neondatabase/neon/issues/8184>.
|
||||
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -408,6 +411,8 @@ struct PageServerConfigBuilder {
|
||||
l0_flush: BuilderValue<L0FlushConfig>,
|
||||
|
||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||
|
||||
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -498,6 +503,7 @@ impl PageServerConfigBuilder {
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -685,6 +691,10 @@ impl PageServerConfigBuilder {
|
||||
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
|
||||
self.virtual_file_direct_io = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -743,6 +753,7 @@ impl PageServerConfigBuilder {
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
l0_flush,
|
||||
compact_level0_phase1_value_access,
|
||||
virtual_file_direct_io,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1018,6 +1029,9 @@ impl PageServerConf {
|
||||
"compact_level0_phase1_value_access" => {
|
||||
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
|
||||
}
|
||||
"virtual_file_direct_io" => {
|
||||
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1103,6 +1117,7 @@ impl PageServerConf {
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1345,6 +1360,7 @@ background_task_maximum_delay = '334 s'
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1420,6 +1436,7 @@ background_task_maximum_delay = '334 s'
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -308,6 +308,45 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Persistently add a tenant-level gc block on behalf of this timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Persistently remove the tenant-level gc block for this timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
@@ -893,7 +932,7 @@ components:
|
||||
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
|
||||
ArchivalConfigRequest:
|
||||
type: object
|
||||
required
|
||||
required:
|
||||
- state
|
||||
properties:
|
||||
state:
|
||||
|
||||
@@ -935,6 +935,7 @@ async fn tenant_list_handler(
|
||||
generation: (*gen)
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
gc_blocking: None,
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -986,6 +987,7 @@ async fn tenant_status(
|
||||
.generation()
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
@@ -1160,7 +1162,10 @@ async fn layer_map_info_handler(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||
let layer_map_info = timeline
|
||||
.layer_map_info(reset)
|
||||
.await
|
||||
.map_err(|_shutdown| ApiError::ShuttingDown)?;
|
||||
|
||||
json_response(StatusCode::OK, layer_map_info)
|
||||
}
|
||||
@@ -1226,6 +1231,72 @@ async fn evict_timeline_layer_handler(
|
||||
}
|
||||
}
|
||||
|
||||
async fn timeline_gc_blocking_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
block_or_unblock_gc(request, true).await
|
||||
}
|
||||
|
||||
async fn timeline_gc_unblocking_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
block_or_unblock_gc(request, false).await
|
||||
}
|
||||
|
||||
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
|
||||
///
|
||||
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
|
||||
async fn block_or_unblock_gc(
|
||||
request: Request<Body>,
|
||||
block: bool,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::{
|
||||
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
|
||||
};
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let fut = async {
|
||||
if block {
|
||||
timeline.block_gc(&tenant).await.map(|_| ())
|
||||
} else {
|
||||
timeline.unblock_gc(&tenant).await
|
||||
}
|
||||
};
|
||||
|
||||
let span = tracing::info_span!(
|
||||
"block_or_unblock_gc",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
timeline_id = %timeline_id,
|
||||
block = block,
|
||||
);
|
||||
|
||||
let res = fut.instrument(span).await;
|
||||
|
||||
res.map_err(|e| {
|
||||
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
|
||||
ApiError::ShuttingDown
|
||||
} else {
|
||||
ApiError::InternalServerError(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Get tenant_size SVG graph along with the JSON data.
|
||||
fn synthetic_size_html_response(
|
||||
inputs: ModelInputs,
|
||||
@@ -2904,6 +2975,14 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| api_handler(r, evict_timeline_layer_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|
||||
|r| api_handler(r, timeline_gc_blocking_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| api_handler(r, timeline_gc_unblocking_handler),
|
||||
)
|
||||
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
|
||||
api_handler(r, secondary_upload_handler)
|
||||
})
|
||||
|
||||
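A quick sketch of calling the two gc blocking endpoints registered above against the pageserver management API; the host, port and ids below are placeholders, and any auth header your deployment requires is omitted:

# Block gc for one timeline (persisted via index_part.json), then unblock it.
curl -X POST http://localhost:9898/v1/tenant/<tenant_shard_id>/timeline/<timeline_id>/block_gc
curl -X POST http://localhost:9898/v1/tenant/<tenant_shard_id>/timeline/<timeline_id>/unblock_gc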
@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
|
||||
#[derive(Clone)]
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub(crate) enum Inner {
|
||||
pub enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn inner(&self) -> &Arc<Inner> {
|
||||
pub fn inner(&self) -> &Arc<Inner> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,7 +56,6 @@ impl Statvfs {
|
||||
}
|
||||
|
||||
pub mod mock {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use regex::Regex;
|
||||
use tracing::log::info;
|
||||
@@ -135,14 +134,30 @@ pub mod mock {
|
||||
{
|
||||
continue;
|
||||
}
|
||||
total += entry
|
||||
.metadata()
|
||||
.with_context(|| format!("get metadata of {:?}", entry.path()))?
|
||||
.len();
|
||||
let m = match entry.metadata() {
|
||||
Ok(m) => m,
|
||||
Err(e) if is_not_found(&e) => {
|
||||
// some temp file which got removed right as we are walking
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(anyhow::Error::new(e)
|
||||
.context(format!("get metadata of {:?}", entry.path())))
|
||||
}
|
||||
};
|
||||
total += m.len();
|
||||
}
|
||||
Ok(total)
|
||||
}
|
||||
|
||||
fn is_not_found(e: &walkdir::Error) -> bool {
|
||||
let Some(io_error) = e.io_error() else {
|
||||
return false;
|
||||
};
|
||||
let kind = io_error.kind();
|
||||
matches!(kind, std::io::ErrorKind::NotFound)
|
||||
}
|
||||
|
||||
pub struct Statvfs {
|
||||
pub blocks: u64,
|
||||
pub blocks_available: u64,
|
||||
|
||||
@@ -148,6 +148,7 @@ pub(crate) mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
mod gc_block;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -303,6 +304,12 @@ pub struct Tenant {
|
||||
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
|
||||
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
|
||||
|
||||
/// `index_part.json` based gc blocking reason tracking.
|
||||
///
|
||||
/// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
|
||||
/// proceeding.
|
||||
pub(crate) gc_block: gc_block::GcBlock,
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -594,6 +601,12 @@ impl From<PageReconstructError> for GcError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<timeline::layer_manager::Shutdown> for GcError {
|
||||
fn from(_: timeline::layer_manager::Shutdown) -> Self {
|
||||
GcError::TimelineCancelled
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum LoadConfigError {
|
||||
#[error("TOML deserialization error: '{0}'")]
|
||||
@@ -703,6 +716,7 @@ impl Tenant {
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.expect("currently loading, layer manager cannot be shutdown already")
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.is_some(),
|
||||
@@ -1036,6 +1050,8 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let mut gc_blocks = HashMap::new();
|
||||
|
||||
// For every timeline, download the metadata file, scan the local directory,
|
||||
// and build a layer map that contains an entry for each remote and local
|
||||
// layer file.
|
||||
@@ -1045,6 +1061,16 @@ impl Tenant {
|
||||
.remove(&timeline_id)
|
||||
.expect("just put it in above");
|
||||
|
||||
if let Some(blocking) = index_part.gc_blocking.as_ref() {
|
||||
// could just filter these away, but it helps while testing
|
||||
anyhow::ensure!(
|
||||
!blocking.reasons.is_empty(),
|
||||
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
|
||||
);
|
||||
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
|
||||
assert!(prev.is_none());
|
||||
}
|
||||
|
||||
// TODO again handle early failure
|
||||
self.load_remote_timeline(
|
||||
timeline_id,
|
||||
@@ -1089,6 +1115,8 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
self.gc_block.set_scanned(gc_blocks);
|
||||
|
||||
fail::fail_point!("attach-before-activate", |_| {
|
||||
anyhow::bail!("attach-before-activate");
|
||||
});
|
||||
@@ -1679,6 +1707,14 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let _guard = match self.gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(reasons) => {
|
||||
info!("Skipping GC: {reasons}");
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
};
|
||||
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
|
||||
.await
|
||||
}
|
||||
@@ -2691,6 +2727,7 @@ impl Tenant {
|
||||
)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
}
|
||||
}
|
||||
@@ -2975,54 +3012,6 @@ impl Tenant {
|
||||
// because that will stall branch creation.
|
||||
let gc_cs = self.gc_cs.lock().await;
|
||||
|
||||
// Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
|
||||
// depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
|
||||
// and fail out if it's inaccurate.
|
||||
// (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
|
||||
{
|
||||
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
|
||||
BTreeMap::new();
|
||||
timelines.iter().for_each(|timeline| {
|
||||
if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
|
||||
let ancestor_children =
|
||||
all_branchpoints.entry(*ancestor_timeline_id).or_default();
|
||||
ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
|
||||
}
|
||||
});
|
||||
|
||||
for timeline in &timelines {
|
||||
let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
|
||||
.remove(&timeline.timeline_id)
|
||||
.unwrap_or_default();
|
||||
|
||||
branchpoints.sort_by_key(|b| b.0);
|
||||
|
||||
let target = timeline.gc_info.read().unwrap();
|
||||
|
||||
// We require that retain_lsns contains everything in `branchpoints`, but not that
|
||||
// they are exactly equal: timeline deletions can race with us, so retain_lsns
|
||||
// may contain some extra stuff. It is safe to have extra timelines in there, because it
|
||||
// just means that we retain slightly more data than we otherwise might.
|
||||
let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
|
||||
for b in &branchpoints {
|
||||
if !have_branchpoints.contains(b) {
|
||||
tracing::error!(
|
||||
"Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}",
|
||||
branchpoints,
|
||||
target.retain_lsns
|
||||
);
|
||||
debug_assert!(false);
|
||||
// Do not GC based on bad information!
|
||||
// (ab-use an existing GcError type rather than adding a new one, since this is a
|
||||
// "should never happen" check that will be removed soon).
|
||||
return Err(GcError::Remote(anyhow::anyhow!(
|
||||
"retain_lsns failed validation!"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we now know all the branch points.
|
||||
// Update the GC information for each timeline.
|
||||
let mut gc_timelines = Vec::with_capacity(timelines.len());
|
||||
@@ -4092,7 +4081,7 @@ pub(crate) mod harness {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
@@ -4644,10 +4633,10 @@ mod tests {
|
||||
|
||||
let layer_map = tline.layers.read().await;
|
||||
let level0_deltas = layer_map
|
||||
.layer_map()
|
||||
.get_level0_deltas()
|
||||
.into_iter()
|
||||
.map(|desc| layer_map.get_from_desc(&desc))
|
||||
.layer_map()?
|
||||
.level0_deltas()
|
||||
.iter()
|
||||
.map(|desc| layer_map.get_from_desc(desc))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert!(!level0_deltas.is_empty());
|
||||
@@ -4767,7 +4756,7 @@ mod tests {
|
||||
lsn: Lsn,
|
||||
repeat: usize,
|
||||
key_count: usize,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
|
||||
let compact = true;
|
||||
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
|
||||
}
|
||||
@@ -4780,7 +4769,9 @@ mod tests {
|
||||
repeat: usize,
|
||||
key_count: usize,
|
||||
compact: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
|
||||
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
|
||||
|
||||
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let mut blknum = 0;
|
||||
|
||||
@@ -4801,6 +4792,7 @@ mod tests {
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
inserted.entry(test_key).or_default().insert(lsn);
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
@@ -4825,7 +4817,7 @@ mod tests {
|
||||
assert_eq!(res.layers_removed, 0, "this never removes anything");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(inserted)
|
||||
}
|
||||
|
||||
//
|
||||
@@ -4872,14 +4864,16 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let guard = tline.layers.read().await;
|
||||
guard.layer_map().dump(true, &ctx).await?;
|
||||
let lm = guard.layer_map()?;
|
||||
|
||||
lm.dump(true, &ctx).await?;
|
||||
|
||||
let mut reads = Vec::new();
|
||||
let mut prev = None;
|
||||
guard.layer_map().iter_historic_layers().for_each(|desc| {
|
||||
lm.iter_historic_layers().for_each(|desc| {
|
||||
if !desc.is_delta() {
|
||||
prev = Some(desc.clone());
|
||||
return;
|
||||
@@ -4933,9 +4927,39 @@ mod tests {
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
tline
|
||||
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
|
||||
let mut expect_missing = false;
|
||||
let mut key = read.start().unwrap();
|
||||
while key != read.end().unwrap() {
|
||||
if let Some(lsns) = inserted.get(&key) {
|
||||
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
|
||||
match expected_lsn {
|
||||
Some(lsn) => {
|
||||
expected_lsns.insert(key, *lsn);
|
||||
}
|
||||
None => {
|
||||
expect_missing = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
expect_missing = true;
|
||||
break;
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
if expect_missing {
|
||||
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
|
||||
} else {
|
||||
for (key, image) in vectored_res? {
|
||||
let expected_lsn = expected_lsns.get(&key).expect("determined above");
|
||||
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
|
||||
assert_eq!(image?, expected_image);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4985,10 +5009,6 @@ mod tests {
|
||||
)
|
||||
.await;
|
||||
|
||||
child_timeline
|
||||
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
let images = vectored_res?;
|
||||
assert!(images.is_empty());
|
||||
Ok(())
|
||||
@@ -5859,23 +5879,12 @@ mod tests {
|
||||
tline.freeze_and_flush().await?; // force create a delta layer
|
||||
}
|
||||
|
||||
let before_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.get_level0_deltas()
|
||||
.len();
|
||||
let before_num_l0_delta_files =
|
||||
tline.layers.read().await.layer_map()?.level0_deltas().len();
|
||||
|
||||
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
|
||||
|
||||
let after_num_l0_delta_files = tline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.layer_map()
|
||||
.get_level0_deltas()
|
||||
.len();
|
||||
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
|
||||
|
||||
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
|
||||
|
||||
@@ -6899,7 +6908,10 @@ mod tests {
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for (idx, expected) in expected_result.iter().enumerate() {
|
||||
assert_eq!(
|
||||
@@ -6993,7 +7005,10 @@ mod tests {
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
guard.cutoffs.space = Lsn(0x40);
|
||||
}
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -7327,7 +7342,10 @@ mod tests {
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
@@ -7353,7 +7371,10 @@ mod tests {
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
guard.cutoffs.space = Lsn(0x40);
|
||||
}
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -7898,11 +7919,28 @@ mod tests {
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
let mut dryrun_flags = EnumSet::new();
|
||||
dryrun_flags.insert(CompactFlags::DryRun);
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
// We expect the layer map to be the same because of the dry run flag, but we don't know whether there will be other background jobs
|
||||
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
||||
verify_result().await;
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// compact again
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// increase GC horizon and compact again
|
||||
@@ -7912,11 +7950,17 @@ mod tests {
|
||||
guard.cutoffs.time = Lsn(0x38);
|
||||
guard.cutoffs.space = Lsn(0x38);
|
||||
}
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
|
||||
|
||||
// not increasing the GC horizon and compact again
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
@@ -8097,7 +8141,10 @@ mod tests {
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
branch_tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
verify_result().await;
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ impl EphemeralFile {
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
|
||||
@@ -51,10 +52,12 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let prewarm = conf.l0_flush.prewarm_on_write();
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
|
||||
rw: page_caching::RW::new(file, prewarm, gate_guard),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -161,7 +164,11 @@ mod tests {
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo", &ctx).await?;
|
||||
assert_eq!(
|
||||
@@ -215,4 +222,38 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn ephemeral_file_holds_gate_open() {
|
||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
|
||||
|
||||
let (conf, tenant_id, timeline_id, ctx) =
|
||||
harness("ephemeral_file_holds_gate_open").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut closing = tokio::task::spawn(async move {
|
||||
gate.close().await;
|
||||
});
|
||||
|
||||
// gate is entered until the ephemeral file is dropped
|
||||
// do not start paused: tokio-epoll-uring has a sleep loop
|
||||
tokio::time::pause();
|
||||
tokio::time::timeout(FOREVER, &mut closing)
|
||||
.await
|
||||
.expect_err("closing cannot complete before dropping");
|
||||
|
||||
// this is a requirement of the reset_tenant functionality: we have to be able to restart a
|
||||
// tenant fast, and for that, we need all tenant_dir operations to be guarded by entering a gate
|
||||
drop(file);
|
||||
|
||||
tokio::time::timeout(FOREVER, &mut closing)
|
||||
.await
|
||||
.expect("closing completes right away")
|
||||
.expect("closing does not panic");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,8 @@ use super::zero_padded_read_write;
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
|
||||
@@ -29,7 +31,11 @@ pub enum PrewarmOnWrite {
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
|
||||
pub fn new(
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
@@ -38,6 +44,7 @@ impl RW {
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,6 +152,7 @@ impl Drop for RW {
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
|
||||
213 pageserver/src/tenant/gc_block.rs Normal file
@@ -0,0 +1,213 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
|
||||
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcBlock {
|
||||
/// The timelines which have current reasons to block gc.
|
||||
///
|
||||
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
|
||||
/// to keep this field up to date with RemoteTimelineClient `upload_queue.dirty`.
|
||||
reasons: std::sync::Mutex<Storage>,
|
||||
blocking: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl GcBlock {
|
||||
/// Start another gc iteration.
|
||||
///
|
||||
/// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
|
||||
/// its ending, or if not currently possible, a value describing the reasons why not.
|
||||
///
|
||||
/// Cancellation safe.
|
||||
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
|
||||
let reasons = {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
|
||||
// tests, we use everything. we should warn if the gc has been consecutively blocked
|
||||
// for more than 1h (within single tenant session?).
|
||||
BlockingReasons::clean_and_summarize(g)
|
||||
};
|
||||
|
||||
if let Some(reasons) = reasons {
|
||||
Err(reasons)
|
||||
} else {
|
||||
Ok(Guard {
|
||||
_inner: self.blocking.lock().await,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
BlockingReasons::summarize(&g)
|
||||
}
|
||||
|
||||
/// Start blocking gc for this one timeline for the given reason.
|
||||
///
|
||||
/// This is not a guard-based API; instead it mimics a set API. The returned future will not
|
||||
/// resolve until an existing gc round has completed.
|
||||
///
|
||||
/// Returns true if this block was new, false if gc was already blocked for this reason.
|
||||
///
|
||||
/// Cancellation safe: cancelling after the first poll still leaves the gc blocking reason in place.
|
||||
pub(crate) async fn insert(
|
||||
&self,
|
||||
timeline: &super::Timeline,
|
||||
reason: GcBlockingReason,
|
||||
) -> anyhow::Result<bool> {
|
||||
let (added, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
let set = g.entry(timeline.timeline_id).or_default();
|
||||
let added = set.insert(reason);
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock, see self.reasons.
|
||||
let uploaded = timeline
|
||||
.remote_client
|
||||
.schedule_insert_gc_block_reason(reason)?;
|
||||
|
||||
(added, uploaded)
|
||||
};
|
||||
|
||||
uploaded.await?;
|
||||
|
||||
// ensure that any ongoing gc iteration has completed
|
||||
drop(self.blocking.lock().await);
|
||||
|
||||
Ok(added)
|
||||
}
|
||||
|
||||
/// Remove blocking gc for this one timeline and the given reason.
|
||||
pub(crate) async fn remove(
|
||||
&self,
|
||||
timeline: &super::Timeline,
|
||||
reason: GcBlockingReason,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let (remaining_blocks, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
match g.entry(timeline.timeline_id) {
|
||||
Entry::Occupied(mut oe) => {
|
||||
let set = oe.get_mut();
|
||||
set.remove(reason);
|
||||
if set.is_empty() {
|
||||
oe.remove();
|
||||
}
|
||||
}
|
||||
Entry::Vacant(_) => {
|
||||
// we must still do the index_part.json update regardless, in case we had earlier
|
||||
// been cancelled
|
||||
}
|
||||
}
|
||||
|
||||
let remaining_blocks = g.len();
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
|
||||
let uploaded = timeline
|
||||
.remote_client
|
||||
.schedule_remove_gc_block_reason(reason)?;
|
||||
|
||||
(remaining_blocks, uploaded)
|
||||
};
|
||||
uploaded.await?;
|
||||
|
||||
// no need to synchronize with gc iteration again
|
||||
|
||||
if remaining_blocks > 0 {
|
||||
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
|
||||
} else {
|
||||
tracing::info!("gc is now unblocked for the tenant");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
|
||||
let unblocked = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
if g.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
g.remove(&timeline.timeline_id);
|
||||
|
||||
BlockingReasons::clean_and_summarize(g).is_none()
|
||||
};
|
||||
|
||||
if unblocked {
|
||||
tracing::info!("gc is now unblocked following deletion");
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize with the non-deleted timelines of this tenant.
|
||||
pub(crate) fn set_scanned(&self, scanned: Storage) {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
assert!(g.is_empty());
|
||||
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
|
||||
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
|
||||
tracing::info!(summary=?reasons, "initialized with gc blocked");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct Guard<'a> {
|
||||
_inner: tokio::sync::MutexGuard<'a, ()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct BlockingReasons {
|
||||
timelines: usize,
|
||||
reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BlockingReasons {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{} timelines block for {:?}",
|
||||
self.timelines, self.reasons
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockingReasons {
|
||||
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let mut reasons = enumset::EnumSet::empty();
|
||||
g.retain(|_key, value| {
|
||||
reasons = reasons.union(*value);
|
||||
!value.is_empty()
|
||||
});
|
||||
if !g.is_empty() {
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
if g.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let reasons = g
|
||||
.values()
|
||||
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
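The new module is self-contained, so a short usage sketch may help. This is illustrative only: it assumes a `Tenant` that exposes the `GcBlock` as `tenant.gc_block` and a `&Timeline` handle, as in the call sites elsewhere in this change set; `do_detach_work` is a hypothetical placeholder.

// Hedged sketch, not code from this PR: pairing insert/remove around a detach-ancestor step.
async fn detach_with_gc_blocked(tenant: &Tenant, timeline: &Timeline) -> anyhow::Result<()> {
    // Persists the reason via index_part.json and waits out any in-flight gc iteration.
    let newly_blocked = tenant
        .gc_block
        .insert(timeline, GcBlockingReason::DetachAncestor)
        .await?;
    tracing::info!(newly_blocked, "gc blocked for detach ancestor");

    do_detach_work(timeline).await?; // hypothetical work done while gc is blocked

    // Schedules and awaits the index upload that clears the reason again.
    tenant
        .gc_block
        .remove(timeline, GcBlockingReason::DetachAncestor)
        .await
}

// The gc loop itself gates each iteration on start():
//   match tenant.gc_block.start().await {
//       Ok(_guard) => { /* run gc while holding the guard */ }
//       Err(reasons) => tracing::info!(%reasons, "gc iteration skipped"),
//   }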
|
||||
@@ -846,8 +846,8 @@ impl LayerMap {
     }

     /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
-        self.l0_delta_layers.to_vec()
+    pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
+        &self.l0_delta_layers
     }

     /// debugging function to print out the contents of the layer map
@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::Duration;
@@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 }

 /// See [`Self::spawn`].
-#[derive(Clone)]
-pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
-enum BackgroundPurgesInner {
-    Open(tokio::task::JoinSet<()>),
-    // we use the async mutex for coalescing
-    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
-}
-
-impl Default for BackgroundPurges {
-    fn default() -> Self {
-        Self(Arc::new(std::sync::Mutex::new(
-            BackgroundPurgesInner::Open(JoinSet::new()),
-        )))
-    }
-}
+#[derive(Clone, Default)]
+pub struct BackgroundPurges(tokio_util::task::TaskTracker);

 impl BackgroundPurges {
     /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in

@@ -247,24 +234,32 @@ impl BackgroundPurges {
     /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
     /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
     pub fn spawn(&self, tmp_path: Utf8PathBuf) {
-        let mut guard = self.0.lock().unwrap();
-        let jset = match &mut *guard {
-            BackgroundPurgesInner::Open(ref mut jset) => jset,
-            BackgroundPurgesInner::ShuttingDown(_) => {
-                warn!("trying to spawn background purge during shutdown, ignoring");
-                return;
-            }
-        };
-        jset.spawn_on(
-            async move {
-                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
-                    // should we fatal_io_error here?
-                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
-                }
-            }
-            .instrument(info_span!(parent: None, "background_purge")),
-            BACKGROUND_RUNTIME.handle(),
-        );
+        // because on shutdown we close and wait, we are misusing TaskTracker a bit.
+        //
+        // so first acquire a token, then check if the tracker has been closed. the tracker might get closed
+        // right after, but at least the shutdown will wait for what we are spawning next.
+        let token = self.0.token();
+
+        if self.0.is_closed() {
+            warn!(
+                %tmp_path,
+                "trying to spawn background purge during shutdown, ignoring"
+            );
+            return;
+        }
+
+        let span = info_span!(parent: None, "background_purge", %tmp_path);
+
+        let task = move || {
+            let _token = token;
+            let _entered = span.entered();
+            if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
+                // should we fatal_io_error here?
+                warn!(%error, "failed to purge tenant directory");
+            }
+        };
+
+        BACKGROUND_RUNTIME.spawn_blocking(task);
     }

     /// When this future completes, all background purges have completed.
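The token-before-is_closed ordering in the comments above is the crux of reusing TaskTracker for a close-then-wait shutdown. A minimal standalone sketch of the same pattern, assuming only the tokio and tokio_util crates (the cleanup body is a placeholder, not the pageserver's):

// Sketch of the TaskTracker pattern, outside the pageserver codebase.
use tokio_util::task::TaskTracker;

async fn spawn_then_shutdown() {
    let tracker = TaskTracker::new();

    // Spawner side: take a token *before* checking is_closed(); if close() races with us,
    // wait() below still blocks until this token is dropped by the spawned task.
    let token = tracker.token();
    if tracker.is_closed() {
        drop(token);
        return; // shutdown already started, refuse new work
    }
    tokio::task::spawn_blocking(move || {
        let _token = token; // held for the duration of the blocking work
        // e.g. remove a temporary tenant directory here
    });

    // Shutdown side: forbid new tasks (idempotent), then wait for all outstanding tasks.
    tracker.close();
    tracker.wait().await;
}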
@@ -278,42 +273,9 @@ impl BackgroundPurges {
     /// instances of this future will continue to be correct.
     #[instrument(skip_all)]
     pub async fn shutdown(&self) {
-        let jset = {
-            let mut guard = self.0.lock().unwrap();
-            match &mut *guard {
-                BackgroundPurgesInner::Open(jset) => {
-                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
-                        std::mem::take(jset),
-                    )))
-                }
-                BackgroundPurgesInner::ShuttingDown(_) => {
-                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
-                    warn!("already shutting down");
-                }
-            };
-            match &mut *guard {
-                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
-                BackgroundPurgesInner::Open(_) => {
-                    unreachable!("above code transitions into shut down state");
-                }
-            }
-        };
-        let mut jset = jset.lock().await; // concurrent callers coalesce here
-        while let Some(res) = jset.join_next().await {
-            match res {
-                Ok(()) => {}
-                Err(e) if e.is_panic() => {
-                    // If it panicked, the error is already logged by the panic hook.
-                }
-                Err(e) if e.is_cancelled() => {
-                    unreachable!("we don't cancel the joinset or runtime")
-                }
-                Err(e) => {
-                    // No idea when this can happen, but let's log it.
-                    warn!(%e, "background purge task failed or panicked");
-                }
-            }
-        }
+        // forbid new tasks (can be called many times)
+        self.0.close();
+        self.0.wait().await;
     }
 }
@@ -1767,14 +1729,9 @@ impl TenantManager {
|
||||
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
|
||||
for timeline in timelines.values() {
|
||||
tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
|
||||
let timeline_layers = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.collect::<Vec<_>>();
|
||||
let layers = timeline.layers.read().await;
|
||||
|
||||
for layer in timeline_layers {
|
||||
for layer in layers.likely_resident_layers() {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
.strip_prefix(&parent_path)
|
||||
@@ -1971,7 +1928,8 @@ impl TenantManager {
|
||||
timeline_id: TimelineId,
|
||||
prepared: PreparedTimelineDetach,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<TimelineId>, anyhow::Error> {
|
||||
) -> Result<HashSet<TimelineId>, anyhow::Error> {
|
||||
// FIXME: this is unnecessary, slotguard already has these semantics
|
||||
struct RevertOnDropSlot(Option<SlotGuard>);
|
||||
|
||||
impl Drop for RevertOnDropSlot {
|
||||
|
||||
@@ -800,6 +800,123 @@ impl RemoteTimelineClient {
|
||||
.context("wait completion")
|
||||
}
|
||||
|
||||
/// Adds a gc blocking reason for this timeline if one does not exist already.
|
||||
///
|
||||
/// A retryable step of timeline detach ancestor.
|
||||
///
|
||||
/// Returns a future which waits until the completion of the upload.
|
||||
pub(crate) fn schedule_insert_gc_block_reason(
|
||||
self: &Arc<Self>,
|
||||
reason: index::GcBlockingReason,
|
||||
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
|
||||
{
|
||||
let maybe_barrier = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if let index::GcBlockingReason::DetachAncestor = reason {
|
||||
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
|
||||
drop(guard);
|
||||
panic!("cannot start detach ancestor if there is nothing to detach from");
|
||||
}
|
||||
}
|
||||
|
||||
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
|
||||
|
||||
let current = upload_queue.dirty.gc_blocking.as_ref();
|
||||
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
|
||||
|
||||
match (current, uploaded) {
|
||||
(x, y) if wanted(x) && wanted(y) => None,
|
||||
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
|
||||
// Usual case: !wanted(x) && !wanted(y)
|
||||
//
|
||||
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
|
||||
// turn on and off some reason.
|
||||
(x, y) => {
|
||||
if !wanted(x) && wanted(y) {
|
||||
// this could be avoided by having external in-memory synchronization, like
|
||||
// timeline detach ancestor
|
||||
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
|
||||
}
|
||||
|
||||
// at this point, the metadata must always show that there is a parent
|
||||
upload_queue.dirty.gc_blocking = current
|
||||
.map(|x| x.with_reason(reason))
|
||||
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(async move {
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}
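The `(current, uploaded)` match above is a small decision table over the dirty (scheduled) and clean (last uploaded) `index_part.json` states. Reduced to booleans, the logic is roughly the following sketch (simplified, not the real types):

// Sketch only: "has_reason" means gc_blocking already contains the reason being inserted.
fn insert_action(dirty_has_reason: bool, clean_has_reason: bool) -> &'static str {
    match (dirty_has_reason, clean_has_reason) {
        (true, true) => "no-op: reason already uploaded",
        (true, false) => "already scheduled: only wait for the in-flight upload (barrier)",
        (false, _) => "add reason to dirty state, schedule index upload, then wait (barrier)",
    }
}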
|
||||
|
||||
/// Removes a gc blocking reason for this timeline if one exists.
|
||||
///
|
||||
/// A retryable step of timeline detach ancestor.
|
||||
///
|
||||
/// Returns a future which waits until the completion of the upload.
|
||||
pub(crate) fn schedule_remove_gc_block_reason(
|
||||
self: &Arc<Self>,
|
||||
reason: index::GcBlockingReason,
|
||||
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
|
||||
{
|
||||
let maybe_barrier = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if let index::GcBlockingReason::DetachAncestor = reason {
|
||||
if !upload_queue
|
||||
.clean
|
||||
.0
|
||||
.lineage
|
||||
.is_detached_from_original_ancestor()
|
||||
{
|
||||
drop(guard);
|
||||
panic!("cannot complete timeline_ancestor_detach while not detached");
|
||||
}
|
||||
}
|
||||
|
||||
let wanted = |x: Option<&index::GcBlocking>| {
|
||||
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
|
||||
};
|
||||
|
||||
let current = upload_queue.dirty.gc_blocking.as_ref();
|
||||
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
|
||||
|
||||
match (current, uploaded) {
|
||||
(x, y) if wanted(x) && wanted(y) => None,
|
||||
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
|
||||
(x, y) => {
|
||||
if !wanted(x) && wanted(y) {
|
||||
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
|
||||
}
|
||||
|
||||
upload_queue.dirty.gc_blocking =
|
||||
current.as_ref().and_then(|x| x.without_reason(reason));
|
||||
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
|
||||
// FIXME: bogus ?
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(async move {
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
/// Launch an upload operation in the background; the file is added to be included in next
|
||||
/// `index_part.json` upload.
|
||||
pub(crate) fn schedule_layer_file_upload(
|
||||
|
||||
@@ -60,6 +60,9 @@ pub struct IndexPart {
|
||||
#[serde(default)]
|
||||
pub(crate) lineage: Lineage,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) gc_blocking: Option<GcBlocking>,
|
||||
|
||||
/// Describes the kind of aux files stored in the timeline.
|
||||
///
|
||||
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
|
||||
@@ -85,10 +88,11 @@ impl IndexPart {
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
/// - 8: added `archived_at`
|
||||
const LATEST_VERSION: usize = 8;
|
||||
/// - 9: +gc_blocking
|
||||
const LATEST_VERSION: usize = 9;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
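Bumping `LATEST_VERSION` while appending to `KNOWN_VERSIONS` keeps older `index_part.json` files readable. A reader-side check would look roughly like this sketch (the real validation lives in the IndexPart deserialization code, not shown here):

// Hypothetical helper, mirroring the constants above.
fn ensure_known_version(v: usize) -> anyhow::Result<()> {
    const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
    anyhow::ensure!(KNOWN_VERSIONS.contains(&v), "unknown index_part.json version: {}", v);
    Ok(())
}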
@@ -101,6 +105,7 @@ impl IndexPart {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
}
|
||||
}
|
||||
@@ -251,6 +256,64 @@ impl Lineage {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub(crate) struct GcBlocking {
|
||||
pub(crate) started_at: NaiveDateTime,
|
||||
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
|
||||
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
|
||||
#[enumset(serialize_repr = "list")]
|
||||
pub(crate) enum GcBlockingReason {
|
||||
Manual,
|
||||
DetachAncestor,
|
||||
}
|
||||
|
||||
impl GcBlocking {
|
||||
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
|
||||
GcBlocking {
|
||||
started_at: chrono::Utc::now().naive_utc(),
|
||||
reasons: enumset::EnumSet::only(reason),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the given reason is one of the reasons why the gc is blocked.
|
||||
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
|
||||
self.reasons.contains(reason)
|
||||
}
|
||||
|
||||
/// Returns a version of self with the given reason.
|
||||
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
|
||||
assert!(!self.blocked_by(reason));
|
||||
let mut reasons = self.reasons;
|
||||
reasons.insert(reason);
|
||||
|
||||
Self {
|
||||
started_at: self.started_at,
|
||||
reasons,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a version of self without the given reason. Assumption is that if
|
||||
/// there are no more reasons, we can unblock the gc by returning `None`.
|
||||
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
|
||||
assert!(self.blocked_by(reason));
|
||||
|
||||
if self.reasons.len() == 1 {
|
||||
None
|
||||
} else {
|
||||
let mut reasons = self.reasons;
|
||||
assert!(reasons.remove(reason));
|
||||
assert!(!reasons.is_empty());
|
||||
|
||||
Some(Self {
|
||||
started_at: self.started_at,
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
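Because the enum uses `#[enumset(serialize_repr = "list")]`, the reasons set should round-trip as a JSON array of variant names, which is what the v9 fixture below contains. A small illustrative check, assuming the `enumset` and `serde_json` crates behave as described:

// Sketch: GcBlockingReason refers to the enum defined above.
fn reasons_serialize_as_list() {
    let reasons = enumset::EnumSet::only(GcBlockingReason::DetachAncestor);
    assert_eq!(
        serde_json::to_string(&reasons).unwrap(),
        r#"["DetachAncestor"]"#
    );
}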
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -292,6 +355,7 @@ mod tests {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -335,6 +399,7 @@ mod tests {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -379,6 +444,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -426,6 +492,7 @@ mod tests {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -468,6 +535,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -513,6 +581,7 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -563,6 +632,7 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
};
|
||||
|
||||
@@ -618,6 +688,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
@@ -674,6 +745,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
@@ -681,6 +753,68 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v9_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 9,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
}
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 9,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
|
||||
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
|
||||
}
|
||||
|
||||
@@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
|
||||
id::TimelineId, serde_system_time,
|
||||
id::TimelineId, pausable_failpoint, serde_system_time,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
layer: HeatMapLayer,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<HeatMapLayer>, UpdateError> {
|
||||
// Failpoint for simulating slow remote storage
|
||||
// Failpoints for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"secondary-layer-download-sleep",
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
pausable_failpoint!("secondary-layer-download-pausable");
|
||||
|
||||
let local_path = local_layer_path(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
|
||||
@@ -435,21 +435,6 @@ impl ReadableLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value from [`Layer::get_value_reconstruct_data`]
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue,
|
||||
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
|
||||
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
|
||||
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
|
||||
@@ -554,19 +539,25 @@ impl LayerAccessStats {
|
||||
self.record_residence_event_at(SystemTime::now())
|
||||
}
|
||||
|
||||
pub(crate) fn record_access_at(&self, now: SystemTime) {
|
||||
fn record_access_at(&self, now: SystemTime) -> bool {
|
||||
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
|
||||
|
||||
// A layer which is accessed must be visible.
|
||||
mask |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
value |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
|
||||
self.write_bits(mask, value);
|
||||
let old_bits = self.write_bits(mask, value);
|
||||
!matches!(
|
||||
self.decode_visibility(old_bits),
|
||||
LayerVisibilityHint::Visible
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) {
|
||||
/// Returns true if we modified the layer's visibility to set it to Visible implicitly
|
||||
/// as a result of this access
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
self.record_access_at(SystemTime::now())
|
||||
|
||||
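The change to `record_access_at` relies on `write_bits` returning the previous packed word, so a single atomic update can both stamp the access time and report whether the visibility bit flipped from not-visible to visible. A simplified sketch of that shape, with a made-up shift and without the real low-resolution timestamp encoding:

// Sketch only: one AtomicU64 holding [ atime bits | visibility bit ].
use std::sync::atomic::{AtomicU64, Ordering};

const VISIBILITY_SHIFT: u32 = 0; // illustrative position, not the real layout

struct Stats(AtomicU64);

impl Stats {
    /// Returns the previous value of the packed word, like `write_bits` in the real code.
    fn write_bits(&self, mask: u64, value: u64) -> u64 {
        self.0
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |old| {
                Some((old & !mask) | value)
            })
            .unwrap()
    }

    /// Mirrors record_access_at: stamp the atime bits and force the visibility bit on,
    /// reporting whether the layer was previously not visible.
    fn record_access(&self, atime_mask: u64, atime_value: u64) -> bool {
        let mask = atime_mask | (1u64 << VISIBILITY_SHIFT);
        let value = atime_value | (1u64 << VISIBILITY_SHIFT);
        let old = self.write_bits(mask, value);
        (old >> VISIBILITY_SHIFT) & 1 == 0
    }
}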
@@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
@@ -72,10 +71,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -200,7 +196,6 @@ impl DeltaKey {
|
||||
pub struct DeltaLayer {
|
||||
path: Utf8PathBuf,
|
||||
pub desc: PersistentLayerDesc,
|
||||
access_stats: LayerAccessStats,
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
}
|
||||
|
||||
@@ -299,7 +294,6 @@ impl DeltaLayer {
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(ctx);
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
@@ -350,7 +344,6 @@ impl DeltaLayer {
|
||||
summary.lsn_range,
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: Default::default(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -373,7 +366,6 @@ impl DeltaLayer {
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
struct DeltaLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
pub path: Utf8PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -384,6 +376,9 @@ struct DeltaLayerWriterInner {
|
||||
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
||||
|
||||
blob_writer: BlobWriter<true>,
|
||||
|
||||
// Number of key-lsns in the layer.
|
||||
num_keys: usize,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriterInner {
|
||||
@@ -417,7 +412,6 @@ impl DeltaLayerWriterInner {
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
@@ -425,6 +419,7 @@ impl DeltaLayerWriterInner {
|
||||
lsn_range,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
num_keys: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -475,6 +470,9 @@ impl DeltaLayerWriterInner {
|
||||
|
||||
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
||||
let res = self.tree.append(&delta_key.0, blob_ref.0);
|
||||
|
||||
self.num_keys += 1;
|
||||
|
||||
(val, res.map_err(|e| anyhow::anyhow!(e)))
|
||||
}
|
||||
|
||||
@@ -488,11 +486,10 @@ impl DeltaLayerWriterInner {
|
||||
async fn finish(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
let temp_path = self.path.clone();
|
||||
let result = self.finish0(key_end, timeline, ctx).await;
|
||||
let result = self.finish0(key_end, ctx).await;
|
||||
if result.is_err() {
|
||||
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
@@ -505,9 +502,8 @@ impl DeltaLayerWriterInner {
|
||||
async fn finish0(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -572,11 +568,9 @@ impl DeltaLayerWriterInner {
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
trace!("created delta layer {}", self.path);
|
||||
|
||||
trace!("created delta layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
Ok((desc, self.path))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -677,14 +671,20 @@ impl DeltaLayerWriter {
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(key_end, timeline, ctx)
|
||||
.await
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
self.inner.take().unwrap().finish(key_end, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
}
|
||||
|
||||
@@ -808,95 +808,6 @@ impl DeltaLayerInner {
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
&block_reader,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
&search_key.0,
|
||||
VisitDirection::Backwards,
|
||||
|key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor
|
||||
.read_blob_into_buf(pos, &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
@@ -1669,8 +1580,9 @@ pub(crate) mod test {
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
@@ -1964,9 +1876,8 @@ pub(crate) mod test {
|
||||
res?;
|
||||
}
|
||||
|
||||
let resident = writer
|
||||
.finish(entries_meta.key_range.end, &timeline, &ctx)
|
||||
.await?;
|
||||
let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
|
||||
let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
|
||||
|
||||
let inner = resident.get_as_delta(&ctx).await?;
|
||||
|
||||
@@ -2046,6 +1957,7 @@ pub(crate) mod test {
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.next()
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
@@ -2120,7 +2032,8 @@ pub(crate) mod test {
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.find(|x| x != &initdb_layer)
|
||||
.find(|&x| x != &initdb_layer)
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
// create a copy for the timeline, so we don't overwrite the file
|
||||
@@ -2155,7 +2068,8 @@ pub(crate) mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
|
||||
let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
|
||||
let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
|
||||
|
||||
copied_layer.get_as_delta(ctx).await.unwrap();
|
||||
|
||||
@@ -2283,7 +2197,9 @@ pub(crate) mod test {
|
||||
for (key, lsn, value) in deltas {
|
||||
writer.put_value(key, lsn, value, ctx).await?;
|
||||
}
|
||||
let delta_layer = writer.finish(key_end, tline, ctx).await?;
|
||||
|
||||
let (desc, path) = writer.finish(key_end, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
|
||||
|
||||
Ok::<_, anyhow::Error>(delta_layer)
|
||||
}
|
||||
|
||||
@@ -32,9 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
@@ -137,7 +134,6 @@ pub struct ImageLayer {
|
||||
pub desc: PersistentLayerDesc,
|
||||
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
|
||||
pub lsn: Lsn,
|
||||
access_stats: LayerAccessStats,
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
}
|
||||
|
||||
@@ -255,7 +251,6 @@ impl ImageLayer {
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
.await
|
||||
@@ -306,7 +301,6 @@ impl ImageLayer {
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: Default::default(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -429,46 +423,6 @@ impl ImageLayerInner {
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader
|
||||
.get(
|
||||
&keybuf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?
|
||||
{
|
||||
let blob = block_reader
|
||||
.block_cursor()
|
||||
.read_blob(
|
||||
offset,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerValue)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
@@ -753,6 +707,10 @@ struct ImageLayerWriterInner {
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
fn size(&self) -> u64 {
|
||||
self.tree.borrow_writer().size() + self.blob_writer.size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
@@ -1044,6 +1002,10 @@ impl ImageLayerWriter {
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
|
||||
@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -34,8 +34,7 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
|
||||
ValuesReconstructState,
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
@@ -55,9 +54,6 @@ pub struct InMemoryLayer {
|
||||
/// Writes are only allowed when this is `None`.
|
||||
pub(crate) end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
|
||||
local_path_str: Arc<str>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer after frozen.
|
||||
frozen_local_path_str: OnceLock<Arc<str>>,
|
||||
|
||||
@@ -248,12 +244,6 @@ impl InMemoryLayer {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
self.frozen_local_path_str
|
||||
.get()
|
||||
.unwrap_or(&self.local_path_str)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completely unused
|
||||
@@ -303,60 +293,6 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos, &ctx).await?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
@@ -449,20 +385,17 @@ impl InMemoryLayer {
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_lsn: Lsn,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
|
||||
let file =
|
||||
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
|
||||
let key = InMemoryLayerFileId(file.page_cache_file_id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
local_path_str: {
|
||||
let mut buf = String::new();
|
||||
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
|
||||
buf.into()
|
||||
},
|
||||
frozen_local_path_str: OnceLock::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
@@ -482,8 +415,7 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
|
||||
pub(crate) async fn put_value(
|
||||
pub async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -548,8 +480,6 @@ impl InMemoryLayer {
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
"{} >= {}",
|
||||
@@ -567,9 +497,13 @@ impl InMemoryLayer {
|
||||
})
|
||||
.expect("frozen_local_path_str set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
let inner = self.inner.write().await;
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -579,12 +513,12 @@ impl InMemoryLayer {
|
||||
/// if there are no matching keys.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub(crate) async fn write_to_disk(
|
||||
pub async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
key_range: Option<Range<Key>>,
|
||||
) -> Result<Option<ResidentLayer>> {
|
||||
l0_flush_global_state: &l0_flush::Inner,
|
||||
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -596,9 +530,8 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match &*l0_flush_global_state {
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
@@ -628,7 +561,7 @@ impl InMemoryLayer {
|
||||
)
|
||||
.await?;
|
||||
|
||||
match &*l0_flush_global_state {
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
@@ -693,7 +626,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
|
||||
let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
|
||||
|
||||
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
|
||||
//
|
||||
@@ -705,6 +638,6 @@ impl InMemoryLayer {
|
||||
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
|
||||
drop(_concurrency_permit);
|
||||
|
||||
Ok(Some(delta_layer))
|
||||
Ok(Some((desc, path)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
|
||||
use super::image_layer::{self};
|
||||
use super::{
|
||||
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
||||
LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
|
||||
ValuesReconstructState,
|
||||
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
|
||||
};
|
||||
|
||||
use utils::generation::Generation;
|
||||
@@ -301,42 +300,6 @@ impl Layer {
|
||||
self.0.delete_on_drop();
|
||||
}
|
||||
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
/// It is up to the caller to collect more data from the previous layer and
|
||||
/// perform WAL redo, if necessary.
|
||||
///
|
||||
/// # Cancellation-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use anyhow::ensure;
|
||||
|
||||
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
|
||||
self.0.access_stats.record_access(ctx);
|
||||
|
||||
if self.layer_desc().is_delta {
|
||||
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
|
||||
ensure!(self.layer_desc().key_range.contains(&key));
|
||||
} else {
|
||||
ensure!(self.layer_desc().key_range.contains(&key));
|
||||
ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
|
||||
ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
|
||||
}
|
||||
|
||||
layer
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
|
||||
.await
|
||||
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -353,7 +316,7 @@ impl Layer {
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
self.0.access_stats.record_access(ctx);
|
||||
self.record_access(ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
@@ -433,18 +396,18 @@ impl Layer {
|
||||
self.0.info(reset)
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.0.access_stats
|
||||
pub(crate) fn latest_activity(&self) -> SystemTime {
|
||||
self.0.access_stats.latest_activity()
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
self.0.access_stats.visibility()
|
||||
}
|
||||
|
||||
pub(crate) fn local_path(&self) -> &Utf8Path {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn debug_str(&self) -> &Arc<str> {
|
||||
&self.0.debug_str
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.0.metadata()
|
||||
}
|
||||
@@ -488,13 +451,31 @@ impl Layer {
|
||||
}
|
||||
}
|
||||
|
||||
fn record_access(&self, ctx: &RequestContext) {
|
||||
if self.0.access_stats.record_access(ctx) {
|
||||
// Visibility was modified to Visible
|
||||
tracing::info!(
|
||||
"Layer {} became visible as a result of access",
|
||||
self.0.desc.key()
|
||||
);
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
|
||||
let old_visibility = self.access_stats().set_visibility(visibility.clone());
|
||||
let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
|
||||
use LayerVisibilityHint::*;
|
||||
match (old_visibility, visibility) {
|
||||
(Visible, Covered) => {
|
||||
// Subtract this layer's contribution to the visible size metric
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
debug_assert!(
|
||||
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
|
||||
);
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.sub(self.0.desc.file_size)
|
||||
@@ -519,7 +500,7 @@ impl Layer {
|
||||
///
|
||||
/// However when we want something evicted, we cannot evict it right away as there might be current
|
||||
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
|
||||
/// read with [`Layer::get_value_reconstruct_data`].
|
||||
/// read with [`Layer::get_values_reconstruct_data`].
|
||||
///
|
||||
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
|
||||
#[derive(Debug)]
|
||||
@@ -600,9 +581,6 @@ struct LayerInner {
|
||||
/// Full path to the file; unclear if this should exist anymore.
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// String representation of the layer, used for traversal id.
|
||||
debug_str: Arc<str>,
|
||||
|
||||
desc: PersistentLayerDesc,
|
||||
|
||||
/// Timeline access is needed for remote timeline client and metrics.
|
||||
@@ -715,6 +693,9 @@ impl Drop for LayerInner {
|
||||
}
|
||||
|
||||
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
|
||||
debug_assert!(
|
||||
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
|
||||
);
|
||||
timeline
|
||||
.metrics
|
||||
.visible_physical_size_gauge
|
||||
@@ -836,9 +817,6 @@ impl LayerInner {
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
debug_str: {
|
||||
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
|
||||
},
|
||||
path: local_path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
@@ -1759,28 +1737,6 @@ impl DownloadedLayer {
|
||||
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await? {
|
||||
Delta(d) => {
|
||||
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -1879,7 +1835,7 @@ impl ResidentLayer {
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
owner.access_stats.record_access(ctx);
|
||||
self.owner.record_access(ctx);
|
||||
|
||||
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||
.await
|
||||
|
||||
@@ -39,7 +39,7 @@ async fn smoke_test() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -50,13 +50,26 @@ async fn smoke_test() {
|
||||
// all layers created at pageserver are like `layer`, initialized with strong
|
||||
// Arc<DownloadedLayer>.
|
||||
|
||||
let controlfile_keyspace = KeySpace {
|
||||
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
|
||||
};
|
||||
|
||||
let img_before = {
|
||||
let mut data = ValueReconstructState::default();
|
||||
let mut data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
data.img
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
@@ -74,13 +87,24 @@ async fn smoke_test() {
|
||||
|
||||
// on accesses when the layer is evicted, it will automatically be downloaded.
|
||||
let img_after = {
|
||||
let mut data = ValueReconstructState::default();
|
||||
let mut data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(download_span.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
data.img.take().unwrap()
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
|
||||
assert_eq!(img_before, img_after);
|
||||
@@ -152,7 +176,7 @@ async fn smoke_test() {
|
||||
{
|
||||
let layers = &[layer];
|
||||
let mut g = timeline.layers.write().await;
|
||||
g.finish_gc_timeline(layers);
|
||||
g.open_mut().unwrap().finish_gc_timeline(layers);
|
||||
// this just updates the remote_physical_size for demonstration purposes
|
||||
rtc.schedule_gc_update(layers).unwrap();
|
||||
}
|
||||
@@ -192,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -236,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
// the deletion of the layer in remote_storage happens.
|
||||
{
|
||||
let mut layers = timeline.layers.write().await;
|
||||
layers.finish_gc_timeline(&[layer]);
|
||||
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
|
||||
}
|
||||
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
@@ -277,7 +301,7 @@ fn read_wins_pending_eviction() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -409,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -578,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -658,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -777,9 +801,9 @@ async fn eviction_cancellation_on_drop() {
|
||||
let (evicted_layer, not_evicted) = {
|
||||
let mut layers = {
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
|
||||
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
|
||||
// remove the layers from layermap
|
||||
guard.finish_gc_timeline(&layers);
|
||||
guard.open_mut().unwrap().finish_gc_timeline(&layers);
|
||||
|
||||
layers
|
||||
};
|
||||
@@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
fn layer_size() {
|
||||
assert_eq!(size_of::<LayerAccessStats>(), 8);
|
||||
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(size_of::<LayerInner>(), 312);
|
||||
assert_eq!(size_of::<LayerInner>(), 296);
|
||||
// it also has the utf8 path
|
||||
}
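Aside, not part of the diff: the `layer_size` assertions use `std::mem::size_of` as a cheap guard against accidental struct growth, and the 312 -> 296 change for `LayerInner` appears to correspond to dropping the 16-byte `Arc<str>` field `debug_str` removed in the struct hunk further up. A minimal, self-contained illustration of the pattern (types here are stand-ins, not the real structs):

fn main() {
    // A fat pointer such as Arc<str> occupies two words (pointer + length) on 64-bit targets.
    struct WithDebugStr {
        _desc: [u8; 32],
        _debug_str: std::sync::Arc<str>,
    }
    struct WithoutDebugStr {
        _desc: [u8; 32],
    }
    assert_eq!(
        std::mem::size_of::<WithDebugStr>() - std::mem::size_of::<WithoutDebugStr>(),
        16
    );
}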
|
||||
|
||||
|
||||
@@ -1,12 +1,13 @@
use std::sync::Arc;
use std::{ops::Range, sync::Arc};

use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};

use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};

use super::{ImageLayerWriter, ResidentLayer};
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};

/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
@@ -98,6 +99,111 @@ impl SplitImageLayerWriter {
|
||||
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
/// to be cleaned up).
#[must_use]
pub struct SplitDeltaLayerWriter {
inner: DeltaLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>,
}
|
||||
impl SplitDeltaLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
target_layer_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: DeltaLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_key,
|
||||
lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
generated_layers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers.push(delta_layer);
|
||||
}
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(delta_layer);
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
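The size check in `put_value` above is only an estimate, so the finished delta layers can end up slightly larger or smaller than the target. A minimal sketch of that roll-over heuristic, not part of the diff, with stand-in constants for `KEY_SIZE` and the per-value estimate:

fn should_start_new_layer(num_keys: usize, estimated_size: u64, target_layer_size: u64) -> bool {
    // Stand-ins: the real code uses pageserver_api::key::KEY_SIZE plus an 8-byte LSN
    // and an 80-byte rough value-size estimate.
    let addition_size_estimation = 18u64 + 8 + 80;
    num_keys >= 1 && estimated_size + addition_size_estimation >= target_layer_size
}

fn main() {
    // With a 4 KiB target, a writer already holding ~4 KiB rolls over before the next key...
    assert!(should_start_new_layer(10, 4000, 4096));
    // ...but an empty writer never does, so every produced layer holds at least one key.
    assert!(!should_start_new_layer(0, 4000, 4096));
}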
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -138,7 +244,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut writer = SplitImageLayerWriter::new(
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
@@ -150,11 +256,42 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
writer
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
|
||||
let layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
}
|
||||
|
||||
@@ -170,7 +307,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut writer = SplitImageLayerWriter::new(
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
@@ -181,26 +318,58 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
const N: usize = 2000;
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
writer
|
||||
image_writer
|
||||
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(i),
|
||||
Lsn(0x20),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
let layers = writer
|
||||
let image_layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), N / 512 + 1);
|
||||
for idx in 0..layers.len() {
|
||||
assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
layers[idx - 1].layer_desc().key_range.end,
|
||||
layers[idx].layer_desc().key_range.start
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -218,7 +387,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut writer = SplitImageLayerWriter::new(
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
@@ -230,15 +399,56 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
writer
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
image_writer
|
||||
.put_image(get_key(1), get_large_img(), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
|
||||
let layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(1),
|
||||
Lsn(0x1A),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);

error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
// Timeline was cancelled during gc. We might either be in an event
// that affects the entire tenant (tenant deletion, pageserver shutdown),
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}

wait_duration
}
}

File diff suppressed because it is too large
@@ -19,8 +19,10 @@ use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::KEY_SIZE;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
@@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -73,6 +76,7 @@ impl KeyHistoryRetention {
|
||||
key: Key,
|
||||
delta_writer: &mut Vec<(Key, Lsn, Value)>,
|
||||
mut image_writer: Option<&mut ImageLayerWriter>,
|
||||
stat: &mut CompactionStatistics,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
@@ -82,6 +86,7 @@ impl KeyHistoryRetention {
|
||||
let Value::Image(img) = &logs[0].1 else {
|
||||
unreachable!()
|
||||
};
|
||||
stat.produce_image_key(img);
|
||||
if let Some(image_writer) = image_writer.as_mut() {
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
} else {
|
||||
@@ -89,24 +94,111 @@ impl KeyHistoryRetention {
|
||||
}
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
first_batch = false;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
}
|
||||
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
|
||||
for (lsn, val) in above_horizon_logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
struct CompactionStatisticsNumSize {
|
||||
num: u64,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
pub struct CompactionStatistics {
|
||||
delta_layer_visited: CompactionStatisticsNumSize,
|
||||
image_layer_visited: CompactionStatisticsNumSize,
|
||||
delta_layer_produced: CompactionStatisticsNumSize,
|
||||
image_layer_produced: CompactionStatisticsNumSize,
|
||||
num_delta_layer_discarded: usize,
|
||||
num_image_layer_discarded: usize,
|
||||
num_unique_keys_visited: usize,
|
||||
wal_keys_visited: CompactionStatisticsNumSize,
|
||||
image_keys_visited: CompactionStatisticsNumSize,
|
||||
wal_produced: CompactionStatisticsNumSize,
|
||||
image_produced: CompactionStatisticsNumSize,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
fn estimated_size_of_value(val: &Value) -> usize {
|
||||
match val {
|
||||
Value::Image(img) => img.len(),
|
||||
Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
|
||||
_ => std::mem::size_of::<NeonWalRecord>(),
|
||||
}
|
||||
}
|
||||
fn estimated_size_of_key() -> usize {
|
||||
KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
|
||||
}
|
||||
fn visit_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_visited.num += 1;
|
||||
self.delta_layer_visited.size += size;
|
||||
}
|
||||
fn visit_image_layer(&mut self, size: u64) {
|
||||
self.image_layer_visited.num += 1;
|
||||
self.image_layer_visited.size += size;
|
||||
}
|
||||
fn on_unique_key_visited(&mut self) {
|
||||
self.num_unique_keys_visited += 1;
|
||||
}
|
||||
fn visit_wal_key(&mut self, val: &Value) {
|
||||
self.wal_keys_visited.num += 1;
|
||||
self.wal_keys_visited.size +=
|
||||
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn visit_image_key(&mut self, val: &Value) {
|
||||
self.image_keys_visited.num += 1;
|
||||
self.image_keys_visited.size +=
|
||||
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn produce_key(&mut self, val: &Value) {
|
||||
match val {
|
||||
Value::Image(img) => self.produce_image_key(img),
|
||||
Value::WalRecord(_) => self.produce_wal_key(val),
|
||||
}
|
||||
}
|
||||
fn produce_wal_key(&mut self, val: &Value) {
|
||||
self.wal_produced.num += 1;
|
||||
self.wal_produced.size +=
|
||||
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn produce_image_key(&mut self, val: &Bytes) {
|
||||
self.image_produced.num += 1;
|
||||
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn discard_delta_layer(&mut self) {
|
||||
self.num_delta_layer_discarded += 1;
|
||||
}
|
||||
fn discard_image_layer(&mut self) {
|
||||
self.num_image_layer_discarded += 1;
|
||||
}
|
||||
fn produce_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_produced.num += 1;
|
||||
self.delta_layer_produced.size += size;
|
||||
}
|
||||
fn produce_image_layer(&mut self, size: u64) {
|
||||
self.image_layer_produced.num += 1;
|
||||
self.image_layer_produced.size += size;
|
||||
}
|
||||
}
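For context, not part of the diff: `CompactionStatistics` derives `Serialize`, and the gc-compaction path below logs it with `serde_json::to_string(&stat)?`. A stand-alone sketch of that pattern, with made-up fields standing in for the real ones:

use serde::Serialize;

#[derive(Serialize, Default)]
struct StatsSketch {
    delta_layer_visited: u64,
    image_layer_visited: u64,
    image_produced_bytes: u64,
}

fn main() -> Result<(), serde_json::Error> {
    let stat = StatsSketch {
        delta_layer_visited: 3,
        image_layer_visited: 1,
        image_produced_bytes: 4096,
    };
    // Mirrors the `info!("gc-compaction statistics: {}", serde_json::to_string(&stat)?)` call below.
    println!("gc-compaction statistics: {}", serde_json::to_string(&stat)?);
    Ok(())
}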
|
||||
|
||||
impl Timeline {
|
||||
/// TODO: cancellation
|
||||
///
|
||||
@@ -118,12 +210,18 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, CompactionError> {
|
||||
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
|
||||
self.compact_with_gc(cancel, ctx)
|
||||
self.compact_with_gc(cancel, flags, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
if flags.contains(CompactFlags::DryRun) {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"dry-run mode is not supported for legacy compaction for now"
|
||||
)));
|
||||
}
|
||||
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
// 1. First, calculate the desired "partitioning" of the
|
||||
@@ -273,7 +371,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map().iter_historic_layers() {
|
||||
for layer_desc in layers.layer_map()?.iter_historic_layers() {
|
||||
let layer = layers.get_from_desc(&layer_desc);
|
||||
if layer.metadata().shard.shard_count == self.shard_identity.count {
|
||||
// This layer does not belong to a historic ancestor, no need to re-image it.
|
||||
@@ -451,7 +549,9 @@ impl Timeline {
|
||||
///
|
||||
/// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
|
||||
/// that we know won't be needed for reads.
|
||||
pub(super) async fn update_layer_visibility(&self) {
|
||||
pub(super) async fn update_layer_visibility(
|
||||
&self,
|
||||
) -> Result<(), super::layer_manager::Shutdown> {
|
||||
let head_lsn = self.get_last_record_lsn();
|
||||
|
||||
// We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
|
||||
@@ -459,7 +559,7 @@ impl Timeline {
|
||||
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
|
||||
// they will be subject to L0->L1 compaction in the near future.
|
||||
let layer_manager = self.layers.read().await;
|
||||
let layer_map = layer_manager.layer_map();
|
||||
let layer_map = layer_manager.layer_map()?;
|
||||
|
||||
let readable_points = {
|
||||
let children = self.gc_info.read().unwrap().retain_lsns.clone();
|
||||
@@ -482,6 +582,7 @@ impl Timeline {
|
||||
// TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
|
||||
// avoid assuming that everything at a branch point is visible.
|
||||
drop(covered);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
|
||||
@@ -535,12 +636,8 @@ impl Timeline {
|
||||
) -> Result<CompactLevel0Phase1Result, CompactionError> {
|
||||
stats.read_lock_held_spawn_blocking_startup_micros =
|
||||
stats.read_lock_acquisition_micros.till_now(); // set by caller
|
||||
let layers = guard.layer_map();
|
||||
let level0_deltas = layers.get_level0_deltas();
|
||||
let mut level0_deltas = level0_deltas
|
||||
.into_iter()
|
||||
.map(|x| guard.get_from_desc(&x))
|
||||
.collect_vec();
|
||||
let layers = guard.layer_map()?;
|
||||
let level0_deltas = layers.level0_deltas();
|
||||
stats.level0_deltas_count = Some(level0_deltas.len());
|
||||
|
||||
// Only compact if enough layers have accumulated.
|
||||
@@ -553,6 +650,11 @@ impl Timeline {
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
|
||||
let mut level0_deltas = level0_deltas
|
||||
.iter()
|
||||
.map(|x| guard.get_from_desc(x))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Gather the files to compact in this iteration.
|
||||
//
|
||||
// Start with the oldest Level 0 delta file, and collect any other
|
||||
@@ -1006,14 +1108,16 @@ impl Timeline {
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(
|
||||
writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next(), self, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
let (desc, path) = writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next(), ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
new_layers.push(new_delta);
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
@@ -1076,12 +1180,13 @@ impl Timeline {
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(
|
||||
writer
|
||||
.finish(prev_key.unwrap().next(), self, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
let (desc, path) = writer
|
||||
.finish(prev_key.unwrap().next(), ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
new_layers.push(new_delta);
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
@@ -1306,10 +1411,9 @@ impl Timeline {
|
||||
// Find the top of the historical layers
|
||||
let end_lsn = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
let l0_deltas = layers.get_level0_deltas();
|
||||
drop(guard);
|
||||
let l0_deltas = layers.level0_deltas();
|
||||
|
||||
// As an optimization, if we find that there are too few L0 layers,
|
||||
// bail out early. We know that the compaction algorithm would do
|
||||
@@ -1641,6 +1745,7 @@ impl Timeline {
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::BTreeSet;
|
||||
@@ -1664,19 +1769,23 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
info!("running enhanced gc bottom-most compaction");
|
||||
let dry_run = flags.contains(CompactFlags::DryRun);
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
|
||||
scopeguard::defer! {
|
||||
info!("done enhanced gc bottom-most compaction");
|
||||
};
|
||||
|
||||
let mut stat = CompactionStatistics::default();
|
||||
|
||||
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
let gc_cutoff = gc_info.cutoffs.select_min();
|
||||
@@ -1740,6 +1849,9 @@ impl Timeline {
|
||||
let key_range = desc.get_key_range();
|
||||
delta_split_points.insert(key_range.start);
|
||||
delta_split_points.insert(key_range.end);
|
||||
stat.visit_delta_layer(desc.file_size());
|
||||
} else {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
let mut delta_layers = Vec::new();
|
||||
@@ -1775,6 +1887,8 @@ impl Timeline {
|
||||
tline: &Arc<Timeline>,
|
||||
lowest_retain_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
stats: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
last_batch: bool,
|
||||
) -> anyhow::Result<Option<FlushDeltaResult>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
@@ -1831,6 +1945,7 @@ impl Timeline {
|
||||
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == tline.generation {
|
||||
stats.discard_delta_layer();
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
@@ -1857,9 +1972,16 @@ impl Timeline {
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
let delta_layer = delta_layer_writer
|
||||
.finish(delta_key.key_range.end, tline, ctx)
|
||||
|
||||
stats.produce_delta_layer(delta_layer_writer.size());
|
||||
if dry_run {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let (desc, path) = delta_layer_writer
|
||||
.finish(delta_key.key_range.end, ctx)
|
||||
.await?;
|
||||
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
|
||||
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
|
||||
}
|
||||
|
||||
@@ -1951,6 +2073,13 @@ impl Timeline {
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
}
|
||||
match val {
|
||||
Value::Image(_) => stat.visit_image_key(&val),
|
||||
Value::WalRecord(_) => stat.visit_wal_key(&val),
|
||||
}
|
||||
if last_key.is_none() || last_key.as_ref() == Some(&key) {
|
||||
if last_key.is_none() {
|
||||
last_key = Some(key);
|
||||
@@ -1958,6 +2087,7 @@ impl Timeline {
|
||||
accumulated_values.push((key, lsn, val));
|
||||
} else {
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
*last_key,
|
||||
@@ -1974,6 +2104,7 @@ impl Timeline {
|
||||
*last_key,
|
||||
&mut delta_values,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -1986,6 +2117,8 @@ impl Timeline {
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
false,
|
||||
)
|
||||
.await?,
|
||||
@@ -1998,6 +2131,7 @@ impl Timeline {
|
||||
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
last_key,
|
||||
@@ -2014,6 +2148,7 @@ impl Timeline {
|
||||
last_key,
|
||||
&mut delta_values,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2026,6 +2161,8 @@ impl Timeline {
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
true,
|
||||
)
|
||||
.await?,
|
||||
@@ -2033,12 +2170,28 @@ impl Timeline {
|
||||
assert!(delta_values.is_empty(), "unprocessed keys");
|
||||
|
||||
let image_layer = if discard_image_layer {
|
||||
stat.discard_image_layer();
|
||||
None
|
||||
} else if let Some(writer) = image_layer_writer {
|
||||
Some(writer.finish(self, ctx).await?)
|
||||
stat.produce_image_layer(writer.size());
|
||||
if !dry_run {
|
||||
Some(writer.finish(self, ctx).await?)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)?
|
||||
);
|
||||
|
||||
if dry_run {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
"produced {} delta layers and {} image layers",
|
||||
delta_layers.len(),
|
||||
@@ -2062,10 +2215,13 @@ impl Timeline {
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
compact_to.extend(image_layer);
|
||||
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
|
||||
guard
|
||||
.open_mut()?
|
||||
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
|
||||
};
|
||||
self.remote_client
|
||||
.schedule_compaction_update(&layer_selection, &compact_to)?;
|
||||
@@ -2145,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
self.flush_updates().await?;
|
||||
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let layer_map = guard.layer_map();
|
||||
let layer_map = guard.layer_map()?;
|
||||
|
||||
let result = layer_map
|
||||
.iter_historic_layers()
|
||||
@@ -2268,9 +2424,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
))
|
||||
});
|
||||
|
||||
let new_delta_layer = writer
|
||||
.finish(prev.unwrap().0.next(), &self.timeline, ctx)
|
||||
.await?;
|
||||
let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
|
||||
let new_delta_layer =
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
|
||||
|
||||
self.new_deltas.push(new_delta_layer);
|
||||
Ok(())
|
||||
|
||||
@@ -230,6 +230,8 @@ impl DeleteTimelineFlow {
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
tenant.gc_block.before_delete(&timeline);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashSet, sync::Arc};
|
||||
|
||||
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
|
||||
use crate::{
|
||||
@@ -74,6 +74,11 @@ impl From<crate::tenant::upload_queue::NotInitialized> for Error {
|
||||
Error::ShuttingDown
|
||||
}
|
||||
}
|
||||
impl From<super::layer_manager::Shutdown> for Error {
|
||||
fn from(_: super::layer_manager::Shutdown) -> Self {
|
||||
Error::ShuttingDown
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FlushLayerError> for Error {
|
||||
fn from(value: FlushLayerError) -> Self {
|
||||
@@ -141,50 +146,9 @@ pub(super) async fn prepare(
|
||||
}
|
||||
}
|
||||
|
||||
// detached has previously been detached; let's inspect each of the current timelines and
|
||||
// report back the timelines which have been reparented by our detach
|
||||
let mut all_direct_children = tenant
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
|
||||
.map(|tl| (tl.ancestor_lsn, tl.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut any_shutdown = false;
|
||||
|
||||
all_direct_children.retain(
|
||||
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
|
||||
Ok(accessor) => accessor
|
||||
.latest_uploaded_index_part()
|
||||
.lineage
|
||||
.is_reparented(),
|
||||
Err(_shutdownalike) => {
|
||||
// not 100% a shutdown, but let's bail early not to give inconsistent results in
|
||||
// sharded environment.
|
||||
any_shutdown = true;
|
||||
true
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
if any_shutdown {
|
||||
// it could be one or many being deleted; have client retry
|
||||
return Err(Error::ShuttingDown);
|
||||
}
|
||||
|
||||
let mut reparented = all_direct_children;
|
||||
// why this instead of hashset? there is a reason, but I've forgotten it many times.
|
||||
//
|
||||
// maybe if this was a hashset we would not be able to distinguish some race condition.
|
||||
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
|
||||
|
||||
let reparented_timelines = reparented_direct_children(detached, tenant)?;
|
||||
return Ok(Progress::Done(AncestorDetached {
|
||||
reparented_timelines: reparented
|
||||
.into_iter()
|
||||
.map(|(_, tl)| tl.timeline_id)
|
||||
.collect(),
|
||||
reparented_timelines,
|
||||
}));
|
||||
};
|
||||
|
||||
@@ -277,7 +241,7 @@ pub(super) async fn prepare(
|
||||
|
||||
// between retries, these can change if compaction or gc ran in between. this will mean
|
||||
// we have to redo work.
|
||||
partition_work(ancestor_lsn, &layers)
|
||||
partition_work(ancestor_lsn, &layers)?
|
||||
};
|
||||
|
||||
// TODO: layers are already sorted by something: use that to determine how much of remote
|
||||
@@ -381,16 +345,67 @@ pub(super) async fn prepare(
|
||||
Ok(Progress::Prepared(guard, prepared))
|
||||
}
|
||||
|
||||
fn reparented_direct_children(
|
||||
detached: &Arc<Timeline>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<HashSet<TimelineId>, Error> {
|
||||
let mut all_direct_children = tenant
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter_map(|tl| {
|
||||
let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
|
||||
|
||||
if is_direct_child {
|
||||
Some(tl.clone())
|
||||
} else {
|
||||
if let Some(timeline) = tl.ancestor_timeline.as_ref() {
|
||||
assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
|
||||
}
|
||||
None
|
||||
}
|
||||
})
|
||||
// Collect to avoid lock taking order problem with Tenant::timelines and
|
||||
// Timeline::remote_client
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut any_shutdown = false;
|
||||
|
||||
all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
|
||||
Ok(accessor) => accessor
|
||||
.latest_uploaded_index_part()
|
||||
.lineage
|
||||
.is_reparented(),
|
||||
Err(_shutdownalike) => {
|
||||
// not 100% a shutdown, but let's bail early not to give inconsistent results in
|
||||
// sharded environment.
|
||||
any_shutdown = true;
|
||||
true
|
||||
}
|
||||
});
|
||||
|
||||
if any_shutdown {
|
||||
// it could be one or many being deleted; have client retry
|
||||
return Err(Error::ShuttingDown);
|
||||
}
|
||||
|
||||
Ok(all_direct_children
|
||||
.into_iter()
|
||||
.map(|tl| tl.timeline_id)
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn partition_work(
|
||||
ancestor_lsn: Lsn,
|
||||
source_layermap: &LayerManager,
|
||||
) -> (usize, Vec<Layer>, Vec<Layer>) {
|
||||
source: &LayerManager,
|
||||
) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> {
|
||||
let mut straddling_branchpoint = vec![];
|
||||
let mut rest_of_historic = vec![];
|
||||
|
||||
let mut later_by_lsn = 0;
|
||||
|
||||
for desc in source_layermap.layer_map().iter_historic_layers() {
|
||||
for desc in source.layer_map()?.iter_historic_layers() {
|
||||
// off by one chances here:
|
||||
// - start is inclusive
|
||||
// - end is exclusive
|
||||
@@ -409,10 +424,10 @@ fn partition_work(
|
||||
&mut rest_of_historic
|
||||
};
|
||||
|
||||
target.push(source_layermap.get_from_desc(&desc));
|
||||
target.push(source.get_from_desc(&desc));
|
||||
}
|
||||
|
||||
(later_by_lsn, straddling_branchpoint, rest_of_historic)
|
||||
Ok((later_by_lsn, straddling_branchpoint, rest_of_historic))
|
||||
}
|
||||
|
||||
async fn upload_rewritten_layer(
|
||||
@@ -488,10 +503,12 @@ async fn copy_lsn_prefix(
|
||||
// reuse the key instead of adding more holes between layers by using the real
|
||||
// highest key in the layer.
|
||||
let reused_highest_key = layer.layer_desc().key_range.end;
|
||||
let copied = writer
|
||||
.finish(reused_highest_key, target_timeline, ctx)
|
||||
let (desc, path) = writer
|
||||
.finish(reused_highest_key, ctx)
|
||||
.await
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
|
||||
tracing::debug!(%layer, %copied, "new layer produced");
|
||||
|
||||
@@ -537,11 +554,12 @@ pub(super) async fn complete(
|
||||
tenant: &Tenant,
|
||||
prepared: PreparedTimelineDetach,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<Vec<TimelineId>, anyhow::Error> {
|
||||
) -> Result<HashSet<TimelineId>, anyhow::Error> {
|
||||
let PreparedTimelineDetach { layers } = prepared;
|
||||
|
||||
let ancestor = detached
|
||||
.get_ancestor_timeline()
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.expect("must still have an ancestor");
|
||||
let ancestor_lsn = detached.get_ancestor_lsn();
|
||||
|
||||
@@ -581,7 +599,7 @@ pub(super) async fn complete(
|
||||
}
|
||||
|
||||
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
|
||||
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
|
||||
let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
|
||||
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
|
||||
|
||||
let is_deleting = tl
|
||||
@@ -622,13 +640,18 @@ pub(super) async fn complete(
|
||||
});
|
||||
|
||||
let reparenting_candidates = tasks.len();
|
||||
let mut reparented = Vec::with_capacity(tasks.len());
|
||||
let mut reparented = HashSet::with_capacity(tasks.len());
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
match res {
|
||||
Ok(Some(timeline)) => {
|
||||
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
|
||||
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
|
||||
|
||||
assert!(
|
||||
reparented.insert(timeline.timeline_id),
|
||||
"duplicate reparenting? timeline_id={}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
}
|
||||
Ok(None) => {
|
||||
// let's just ignore this for now. one or all reparented timelines could have
|
||||
@@ -650,12 +673,5 @@ pub(super) async fn complete(
|
||||
tracing::info!("failed to reparent some candidates");
|
||||
}
|
||||
|
||||
reparented.sort_unstable();
|
||||
|
||||
let reparented = reparented
|
||||
.into_iter()
|
||||
.map(|(_, timeline_id)| timeline_id)
|
||||
.collect();
|
||||
|
||||
Ok(reparented)
|
||||
}
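Not part of the diff, but a note on the `Vec<TimelineId>` -> `HashSet<TimelineId>` change above: `HashSet::insert` returns `false` on duplicates, which is what the `assert!(reparented.insert(...))` relies on to catch a timeline being reparented twice. A minimal illustration:

use std::collections::HashSet;

fn main() {
    let mut reparented: HashSet<u64> = HashSet::new(); // u64 stands in for TimelineId
    assert!(reparented.insert(42)); // first reparenting is recorded
    assert!(!reparented.insert(42)); // a duplicate would trip the assert in the real code
}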
|
||||
|
||||
@@ -213,51 +213,45 @@ impl Timeline {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
for layer in layers.iter_historic_layers() {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
// disk_usage_eviction_task both select the same layers to be evicted, and
|
||||
// seemingly free up double the space. both succeeding is of no consequence.
|
||||
guard
|
||||
.likely_resident_layers()
|
||||
.filter(|layer| {
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
|
||||
if !layer.is_likely_resident() {
|
||||
continue;
|
||||
}
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
||||
// happen if there is an access between us getting `now`, and us getting
|
||||
// the access stats from the layer.
|
||||
//
|
||||
// The other reason why it can happen is system clock skew because
|
||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
||||
// to the layer after we get `now` at the beginning of this function,
|
||||
// it could be that `now` < `last_activity_ts`.
|
||||
//
|
||||
// To distinguish the cases, we would need to record `Instant`s in the
|
||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
||||
// values in the access stats would need to be `Instant`'s, and hence
|
||||
// they would be meaningless outside of the pageserver process.
|
||||
// At the time of writing, the trade-off is that access stats are more
|
||||
// valuable than detecting clock skew.
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
Err(_e) => {
|
||||
// We reach here if `now` < `last_activity_ts`, which can legitimately
|
||||
// happen if there is an access between us getting `now`, and us getting
|
||||
// the access stats from the layer.
|
||||
//
|
||||
// The other reason why it can happen is system clock skew because
|
||||
// SystemTime::now() is not monotonic, so, even if there is no access
|
||||
// to the layer after we get `now` at the beginning of this function,
|
||||
// it could be that `now` < `last_activity_ts`.
|
||||
//
|
||||
// To distinguish the cases, we would need to record `Instant`s in the
|
||||
// access stats (i.e., monotonic timestamps), but then, the timestamps
|
||||
// values in the access stats would need to be `Instant`'s, and hence
|
||||
// they would be meaningless outside of the pageserver process.
|
||||
// At the time of writing, the trade-off is that access stats are more
|
||||
// valuable than detecting clock skew.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if no_activity_for > p.threshold {
|
||||
no_activity_for > p.threshold
|
||||
})
|
||||
.cloned()
|
||||
.for_each(|layer| {
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.evict_and_wait(std::time::Duration::from_secs(5))
|
||||
.await
|
||||
});
|
||||
stats.candidates += 1;
|
||||
}
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
let join_all = async move {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
@@ -24,39 +24,142 @@ use crate::{
|
||||
use super::TimelineWriterState;
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
#[derive(Default)]
pub(crate) struct LayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
pub(crate) enum LayerManager {
/// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
/// the layers.
Open(OpenLayerManager),
/// Shutdown layer manager where there are no more in-memory layers and persistent layers are
/// read-only.
Closed {
layers: HashMap<PersistentLayerKey, Layer>,
},
}
|
||||
impl Default for LayerManager {
|
||||
fn default() -> Self {
|
||||
LayerManager::Open(OpenLayerManager::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.layers()
|
||||
.get(key)
|
||||
.with_context(|| format!("get layer from key: {key}"))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
|
||||
self.layer_fmgr.get_from_key(desc)
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||
self.get_from_key(&desc.key())
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the layer map.
|
||||
///
|
||||
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
|
||||
pub(crate) fn layer_map(&self) -> &LayerMap {
|
||||
&self.layer_map
|
||||
pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
|
||||
Closed { .. } => Err(Shutdown),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
|
||||
use LayerManager::*;
|
||||
|
||||
match self {
|
||||
Open(open) => Ok(open),
|
||||
Closed { .. } => Err(Shutdown),
|
||||
}
|
||||
}
|
||||
|
||||
/// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
|
||||
/// order to allow shutdown to complete.
|
||||
///
|
||||
/// If there was a want to flush in-memory layers, it must have happened earlier.
|
||||
pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
Open(OpenLayerManager {
|
||||
layer_map,
|
||||
layer_fmgr: LayerFileManager(hashmap),
|
||||
}) => {
|
||||
let open = layer_map.open_layer.take();
|
||||
let frozen = layer_map.frozen_layers.len();
|
||||
let taken_writer_state = writer_state.take();
|
||||
tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
|
||||
let layers = std::mem::take(hashmap);
|
||||
*self = Closed { layers };
|
||||
assert_eq!(open.is_some(), taken_writer_state.is_some());
|
||||
}
|
||||
Closed { .. } => {
|
||||
tracing::debug!("ignoring multiple shutdowns on layer manager")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sum up the historic layer sizes
|
||||
pub(crate) fn layer_size_sum(&self) -> u64 {
|
||||
self.layers()
|
||||
.values()
|
||||
.map(|l| l.layer_desc().file_size)
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
|
||||
self.layers().values().filter(|l| l.is_likely_resident())
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.layers().contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layers().keys().cloned().collect_vec()
|
||||
}
|
||||
|
||||
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
|
||||
use LayerManager::*;
|
||||
match self {
|
||||
Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
|
||||
Closed { layers } => layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct OpenLayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager<Layer>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for OpenLayerManager {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("OpenLayerManager")
|
||||
.field("layer_count", &self.layer_fmgr.0.len())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("layer manager has been shutdown")]
|
||||
pub(crate) struct Shutdown;
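Illustrative sketch, not part of the diff: once the manager is `Closed`, both `layer_map()` and `open_mut()` return `Err(Shutdown)`, and call sites simply propagate it with `?` (see `guard.layer_map()?` and `guard.open_mut()?` in the hunks above). A condensed stand-in showing the same pattern:

#[derive(Debug)]
struct Shutdown;

enum Manager {
    Open(Vec<&'static str>), // stand-in for OpenLayerManager
    Closed,
}

impl Manager {
    fn layer_map(&self) -> Result<&Vec<&'static str>, Shutdown> {
        match self {
            Manager::Open(layers) => Ok(layers),
            Manager::Closed => Err(Shutdown),
        }
    }
}

fn count_layers(m: &Manager) -> Result<usize, Shutdown> {
    Ok(m.layer_map()?.len()) // `?` surfaces Shutdown exactly like the real call sites
}

fn main() {
    assert_eq!(count_layers(&Manager::Open(vec!["delta", "image"])).unwrap(), 2);
    assert!(count_layers(&Manager::Closed).is_err());
}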
|
||||
|
||||
impl OpenLayerManager {
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with disk_consistent_lsn LSN)
|
||||
pub(crate) fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Layer>,
|
||||
next_open_layer_at: Lsn,
|
||||
) {
|
||||
pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in on_disk_layers {
|
||||
for layer in layers {
|
||||
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
@@ -68,26 +171,19 @@ impl LayerManager {
|
||||
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
|
||||
}
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the
|
||||
/// current open layer, called within `get_layer_for_write`.
|
||||
pub(crate) async fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
gate_guard: utils::sync::gate::GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<InMemoryLayer>> {
|
||||
) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
ensure!(
|
||||
lsn > last_record_lsn,
|
||||
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
|
||||
lsn,
|
||||
last_record_lsn,
|
||||
);
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
let layer = if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
if open_layer.get_lsn_range().start > lsn {
|
||||
@@ -113,8 +209,15 @@ impl LayerManager {
|
||||
lsn
|
||||
);
|
||||
|
||||
let new_layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
|
||||
let new_layer = InMemoryLayer::create(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_lsn,
|
||||
gate_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let layer = Arc::new(new_layer);
|
||||
|
||||
self.layer_map.open_layer = Some(layer.clone());
|
||||
@@ -168,7 +271,7 @@ impl LayerManager {
|
||||
froze
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
/// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
|
||||
pub(crate) fn track_new_image_layers(
|
||||
&mut self,
|
||||
image_layers: &[ResidentLayer],
|
||||
@@ -241,7 +344,7 @@ impl LayerManager {
|
||||
self.finish_compact_l0(compact_from, compact_to, metrics)
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
/// Called post-compaction when some previous generation image layers were trimmed.
|
||||
pub(crate) fn rewrite_layers(
|
||||
&mut self,
|
||||
rewrite_layers: &[(Layer, ResidentLayer)],
|
||||
@@ -259,13 +362,10 @@ impl LayerManager {
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
|
||||
// always marking rewritten layers as visible.
|
||||
new_layer
|
||||
.as_ref()
|
||||
.access_stats()
|
||||
.set_visibility(old_layer.access_stats().visibility());
|
||||
new_layer.as_ref().set_visibility(old_layer.visibility());
|
||||
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
@@ -333,31 +433,6 @@ impl LayerManager {
|
||||
mapping.remove(layer);
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
|
||||
// for small layer maps, we most likely have all resident, but for larger more are likely
|
||||
// to be evicted assuming lots of layers correlated with longer lifespan.
|
||||
|
||||
self.layer_map().iter_historic_layers().filter_map(|desc| {
|
||||
self.layer_fmgr
|
||||
.0
|
||||
.get(&desc.key())
|
||||
.filter(|l| l.is_likely_resident())
|
||||
.cloned()
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
|
||||
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.layer_fmgr.contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
|
||||
self.layer_fmgr.0.keys().cloned().collect_vec()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
@@ -369,24 +444,6 @@ impl<T> Default for LayerFileManager<T> {
|
||||
}
|
||||
|
||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
.get(key)
|
||||
.with_context(|| format!("get layer from key: {}", key))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
self.get_from_key(&desc.key())
|
||||
}
|
||||
|
||||
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.0.contains_key(key)
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: T) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
@@ -394,10 +451,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &T) -> bool {
|
||||
self.0.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: &T) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
|
||||
@@ -122,6 +122,10 @@ impl CurrentLogicalSize {
|
||||
Self::Exact(_) => Accuracy::Exact,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_exact(&self) -> bool {
|
||||
matches!(self, Self::Exact(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl LogicalSize {
|
||||
|
||||
@@ -30,10 +30,12 @@ use tokio::time::Instant;
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
pub(crate) mod io_engine;
|
||||
pub use io_engine::feature_test as io_engine_feature_test;
|
||||
pub use io_engine::io_engine_for_bench;
|
||||
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
|
||||
mod metadata;
|
||||
mod open_options;
|
||||
use self::owned_buffers_io::write::OwnedAsyncWriter;
|
||||
pub(crate) use api::DirectIoMode;
|
||||
pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use metadata::Metadata;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
@@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
|
||||
.join()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// For use in benchmark binaries only.
///
/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
/// developer time trying to figure out why it's slow.
///
/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
pub fn io_engine_for_bench() -> IoEngineKind {
    #[cfg(not(target_os = "linux"))]
    {
        panic!("This benchmark does I/O and can only give a representative result on Linux");
    }
    #[cfg(target_os = "linux")]
    {
        match feature_test().unwrap() {
            FeatureTestResult::PlatformPreferred(engine) => engine,
            FeatureTestResult::Worse {
                engine: _engine,
                remark,
            } => {
                panic!("This benchmark does I/O and requires the preferred I/O engine: {remark}");
            }
        }
    }
}
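Not part of the diff: a minimal sketch of how a benchmark binary might call this helper before doing any I/O; the body is a placeholder, not an API from this repository.

fn main() {
    // Panics on non-Linux hosts or when only a slower engine is available, so a
    // degraded configuration can never skew benchmark results silently.
    let engine = io_engine_for_bench();
    // ... pass `engine` to whatever initializes virtual_file for the benchmark ...
    let _ = engine;
}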
|
||||
|
||||
@@ -45,6 +45,7 @@ static const char *jwt_token = NULL;
|
||||
/* GUCs */
|
||||
static char *ConsoleURL = NULL;
|
||||
static bool ForwardDDL = true;
|
||||
static bool RegressTestMode = false;
|
||||
|
||||
/*
|
||||
* CURL docs say that this buffer must exist until we call curl_easy_cleanup
|
||||
@@ -802,6 +803,14 @@ NeonProcessUtility(
|
||||
case T_DropRoleStmt:
|
||||
HandleDropRole(castNode(DropRoleStmt, parseTree));
|
||||
break;
|
||||
case T_CreateTableSpaceStmt:
|
||||
if (!RegressTestMode)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("CREATE TABLESPACE is not supported on Neon")));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -864,6 +873,18 @@ InitControlPlaneConnector()
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"neon.regress_test_mode",
|
||||
"Controls whether we are running in the regression test mode",
|
||||
NULL,
|
||||
&RegressTestMode,
|
||||
false,
|
||||
PGC_SUSET,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
|
||||
if (!jwt_token)
|
||||
{
|
||||
|
||||
poetry.lock (generated, 184 changed lines)
@@ -1,91 +1,103 @@
|
||||
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
version = "2.3.5"
|
||||
description = "Happy Eyeballs for asyncio"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
|
||||
{file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
version = "3.9.4"
|
||||
version = "3.10.2"
|
||||
description = "Async http client/server framework (asyncio)"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
|
||||
{file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"},
|
||||
{file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"},
|
||||
{file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"},
|
||||
{file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"},
|
||||
{file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"},
|
||||
{file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"},
|
||||
{file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aiohappyeyeballs = ">=2.3.0"
|
||||
aiosignal = ">=1.1.2"
|
||||
async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
|
||||
attrs = ">=17.3.0"
|
||||
@@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0"
|
||||
yarl = ">=1.0,<2.0"
|
||||
|
||||
[package.extras]
|
||||
speedups = ["Brotli", "aiodns", "brotlicffi"]
|
||||
speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
|
||||
|
||||
[[package]]
|
||||
name = "aiopg"
|
||||
@@ -1514,6 +1526,20 @@ files = [
|
||||
[package.dependencies]
|
||||
six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "kafka-python"
|
||||
version = "2.0.2"
|
||||
description = "Pure Python client for Apache Kafka"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
|
||||
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
crc32c = ["crc32c"]
|
||||
|
||||
[[package]]
|
||||
name = "lazy-object-proxy"
|
||||
version = "1.10.0"
|
||||
@@ -3357,4 +3383,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
|
||||
content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "proxy"
|
||||
name = "proxy-core"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
@@ -9,8 +9,11 @@ default = []
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
proxy-sasl = { version = "0.1", path = "../sasl" }
|
||||
|
||||
ahash.workspace = true
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
atomic-take.workspace = true
|
||||
@@ -30,7 +33,6 @@ dashmap.workspace = true
|
||||
env_logger.workspace = true
|
||||
framed-websockets.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
hashlink.workspace = true
|
||||
hex.workspace = true
|
||||
@@ -51,17 +53,15 @@ md5.workspace = true
|
||||
measured = { workspace = true, features = ["lasso"] }
|
||||
metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
parquet.workspace = true
|
||||
parquet_derive.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
pq_proto.workspace = true
|
||||
prometheus.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
remote_storage = { version = "0.1", path = "../../libs/remote_storage/" }
|
||||
reqwest.workspace = true
|
||||
reqwest-middleware = { workspace = true, features = ["json"] }
|
||||
reqwest-retry.workspace = true
|
||||
@@ -73,14 +73,13 @@ rustls.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sha2 = { workspace = true, features = ["asm"] }
|
||||
sha2 = { workspace = true, features = ["asm", "oid"] }
|
||||
smol_str.workspace = true
|
||||
smallvec.workspace = true
|
||||
socket2.workspace = true
|
||||
subtle.workspace = true
|
||||
task-local-extensions.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
@@ -92,6 +91,7 @@ tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
tracing.workspace = true
|
||||
try-lock.workspace = true
|
||||
typed-json.workspace = true
|
||||
url.workspace = true
|
||||
urlencoding.workspace = true
|
||||
@@ -102,6 +102,14 @@ x509-parser.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
redis.workspace = true
|
||||
|
||||
# jwt stuff
|
||||
jose-jwa = "0.1.2"
|
||||
jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
|
||||
signature = "2"
|
||||
ecdsa = "0.16"
|
||||
p256 = "0.13"
|
||||
rsa = "0.9"
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
@@ -38,7 +38,7 @@ pub enum AuthErrorImpl {
|
||||
|
||||
/// SASL protocol errors (includes [SCRAM](crate::scram)).
|
||||
#[error(transparent)]
|
||||
Sasl(#[from] crate::sasl::Error),
|
||||
Sasl(#[from] proxy_sasl::sasl::Error),
|
||||
|
||||
#[error("Unsupported authentication method: {0}")]
|
||||
BadAuthMethod(Box<str>),
|
||||
@@ -148,3 +148,28 @@ impl ReportableError for AuthError {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UserFacingError for proxy_sasl::sasl::Error {
|
||||
fn to_string_client(&self) -> String {
|
||||
match self {
|
||||
proxy_sasl::sasl::Error::ChannelBindingFailed(m) => m.to_string(),
|
||||
proxy_sasl::sasl::Error::ChannelBindingBadMethod(m) => {
|
||||
format!("unsupported channel binding method {m}")
|
||||
}
|
||||
_ => "authentication protocol violation".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ReportableError for proxy_sasl::sasl::Error {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
proxy_sasl::sasl::Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
|
||||
proxy_sasl::sasl::Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
|
||||
proxy_sasl::sasl::Error::BadClientMessage(_) => crate::error::ErrorKind::User,
|
||||
proxy_sasl::sasl::Error::MissingBinding => crate::error::ErrorKind::Service,
|
||||
proxy_sasl::sasl::Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
|
||||
proxy_sasl::sasl::Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
mod classic;
|
||||
mod hacks;
|
||||
pub mod jwt;
|
||||
mod link;
|
||||
|
||||
use std::net::IpAddr;
|
||||
@@ -8,6 +9,7 @@ use std::time::Duration;
|
||||
|
||||
use ipnet::{Ipv4Net, Ipv6Net};
|
||||
pub use link::LinkAuthError;
|
||||
use proxy_sasl::scram;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
use tracing::{info, warn};
|
||||
@@ -35,7 +37,7 @@ use crate::{
|
||||
},
|
||||
stream, url,
|
||||
};
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
||||
use crate::{EndpointCacheKey, EndpointId, RoleName};
|
||||
|
||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
||||
pub enum MaybeOwned<'a, T> {
|
||||
@@ -218,7 +220,7 @@ impl RateBucketInfo {
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
secret: AuthSecret,
|
||||
endpoint: &EndpointId,
|
||||
@@ -243,7 +245,7 @@ impl AuthenticationConfig {
|
||||
let limit_not_exceeded = self.rate_limiter.check(
|
||||
(
|
||||
endpoint_int,
|
||||
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
|
||||
MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
|
||||
),
|
||||
password_weight,
|
||||
);
|
||||
@@ -274,7 +276,7 @@ impl AuthenticationConfig {
|
||||
///
|
||||
/// All authentication flows will emit an AuthenticationOk message if successful.
|
||||
async fn auth_quirks(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
api: &impl console::Api,
|
||||
user_info: ComputeUserInfoMaybeEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
@@ -303,8 +305,8 @@ async fn auth_quirks(
|
||||
let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
|
||||
|
||||
// check allowed list
|
||||
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
|
||||
if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
|
||||
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
|
||||
}
|
||||
|
||||
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
|
||||
@@ -356,7 +358,7 @@ async fn auth_quirks(
|
||||
}
|
||||
|
||||
async fn authenticate_with_secret(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
secret: AuthSecret,
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
@@ -370,8 +372,8 @@ async fn authenticate_with_secret(
|
||||
let auth_outcome =
|
||||
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
|
||||
let keys = match auth_outcome {
|
||||
crate::sasl::Outcome::Success(key) => key,
|
||||
crate::sasl::Outcome::Failure(reason) => {
|
||||
proxy_sasl::sasl::Outcome::Success(key) => key,
|
||||
proxy_sasl::sasl::Outcome::Failure(reason) => {
|
||||
info!("auth backend failed with an error: {reason}");
|
||||
return Err(auth::AuthError::auth_failed(&*info.user));
|
||||
}
|
||||
@@ -421,7 +423,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
@@ -467,7 +469,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
pub async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
@@ -478,7 +480,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
|
||||
pub async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
@@ -492,7 +494,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
@@ -514,7 +516,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
@@ -557,9 +559,9 @@ mod tests {
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
|
||||
scram::{threadpool::ThreadPool, ServerSecret},
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
use proxy_sasl::scram::{threadpool::ThreadPool, ServerSecret};
|
||||
|
||||
use super::{auth_quirks, AuthRateLimiter};
|
||||
|
||||
@@ -571,7 +573,7 @@ mod tests {
|
||||
impl console::Api for Auth {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_ctx: &RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
|
||||
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
|
||||
@@ -579,7 +581,7 @@ mod tests {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_ctx: &RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
|
||||
{
|
||||
@@ -591,7 +593,7 @@ mod tests {
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_ctx: &RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
unimplemented!()
|
||||
@@ -665,10 +667,14 @@ mod tests {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
secret: AuthSecret::Scram(
|
||||
ServerSecret::build_test_secret("my-secret-password")
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
@@ -723,7 +729,7 @@ mod tests {
|
||||
));
|
||||
|
||||
let _creds = auth_quirks(
|
||||
&mut ctx,
|
||||
&ctx,
|
||||
&api,
|
||||
user_info,
|
||||
&mut stream,
|
||||
@@ -742,10 +748,14 @@ mod tests {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
secret: AuthSecret::Scram(
|
||||
ServerSecret::build_test_secret("my-secret-password")
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
@@ -775,7 +785,7 @@ mod tests {
|
||||
));
|
||||
|
||||
let _creds = auth_quirks(
|
||||
&mut ctx,
|
||||
&ctx,
|
||||
&api,
|
||||
user_info,
|
||||
&mut stream,
|
||||
@@ -794,10 +804,14 @@ mod tests {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
secret: AuthSecret::Scram(
|
||||
ServerSecret::build_test_secret("my-secret-password")
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
@@ -828,7 +842,7 @@ mod tests {
|
||||
));
|
||||
|
||||
let creds = auth_quirks(
|
||||
&mut ctx,
|
||||
&ctx,
|
||||
&api,
|
||||
user_info,
|
||||
&mut stream,
|
||||
@@ -5,14 +5,14 @@ use crate::{
|
||||
config::AuthenticationConfig,
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
sasl,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
use proxy_sasl::sasl;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
creds: ComputeUserInfo,
|
||||
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
config: &'static AuthenticationConfig,
|
||||
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
AuthSecret::Scram(secret) => {
|
||||
info!("auth endpoint chooses SCRAM");
|
||||
let scram = auth::Scram(&secret, &mut *ctx);
|
||||
let scram = auth::Scram(&secret, ctx);
|
||||
|
||||
let auth_outcome = tokio::time::timeout(
|
||||
config.scram_protocol_timeout,
|
||||
@@ -7,9 +7,9 @@ use crate::{
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
intern::EndpointIdInt,
|
||||
sasl,
|
||||
stream::{self, Stream},
|
||||
};
|
||||
use proxy_sasl::sasl;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
@@ -18,7 +18,7 @@ use tracing::{info, warn};
|
||||
/// These properties are beneficial for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn authenticate_cleartext(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
secret: AuthSecret,
|
||||
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let ep = EndpointIdInt::from(&info.endpoint);
|
||||
|
||||
@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
|
||||
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
||||
/// and passwords are not yet validated (we don't know how to validate them!)
|
||||
pub async fn password_hack_no_authentication(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
proxy/core/src/auth/backend/jwt.rs (new file, 554 lines)
@@ -0,0 +1,554 @@
|
||||
use std::{future::Future, sync::Arc, time::Duration};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use arc_swap::ArcSwapOption;
|
||||
use dashmap::DashMap;
|
||||
use jose_jwk::crypto::KeyInfo;
|
||||
use signature::Verifier;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
|
||||
|
||||
// TODO(conrad): make these configurable.
|
||||
const MIN_RENEW: Duration = Duration::from_secs(30);
|
||||
const AUTO_RENEW: Duration = Duration::from_secs(300);
|
||||
const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||
|
||||
/// How to get the JWT auth rules
|
||||
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
|
||||
}
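// Not part of the diff: a minimal sketch of what an implementor of FetchAuthRules
// could look like, e.g. serving a fixed JWKS URL in tests. `StaticAuthRules` is a
// made-up name for illustration; it is not defined anywhere in this repository.
//
//     #[derive(Clone)]
//     struct StaticAuthRules {
//         jwks_url: url::Url,
//     }
//
//     impl FetchAuthRules for StaticAuthRules {
//         async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
//             Ok(AuthRules {
//                 jwks_urls: vec![self.jwks_url.clone()],
//             })
//         }
//     }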
|
||||
|
||||
#[derive(Clone)]
|
||||
struct FetchAuthRulesFromCplane {
|
||||
#[allow(dead_code)]
|
||||
endpoint: EndpointIdInt,
|
||||
}
|
||||
|
||||
impl FetchAuthRules for FetchAuthRulesFromCplane {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Err(anyhow::anyhow!("not yet implemented"))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AuthRules {
|
||||
jwks_urls: Vec<url::Url>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
|
||||
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntryLock {
|
||||
cached: ArcSwapOption<JwkCacheEntry>,
|
||||
lookup: tokio::sync::Semaphore,
|
||||
}
|
||||
|
||||
impl Default for JwkCacheEntryLock {
|
||||
fn default() -> Self {
|
||||
JwkCacheEntryLock {
|
||||
cached: ArcSwapOption::empty(),
|
||||
lookup: tokio::sync::Semaphore::new(1),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen, but only every 5 minutes or so.
|
||||
last_retrieved: Instant,
|
||||
|
||||
/// cplane will return multiple JWKs urls that we need to scrape.
|
||||
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntryLock {
|
||||
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
|
||||
JwkRenewalPermit::acquire_permit(self).await
|
||||
}
|
||||
|
||||
fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
|
||||
JwkRenewalPermit::try_acquire_permit(self)
|
||||
}
|
||||
|
||||
async fn renew_jwks<F: FetchAuthRules>(
|
||||
&self,
|
||||
_permit: JwkRenewalPermit<'_>,
|
||||
client: &reqwest::Client,
|
||||
auth_rules: &F,
|
||||
) -> anyhow::Result<Arc<JwkCacheEntry>> {
|
||||
// double check that no one beat us to updating the cache.
|
||||
let now = Instant::now();
|
||||
let guard = self.cached.load_full();
|
||||
if let Some(cached) = guard {
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
if last_update < Duration::from_secs(300) {
|
||||
return Ok(cached);
|
||||
}
|
||||
}
|
||||
|
||||
let rules = auth_rules.fetch_auth_rules().await?;
|
||||
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
|
||||
rules.jwks_urls.len(),
|
||||
ahash::RandomState::new(),
|
||||
);
|
||||
// TODO(conrad): run concurrently
|
||||
for url in rules.jwks_urls {
|
||||
let req = client.get(url.clone());
|
||||
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
|
||||
match req.send().await.and_then(|r| r.error_for_status()) {
|
||||
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
|
||||
// I expect these failures would be quite sparse.
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
|
||||
Ok(r) => {
|
||||
let resp: http::Response<reqwest::Body> = r.into();
|
||||
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
|
||||
resp.into_body(),
|
||||
MAX_JWK_BODY_SIZE,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
|
||||
Ok(jwks) => {
|
||||
key_sets.insert(url, jwks);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let entry = Arc::new(JwkCacheEntry {
|
||||
last_retrieved: now,
|
||||
key_sets,
|
||||
});
|
||||
self.cached.swap(Some(Arc::clone(&entry)));
|
||||
|
||||
Ok(entry)
|
||||
}
|
||||
|
||||
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
client: &reqwest::Client,
|
||||
fetch: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
|
||||
let now = Instant::now();
|
||||
let guard = self.cached.load_full();
|
||||
|
||||
// if we have no cached JWKs, try and get some
|
||||
let Some(cached) = guard else {
|
||||
let permit = self.acquire_permit().await;
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
};
|
||||
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
|
||||
// check if the cached JWKs need updating.
|
||||
if last_update > MAX_RENEW {
|
||||
let permit = self.acquire_permit().await;
|
||||
|
||||
// it's been too long since we checked the keys. wait for them to update.
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
}
|
||||
|
||||
// every 5 minutes we should spawn a job to eagerly update the token.
|
||||
if last_update > AUTO_RENEW {
|
||||
if let Some(permit) = self.try_acquire_permit() {
|
||||
tracing::debug!("JWKs should be renewed. Renewal permit acquired");
|
||||
let permit = permit.into_owned();
|
||||
let entry = self.clone();
|
||||
let client = client.clone();
|
||||
let fetch = fetch.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
|
||||
tracing::warn!(error=?e, "could not fetch JWKs in background job");
|
||||
}
|
||||
});
|
||||
} else {
|
||||
tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(cached)
|
||||
}
|
||||
|
||||
async fn check_jwt<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
jwt: String,
|
||||
client: &reqwest::Client,
|
||||
fetch: &F,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// JWT compact form is defined to be
|
||||
// <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
|
||||
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
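// Not part of the diff: a tiny illustration of the compact-form split performed below.
// The token pieces here are placeholder strings, not a real signed JWT:
//
//     let jwt = "hdr.claims.sig";
//     let (header_payload, signature) = jwt.rsplit_once('.').unwrap();
//     assert_eq!(header_payload, "hdr.claims"); // the exact bytes that were signed
//     assert_eq!(signature, "sig");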
|
||||
|
||||
let (header_payload, signature) = jwt
|
||||
.rsplit_once(".")
|
||||
.context("not a valid compact JWT encoding")?;
|
||||
let (header, _payload) = header_payload
|
||||
.split_once(".")
|
||||
.context("not a valid compact JWT encoding")?;
|
||||
|
||||
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
|
||||
.context("not a valid compact JWT encoding")?;
|
||||
let header = serde_json::from_slice::<JWTHeader>(&header)
|
||||
.context("not a valid compact JWT encoding")?;
|
||||
|
||||
ensure!(header.typ == "JWT");
|
||||
let kid = header.kid.context("missing key id")?;
|
||||
|
||||
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let jwk = loop {
|
||||
let jwk = guard
|
||||
.key_sets
|
||||
.values()
|
||||
.flat_map(|jwks| &jwks.keys)
|
||||
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
|
||||
|
||||
match jwk {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let permit = self.acquire_permit().await;
|
||||
guard = self.renew_jwks(permit, client, fetch).await?;
|
||||
}
|
||||
_ => {
|
||||
bail!("jwk not found");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
ensure!(
|
||||
jwk.is_supported(&header.alg),
|
||||
"signature algorithm not supported"
|
||||
);
|
||||
|
||||
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
|
||||
.context("not a valid compact JWT encoding")?;
|
||||
match &jwk.key {
|
||||
jose_jwk::Key::Ec(key) => {
|
||||
verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
|
||||
}
|
||||
jose_jwk::Key::Rsa(key) => {
|
||||
verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
|
||||
}
|
||||
key => bail!("unsupported key type {key:?}"),
|
||||
};
|
||||
|
||||
// TODO(conrad): verify iss, exp, nbf, etc...
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl JwkCache {
|
||||
pub async fn check_jwt(
|
||||
&self,
|
||||
endpoint: EndpointIdInt,
|
||||
jwt: String,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// try with just a read lock first
|
||||
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
|
||||
let entry = match entry {
|
||||
Some(entry) => entry,
|
||||
None => {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(endpoint).or_default();
|
||||
Arc::clone(&*entry)
|
||||
}
|
||||
};
|
||||
|
||||
let fetch = FetchAuthRulesFromCplane { endpoint };
|
||||
entry.check_jwt(jwt, &self.client, &fetch).await
|
||||
}
|
||||
}
|
||||
|
||||
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
    use ecdsa::Signature;
    use signature::Verifier;

    match key.crv {
        jose_jwk::EcCurves::P256 => {
            let pk =
                p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
            let key = p256::ecdsa::VerifyingKey::from(&pk);
            let sig = Signature::from_slice(sig)?;
            key.verify(data, &sig)?;
        }
        key => bail!("unsupported ec key type {key:?}"),
    }

    Ok(())
}

fn verify_rsa_signature(
    data: &[u8],
    sig: &[u8],
    key: &jose_jwk::Rsa,
    alg: &Option<jose_jwa::Algorithm>,
) -> anyhow::Result<()> {
    use jose_jwa::{Algorithm, Signing};
    use rsa::{
        pkcs1v15::{Signature, VerifyingKey},
        RsaPublicKey,
    };

    let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;

    match alg {
        Some(Algorithm::Signing(Signing::Rs256)) => {
            let key = VerifyingKey::<sha2::Sha256>::new(key);
            let sig = Signature::try_from(sig)?;
            key.verify(data, &sig)?;
        }
        _ => bail!("invalid RSA signing algorithm"),
    };

    Ok(())
}

/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
#[derive(serde::Deserialize, serde::Serialize)]
struct JWTHeader<'a> {
    /// must be "JWT"
    typ: &'a str,
    /// must be a supported alg
    alg: jose_jwa::Algorithm,
    /// key id, must be provided for our usecase
    kid: Option<&'a str>,
}

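Hypothetical illustration (struct name and JSON payload below are made up) of what the borrowed &'a str fields buy: the header can be deserialized zero-copy straight out of the base64-decoded buffer, as long as the strings contain no escape sequences.

#[derive(serde::Deserialize)]
struct Header<'a> {
    typ: &'a str,
    kid: Option<&'a str>,
}

fn main() {
    // Unknown fields such as "alg" are ignored by default.
    let raw = br#"{"typ":"JWT","alg":"ES256","kid":"key-1"}"#;
    let header: Header = serde_json::from_slice(raw).unwrap();
    assert_eq!(header.typ, "JWT");
    assert_eq!(header.kid, Some("key-1"));
}
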
struct JwkRenewalPermit<'a> {
    inner: Option<JwkRenewalPermitInner<'a>>,
}

enum JwkRenewalPermitInner<'a> {
    Owned(Arc<JwkCacheEntryLock>),
    Borrowed(&'a Arc<JwkCacheEntryLock>),
}

impl JwkRenewalPermit<'_> {
    fn into_owned(mut self) -> JwkRenewalPermit<'static> {
        JwkRenewalPermit {
            inner: self.inner.take().map(JwkRenewalPermitInner::into_owned),
        }
    }

    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
        match from.lookup.acquire().await {
            Ok(permit) => {
                permit.forget();
                JwkRenewalPermit {
                    inner: Some(JwkRenewalPermitInner::Borrowed(from)),
                }
            }
            Err(_) => panic!("semaphore should not be closed"),
        }
    }

    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
        match from.lookup.try_acquire() {
            Ok(permit) => {
                permit.forget();
                Some(JwkRenewalPermit {
                    inner: Some(JwkRenewalPermitInner::Borrowed(from)),
                })
            }
            Err(tokio::sync::TryAcquireError::NoPermits) => None,
            Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"),
        }
    }
}

impl JwkRenewalPermitInner<'_> {
    fn into_owned(self) -> JwkRenewalPermitInner<'static> {
        match self {
            JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p),
            JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)),
        }
    }
}

impl Drop for JwkRenewalPermit<'_> {
    fn drop(&mut self) {
        let entry = match &self.inner {
            None => return,
            Some(JwkRenewalPermitInner::Owned(p)) => p,
            Some(JwkRenewalPermitInner::Borrowed(p)) => *p,
        };
        entry.lookup.add_permits(1);
    }
}

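The permit types above amount to a single-flight guard around JWKS renewal. A reduced sketch, assuming `lookup` is a one-permit tokio::sync::Semaphore as the code suggests (struct and function names are illustrative): forget() detaches the permit so it can be carried across awaits inside the wrapper, and Drop hands it back with add_permits(1).

use std::sync::Arc;
use tokio::sync::Semaphore;

// Owns the right to renew; gives the permit back when dropped.
struct RenewalPermit {
    sem: Arc<Semaphore>,
}

impl Drop for RenewalPermit {
    fn drop(&mut self) {
        self.sem.add_permits(1);
    }
}

async fn acquire(sem: &Arc<Semaphore>) -> RenewalPermit {
    let permit = sem.acquire().await.expect("semaphore should not be closed");
    permit.forget();
    RenewalPermit { sem: Arc::clone(sem) }
}

#[tokio::main]
async fn main() {
    let sem = Arc::new(Semaphore::new(1));
    {
        let _p = acquire(&sem).await;
        // only one task may renew the JWKS at a time
        assert_eq!(sem.available_permits(), 0);
    }
    assert_eq!(sem.available_permits(), 1);
}
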
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
|
||||
|
||||
use base64::URL_SAFE_NO_PAD;
|
||||
use bytes::Bytes;
|
||||
use http::Response;
|
||||
use http_body_util::Full;
|
||||
use hyper1::service::service_fn;
|
||||
use hyper_util::rt::TokioIo;
|
||||
use rand::rngs::OsRng;
|
||||
use signature::Signer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
|
||||
let sk = p256::SecretKey::random(&mut OsRng);
|
||||
let pk = sk.public_key().into();
|
||||
let jwk = jose_jwk::Jwk {
|
||||
key: jose_jwk::Key::Ec(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
(sk, jwk)
|
||||
}
|
||||
|
||||
fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
|
||||
let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
|
||||
let pk = sk.to_public_key().into();
|
||||
let jwk = jose_jwk::Jwk {
|
||||
key: jose_jwk::Key::Rsa(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
(sk, jwk)
|
||||
}
|
||||
|
||||
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
|
||||
let header = JWTHeader {
|
||||
typ: "JWT",
|
||||
alg: jose_jwa::Algorithm::Signing(sig),
|
||||
kid: Some(&kid),
|
||||
};
|
||||
let body = typed_json::json! {{
|
||||
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
|
||||
}};
|
||||
|
||||
let header =
|
||||
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
|
||||
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{header}.{body}")
|
||||
}
|
||||
|
||||
fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
|
||||
let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
|
||||
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
|
||||
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
|
||||
use rsa::pkcs1v15::SigningKey;
|
||||
use rsa::signature::SignatureEncoding;
|
||||
|
||||
let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
|
||||
let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
|
||||
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn renew() {
|
||||
let (rs1, jwk1) = new_rsa_jwk("1".into());
|
||||
let (rs2, jwk2) = new_rsa_jwk("2".into());
|
||||
let (ec1, jwk3) = new_ec_jwk("3".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("4".into());
|
||||
|
||||
let jwt1 = new_rsa_jwt("1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("3".into(), ec1);
|
||||
let jwt4 = new_ec_jwt("4".into(), ec2);
|
||||
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
let bar_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk2, jwk4],
|
||||
};
|
||||
|
||||
let service = service_fn(move |req| {
|
||||
let foo_jwks = foo_jwks.clone();
|
||||
let bar_jwks = bar_jwks.clone();
|
||||
async move {
|
||||
let jwks = match req.uri().path() {
|
||||
"/foo" => &foo_jwks,
|
||||
"/bar" => &bar_jwks,
|
||||
_ => {
|
||||
return Response::builder()
|
||||
.status(404)
|
||||
.body(Full::new(Bytes::new()));
|
||||
}
|
||||
};
|
||||
let body = serde_json::to_vec(jwks).unwrap();
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.body(Full::new(Bytes::from(body)))
|
||||
}
|
||||
});
|
||||
|
||||
let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
|
||||
let server = hyper1::server::conn::http1::Builder::new();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
let (s, _) = listener.accept().await.unwrap();
|
||||
let serve = server.serve_connection(TokioIo::new(s), service.clone());
|
||||
tokio::spawn(serve.into_future());
|
||||
}
|
||||
});
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Fetch(SocketAddr);
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Ok(AuthRules {
|
||||
jwks_urls: vec![
|
||||
format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
jwk_cache
|
||||
.check_jwt(jwt1, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt2, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt3, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt4, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
|
||||
}
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<NodeInfo> {
|
||||
@@ -84,7 +84,7 @@ pub fn endpoint_sni(
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
pub fn parse(
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
params: &StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_names: Option<&HashSet<String>>,
|
||||
@@ -249,8 +249,8 @@ mod tests {
|
||||
fn parse_bare_minimum() -> anyhow::Result<()> {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id, None);
|
||||
|
||||
@@ -264,8 +264,8 @@ mod tests {
|
||||
("database", "world"), // should be ignored
|
||||
("foo", "bar"), // should be ignored
|
||||
]);
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id, None);
|
||||
|
||||
@@ -279,9 +279,9 @@ mod tests {
|
||||
let sni = Some("foo.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
|
||||
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
|
||||
@@ -296,8 +296,8 @@ mod tests {
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
|
||||
|
||||
@@ -311,8 +311,8 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
|
||||
|
||||
@@ -329,8 +329,8 @@ mod tests {
|
||||
),
|
||||
]);
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert!(user_info.endpoint_id.is_none());
|
||||
|
||||
@@ -344,8 +344,8 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
|
||||
]);
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert!(user_info.endpoint_id.is_none());
|
||||
|
||||
@@ -359,9 +359,9 @@ mod tests {
|
||||
let sni = Some("baz.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
|
||||
|
||||
@@ -374,16 +374,16 @@ mod tests {
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.a.com");
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
|
||||
|
||||
Ok(())
|
||||
@@ -397,10 +397,9 @@ mod tests {
|
||||
let sni = Some("second.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let err =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -417,10 +416,9 @@ mod tests {
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let err =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
UnknownCommonName { cn } => {
|
||||
assert_eq!(cn, "localhost");
|
||||
@@ -438,9 +436,9 @@ mod tests {
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
|
||||
assert_eq!(
|
||||
user_info.options.get_cache_key("project"),
|
||||
@@ -2,16 +2,17 @@
|
||||
|
||||
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
|
||||
use crate::{
|
||||
config::TlsServerEndPoint,
|
||||
console::AuthSecret,
|
||||
context::RequestMonitoring,
|
||||
intern::EndpointIdInt,
|
||||
sasl,
|
||||
scram::{self, threadpool::ThreadPool},
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
|
||||
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
|
||||
use proxy_sasl::{
|
||||
sasl,
|
||||
scram::{self, threadpool::ThreadPool, TlsServerEndPoint},
|
||||
};
|
||||
use std::{io, sync::Arc};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
@@ -27,7 +28,7 @@ pub trait AuthMethod {
|
||||
pub struct Begin;
|
||||
|
||||
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
|
||||
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
|
||||
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
|
||||
|
||||
impl AuthMethod for Scram<'_> {
|
||||
#[inline(always)]
|
||||
@@ -56,7 +57,7 @@ impl AuthMethod for PasswordHack {
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword {
|
||||
pub pool: Arc<ThreadPool>,
|
||||
pub pool: Arc<ThreadPool<EndpointIdInt>>,
|
||||
pub endpoint: EndpointIdInt,
|
||||
pub secret: AuthSecret,
|
||||
}
|
||||
@@ -155,7 +156,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
let Scram(secret, ctx) = self.state;
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
|
||||
// Initial client message contains the chosen auth method's name.
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
@@ -168,15 +169,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
}
|
||||
|
||||
match sasl.method {
|
||||
SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
|
||||
SCRAM_SHA_256_PLUS => {
|
||||
ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
|
||||
}
|
||||
SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
|
||||
SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
|
||||
_ => {}
|
||||
}
|
||||
info!("client chooses {}", sasl.method);
|
||||
|
||||
let outcome = sasl::SaslStream::new(self.stream, sasl.message)
|
||||
let outcome = sasl::SaslStream::new(&mut self.stream.framed, sasl.message)
|
||||
.authenticate(scram::Exchange::new(
|
||||
secret,
|
||||
rand::random,
|
||||
@@ -193,7 +192,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
}
|
||||
|
||||
pub(crate) async fn validate_password_and_exchange(
|
||||
pool: &ThreadPool,
|
||||
pool: &ThreadPool<EndpointIdInt>,
|
||||
endpoint: EndpointIdInt,
|
||||
password: &[u8],
|
||||
secret: AuthSecret,
|
||||
@@ -208,7 +207,8 @@ pub(crate) async fn validate_password_and_exchange(
|
||||
}
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
|
||||
let outcome =
|
||||
proxy_sasl::scram::exchange(pool, endpoint, &scram_secret, password).await?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
@@ -68,7 +68,7 @@ impl EndpointsCache {
|
||||
ready: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
if !self.ready.load(Ordering::Acquire) {
|
||||
return true;
|
||||
}
|
||||
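For context, a small sketch of the fail-open readiness gate used by is_valid above (struct and helper names are illustrative): until the cache has been primed, every endpoint is accepted so that a cold cache can never reject traffic.

use std::sync::atomic::{AtomicBool, Ordering};

struct Cache {
    ready: AtomicBool,
}

impl Cache {
    // `known` stands in for whatever lookup the real cache performs.
    fn is_valid(&self, known: bool) -> bool {
        if !self.ready.load(Ordering::Acquire) {
            return true;
        }
        known
    }

    fn mark_ready(&self) {
        self.ready.store(true, Ordering::Release);
    }
}

fn main() {
    let cache = Cache { ready: AtomicBool::new(false) };
    assert!(cache.is_valid(false)); // not primed yet: fail open
    cache.mark_ready();
    assert!(!cache.is_valid(false)); // primed: trust the lookup
}
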
@@ -371,7 +371,8 @@ impl Cache for ProjectInfoCacheImpl {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{scram::ServerSecret, ProjectId};
|
||||
use crate::ProjectId;
|
||||
use proxy_sasl::scram::ServerSecret;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_project_info_cache_settings() {
|
||||
@@ -103,8 +103,12 @@ impl ConnCfg {
|
||||
|
||||
/// Reuse password or auth keys from the other config.
|
||||
pub fn reuse_password(&mut self, other: Self) {
|
||||
if let Some(password) = other.get_auth() {
|
||||
self.auth(password);
|
||||
if let Some(password) = other.get_password() {
|
||||
self.password(password);
|
||||
}
|
||||
|
||||
if let Some(keys) = other.get_auth_keys() {
|
||||
self.auth_keys(keys);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,64 +124,48 @@ impl ConnCfg {
|
||||
|
||||
/// Apply startup message params to the connection config.
|
||||
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
let mut client_encoding = false;
|
||||
for (k, v) in params.iter() {
|
||||
match k {
|
||||
"user" => {
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if self.get_user().is_none() {
|
||||
self.user(v);
|
||||
}
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
|
||||
self.user(user);
|
||||
}
|
||||
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
|
||||
self.dbname(dbname);
|
||||
}
|
||||
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(params) {
|
||||
self.options(&options);
|
||||
}
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.application_name(app_name);
|
||||
}
|
||||
|
||||
// TODO: This is especially ugly...
|
||||
if let Some(replication) = params.get("replication") {
|
||||
use tokio_postgres::config::ReplicationMode;
|
||||
match replication {
|
||||
"true" | "on" | "yes" | "1" => {
|
||||
self.replication_mode(ReplicationMode::Physical);
|
||||
}
|
||||
"database" => {
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
if self.get_dbname().is_none() {
|
||||
self.dbname(v);
|
||||
}
|
||||
}
|
||||
"options" => {
|
||||
// Don't add `options` if they were only used for specifying a project.
|
||||
// Connection pools don't support `options`, because they affect backend startup.
|
||||
if let Some(options) = filtered_options(v) {
|
||||
self.options(&options);
|
||||
}
|
||||
}
|
||||
|
||||
// the special ones in tokio-postgres that we don't want being set by the user
|
||||
"dbname" => {}
|
||||
"password" => {}
|
||||
"sslmode" => {}
|
||||
"host" => {}
|
||||
"port" => {}
|
||||
"connect_timeout" => {}
|
||||
"keepalives" => {}
|
||||
"keepalives_idle" => {}
|
||||
"keepalives_interval" => {}
|
||||
"keepalives_retries" => {}
|
||||
"target_session_attrs" => {}
|
||||
"channel_binding" => {}
|
||||
"max_backend_message_size" => {}
|
||||
|
||||
"client_encoding" => {
|
||||
client_encoding = true;
|
||||
// only error should be from bad null bytes,
|
||||
// but we've already checked for those.
|
||||
_ = self.param("client_encoding", v);
|
||||
}
|
||||
|
||||
_ => {
|
||||
// only error should be from bad null bytes,
|
||||
// but we've already checked for those.
|
||||
_ = self.param(k, v);
|
||||
self.replication_mode(ReplicationMode::Logical);
|
||||
}
|
||||
_other => {}
|
||||
}
|
||||
}
|
||||
if !client_encoding {
|
||||
// for compatibility since we removed it from tokio-postgres
|
||||
self.param("client_encoding", "UTF8").unwrap();
|
||||
}
|
||||
|
||||
// TODO: extend the list of the forwarded startup parameters.
|
||||
// Currently, tokio-postgres doesn't allow us to pass
|
||||
// arbitrary parameters, but the ones above are a good start.
|
||||
//
|
||||
// This and the reverse params problem can be better addressed
|
||||
// in a bespoke connection machinery (a new library for that sake).
|
||||
}
|
||||
}
|
||||
|
||||
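Since the hunk above interleaves the old per-key match with the new lookup style, here is a condensed, hypothetical sketch of the forwarding rules it implements (the reserved list and function name are illustrative, not the crate's API): reserved tokio-postgres parameters are dropped, everything else is forwarded, and client_encoding defaults to UTF8 when the client did not send one.

// Forward startup parameters to the compute connection, skipping the ones
// tokio-postgres reserves, and default client_encoding for compatibility.
fn forward_params<'a>(
    params: impl IntoIterator<Item = (&'a str, &'a str)>,
) -> Vec<(String, String)> {
    const RESERVED: &[&str] = &["password", "sslmode", "host", "port", "connect_timeout"];
    let mut forwarded = Vec::new();
    let mut client_encoding = false;
    for (k, v) in params {
        if RESERVED.contains(&k) {
            continue;
        }
        if k == "client_encoding" {
            client_encoding = true;
        }
        forwarded.push((k.to_owned(), v.to_owned()));
    }
    if !client_encoding {
        // compatibility default, as in the original
        forwarded.push(("client_encoding".to_owned(), "UTF8".to_owned()));
    }
    forwarded
}

fn main() {
    let fwd = forward_params([("application_name", "psql"), ("sslmode", "require")]);
    assert!(fwd.iter().any(|(k, _)| k == "client_encoding"));
    assert!(!fwd.iter().any(|(k, _)| k == "sslmode"));
}
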
@@ -288,12 +276,12 @@ impl ConnCfg {
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
allow_self_signed_compute: bool,
|
||||
aux: MetricsAuxInfo,
|
||||
timeout: Duration,
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
|
||||
drop(pause);
|
||||
|
||||
@@ -316,14 +304,14 @@ impl ConnCfg {
|
||||
)?;
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let (client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
drop(pause);
|
||||
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
|
||||
let stream = connection.stream.into_inner();
|
||||
|
||||
info!(
|
||||
cold_start_info = ctx.cold_start_info.as_str(),
|
||||
cold_start_info = ctx.cold_start_info().as_str(),
|
||||
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
|
||||
self.0.get_ssl_mode()
|
||||
);
|
||||
@@ -342,7 +330,7 @@ impl ConnCfg {
|
||||
params,
|
||||
cancel_closure,
|
||||
aux,
|
||||
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
|
||||
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
|
||||
};
|
||||
|
||||
Ok(connection)
|
||||
@@ -350,9 +338,10 @@ impl ConnCfg {
|
||||
}
|
||||
|
||||
/// Retrieve `options` from a startup message, dropping all proxy-specific flags.
|
||||
fn filtered_options(options: &str) -> Option<String> {
|
||||
fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = StartupMessageParams::parse_options_raw(options)
|
||||
let options: String = params
|
||||
.options_raw()?
|
||||
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
@@ -424,23 +413,27 @@ mod tests {
|
||||
#[test]
|
||||
fn test_filtered_options() {
|
||||
// Empty options is unlikely to be useful anyway.
|
||||
assert_eq!(filtered_options(""), None);
|
||||
let params = StartupMessageParams::new([("options", "")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
|
||||
// It's likely that clients will only use options to specify endpoint/project.
|
||||
let params = "project=foo";
|
||||
assert_eq!(filtered_options(params), None);
|
||||
let params = StartupMessageParams::new([("options", "project=foo")]);
|
||||
assert_eq!(filtered_options(¶ms), None);
|
||||
|
||||
// Same, because unescaped whitespaces are no-op.
|
||||
let params = " project=foo ";
|
||||
assert_eq!(filtered_options(params), None);
|
||||
let params = StartupMessageParams::new([("options", " project=foo ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), None);
|
||||
|
||||
let params = r"\ project=foo \ ";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ "));
|
||||
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ "));
|
||||
|
||||
let params = "project = foo";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
|
||||
let params = StartupMessageParams::new([("options", "project = foo")]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
|
||||
let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2";
|
||||
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
|
||||
let params = StartupMessageParams::new([(
|
||||
"options",
|
||||
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
|
||||
)]);
|
||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||
}
|
||||
}
|
||||
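A hedged sketch of what filtered_options does with the options string (the real code honours backslash escapes and uses parse_endpoint_param/neon_option; the simple prefix checks below are stand-ins): proxy-specific key=value flags are removed and the remainder re-joined with single spaces.

use itertools::Itertools;

fn strip_neon_options(raw: &str) -> Option<String> {
    let kept = raw
        .split_whitespace() // simplification: real parsing honours backslash escapes
        .filter(|opt| !opt.starts_with("neon_") && !opt.starts_with("project="))
        .join(" ");
    (!kept.is_empty()).then_some(kept)
}

fn main() {
    assert_eq!(strip_neon_options("project=foo"), None);
    assert_eq!(
        strip_neon_options("-ckey=1 project=bar -c geqo=off").as_deref(),
        Some("-ckey=1 -c geqo=off")
    );
}
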
@@ -1,27 +1,26 @@
|
||||
use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
console::locks::ApiLocks,
|
||||
intern::EndpointIdInt,
|
||||
rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
|
||||
scram::threadpool::ThreadPool,
|
||||
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
Host,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use itertools::Itertools;
|
||||
use proxy_sasl::scram::{threadpool::ThreadPool, TlsServerEndPoint};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use rustls::{
|
||||
crypto::ring::sign,
|
||||
pki_types::{CertificateDer, PrivateKeyDer},
|
||||
};
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{error, info};
|
||||
use x509_parser::oid_registry;
|
||||
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
@@ -31,11 +30,8 @@ pub struct ProxyConfig {
|
||||
pub http_config: HttpConfig,
|
||||
pub authentication_config: AuthenticationConfig,
|
||||
pub require_client_ip: bool,
|
||||
pub disable_ip_check_for_http: bool,
|
||||
pub redis_rps_limit: Vec<RateBucketInfo>,
|
||||
pub region: String,
|
||||
pub handshake_timeout: Duration,
|
||||
pub aws_region: String,
|
||||
pub wake_compute_retry_config: RetryConfig,
|
||||
pub connect_compute_locks: ApiLocks<Host>,
|
||||
pub connect_to_compute_retry_config: RetryConfig,
|
||||
@@ -55,14 +51,13 @@ pub struct TlsConfig {
|
||||
}
|
||||
|
||||
pub struct HttpConfig {
|
||||
pub request_timeout: tokio::time::Duration,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
pub cancel_set: CancelSet,
|
||||
pub client_conn_threshold: u64,
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub thread_pool: Arc<ThreadPool>,
|
||||
pub thread_pool: Arc<ThreadPool<EndpointIdInt>>,
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
@@ -130,66 +125,6 @@ pub fn configure_tls(
|
||||
})
|
||||
}
|
||||
|
||||
/// Channel binding parameter
|
||||
///
|
||||
/// <https://www.rfc-editor.org/rfc/rfc5929#section-4>
|
||||
/// Description: The hash of the TLS server's certificate as it
|
||||
/// appears, octet for octet, in the server's Certificate message. Note
|
||||
/// that the Certificate message contains a certificate_list, in which
|
||||
/// the first element is the server's certificate.
|
||||
///
|
||||
/// The hash function is to be selected as follows:
|
||||
///
|
||||
/// * if the certificate's signatureAlgorithm uses a single hash
|
||||
/// function, and that hash function is either MD5 or SHA-1, then use SHA-256;
|
||||
///
|
||||
/// * if the certificate's signatureAlgorithm uses a single hash
|
||||
/// function and that hash function neither MD5 nor SHA-1, then use
|
||||
/// the hash function associated with the certificate's
|
||||
/// signatureAlgorithm;
|
||||
///
|
||||
/// * if the certificate's signatureAlgorithm uses no hash functions or
|
||||
/// uses multiple hash functions, then this channel binding type's
|
||||
/// channel bindings are undefined at this time (updates to is channel
|
||||
/// binding type may occur to address this issue if it ever arises).
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum TlsServerEndPoint {
|
||||
Sha256([u8; 32]),
|
||||
Undefined,
|
||||
}
|
||||
|
||||
impl TlsServerEndPoint {
|
||||
pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
|
||||
let sha256_oids = [
|
||||
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
|
||||
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
|
||||
oid_registry::OID_PKCS1_SHA256WITHRSA,
|
||||
];
|
||||
|
||||
let pem = x509_parser::parse_x509_certificate(cert)
|
||||
.context("Failed to parse PEM object from cerficiate")?
|
||||
.1;
|
||||
|
||||
info!(subject = %pem.subject, "parsing TLS certificate");
|
||||
|
||||
let reg = oid_registry::OidRegistry::default().with_all_crypto();
|
||||
let oid = pem.signature_algorithm.oid();
|
||||
let alg = reg.get(oid);
|
||||
if sha256_oids.contains(oid) {
|
||||
let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into();
|
||||
info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
|
||||
Ok(Self::Sha256(tls_server_end_point))
|
||||
} else {
|
||||
error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding");
|
||||
Ok(Self::Undefined)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn supported(&self) -> bool {
|
||||
!matches!(self, TlsServerEndPoint::Undefined)
|
||||
}
|
||||
}
|
||||
|
||||
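A small sketch of the RFC 5929 tls-server-end-point computation the type above models: for certificates whose signatureAlgorithm hashes with SHA-256 (or with MD5/SHA-1, which get upgraded to SHA-256), the channel binding is simply SHA-256 over the raw DER-encoded certificate bytes. The input below is demo data, not a real certificate.

use sha2::{Digest, Sha256};

fn tls_server_end_point_sha256(cert_der: &[u8]) -> [u8; 32] {
    Sha256::new().chain_update(cert_der).finalize().into()
}

fn main() {
    let fake_der = b"not a real certificate, just demo bytes";
    let binding = tls_server_end_point_sha256(fake_der);
    assert_eq!(binding.len(), 32);
}
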
#[derive(Default, Debug)]
|
||||
pub struct CertResolver {
|
||||
certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
|
||||
@@ -16,9 +16,10 @@ use crate::{
|
||||
intern::ProjectIdInt,
|
||||
metrics::ApiLockMetrics,
|
||||
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
|
||||
scram, EndpointCacheKey,
|
||||
EndpointCacheKey,
|
||||
};
|
||||
use dashmap::DashMap;
|
||||
use proxy_sasl::scram;
|
||||
use std::{hash::Hash, sync::Arc, time::Duration};
|
||||
use tokio::time::Instant;
|
||||
use tracing::info;
|
||||
@@ -292,7 +293,7 @@ pub struct NodeInfo {
|
||||
impl NodeInfo {
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
timeout: Duration,
|
||||
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
|
||||
self.config
|
||||
@@ -330,20 +331,20 @@ pub(crate) trait Api {
|
||||
/// We still have to mock the scram to avoid leaking information that the user doesn't exist.
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
|
||||
}
|
||||
@@ -363,7 +364,7 @@ pub enum ConsoleBackend {
|
||||
impl Api for ConsoleBackend {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
@@ -378,7 +379,7 @@ impl Api for ConsoleBackend {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
|
||||
use ConsoleBackend::*;
|
||||
@@ -393,7 +394,7 @@ impl Api for ConsoleBackend {
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
|
||||
use ConsoleBackend::*;
|
||||
@@ -469,15 +470,15 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
timeout: Duration,
|
||||
epoch: std::time::Duration,
|
||||
metrics: &'static ApiLockMetrics,
|
||||
) -> prometheus::Result<Self> {
|
||||
Ok(Self {
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
node_locks: DashMap::with_shard_amount(shards),
|
||||
config,
|
||||
timeout,
|
||||
epoch,
|
||||
metrics,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
|
||||
@@ -5,7 +5,7 @@ use super::{
|
||||
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
|
||||
};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, url::ApiUrl};
|
||||
use crate::{auth::IpPattern, cache::Cached};
|
||||
use crate::{
|
||||
console::{
|
||||
@@ -15,6 +15,7 @@ use crate::{
|
||||
BranchId, EndpointId, ProjectId,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use proxy_sasl::scram;
|
||||
use std::{str::FromStr, sync::Arc};
|
||||
use thiserror::Error;
|
||||
use tokio_postgres::{config::SslMode, Client};
|
||||
@@ -158,7 +159,7 @@ impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
Ok(CachedRoleSecret::new_uncached(
|
||||
@@ -168,7 +169,7 @@ impl super::Api for Api {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
Ok((
|
||||
@@ -182,7 +183,7 @@ impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_ctx: &RequestMonitoring,
|
||||
_user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
self.do_wake_compute().map_ok(Cached::new_uncached).await
|
||||
@@ -13,10 +13,11 @@ use crate::{
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
rate_limiter::WakeComputeRateLimiter,
|
||||
scram, EndpointCacheKey,
|
||||
EndpointCacheKey,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
use futures::TryFutureExt;
|
||||
use proxy_sasl::scram;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
@@ -57,7 +58,7 @@ impl Api {
|
||||
|
||||
async fn do_get_auth_info(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
if !self
|
||||
@@ -69,7 +70,7 @@ impl Api {
|
||||
info!("endpoint is not valid, skipping the request");
|
||||
return Ok(AuthInfo::default());
|
||||
}
|
||||
let request_id = ctx.session_id.to_string();
|
||||
let request_id = ctx.session_id().to_string();
|
||||
let application_name = ctx.console_application_name();
|
||||
async {
|
||||
let request = self
|
||||
@@ -77,7 +78,7 @@ impl Api {
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id)])
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.query(&[
|
||||
("application_name", application_name.as_str()),
|
||||
("project", user_info.endpoint.as_str()),
|
||||
@@ -87,7 +88,7 @@ impl Api {
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
@@ -130,10 +131,10 @@ impl Api {
|
||||
|
||||
async fn do_wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<NodeInfo, WakeComputeError> {
|
||||
let request_id = ctx.session_id.to_string();
|
||||
let request_id = ctx.session_id().to_string();
|
||||
let application_name = ctx.console_application_name();
|
||||
async {
|
||||
let mut request_builder = self
|
||||
@@ -141,7 +142,7 @@ impl Api {
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id)])
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.query(&[
|
||||
("application_name", application_name.as_str()),
|
||||
("project", user_info.endpoint.as_str()),
|
||||
@@ -156,7 +157,7 @@ impl Api {
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
@@ -192,7 +193,7 @@ impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
let normalized_ep = &user_info.endpoint.normalize();
|
||||
@@ -226,7 +227,7 @@ impl super::Api for Api {
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
let normalized_ep = &user_info.endpoint.normalize();
|
||||
@@ -268,7 +269,7 @@ impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
ctx: &RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
let key = user_info.endpoint_cache_key();
|
||||
@@ -7,13 +7,14 @@ use smol_str::SmolStr;
|
||||
use std::net::IpAddr;
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{field::display, info, info_span, Span};
|
||||
use try_lock::TryLock;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
console::messages::{ColdStartInfo, MetricsAuxInfo},
|
||||
error::ErrorKind,
|
||||
intern::{BranchIdInt, ProjectIdInt},
|
||||
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
|
||||
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting},
|
||||
DbName, EndpointId, RoleName,
|
||||
};
|
||||
|
||||
@@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>>
|
||||
///
|
||||
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
|
||||
/// All connection logic should instead use strongly typed state machines, not a bunch of Options.
|
||||
pub struct RequestMonitoring {
|
||||
pub struct RequestMonitoring(
|
||||
/// To allow easier use of the ctx object, we have interior mutability.
|
||||
/// I would typically use a RefCell but that would break the `Send` requirements
|
||||
/// so we need something with thread-safety. `TryLock` is a cheap alternative
|
||||
/// that offers similar semantics to a `RefCell` but with synchronisation.
|
||||
TryLock<RequestMonitoringInner>,
|
||||
);
|
||||
|
||||
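A minimal sketch of the interior-mutability shape introduced here, assuming try_lock::TryLock as imported above (struct and field names are illustrative): setters take &self, each call grabs the lock for a short critical section, and contention is treated as a logic error rather than something to block on, which is what keeps the type Send without a Mutex.

use try_lock::TryLock;

struct Ctx(TryLock<Inner>);

struct Inner {
    success: bool,
}

impl Ctx {
    fn new() -> Self {
        Ctx(TryLock::new(Inner { success: false }))
    }

    // Mutation through a shared reference: lock, write, drop the guard.
    fn set_success(&self) {
        let mut this = self.0.try_lock().expect("should not deadlock");
        this.success = true;
    }
}

fn main() {
    let ctx = Ctx::new();
    ctx.set_success();
    assert!(ctx.0.try_lock().expect("should not deadlock").success);
}
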
struct RequestMonitoringInner {
|
||||
pub peer_addr: IpAddr,
|
||||
pub session_id: Uuid,
|
||||
pub protocol: Protocol,
|
||||
@@ -85,7 +94,7 @@ impl RequestMonitoring {
|
||||
role = tracing::field::Empty,
|
||||
);
|
||||
|
||||
Self {
|
||||
let inner = RequestMonitoringInner {
|
||||
peer_addr,
|
||||
session_id,
|
||||
protocol,
|
||||
@@ -110,7 +119,9 @@ impl RequestMonitoring {
|
||||
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
|
||||
latency_timer: LatencyTimer::new(protocol),
|
||||
disconnect_timestamp: None,
|
||||
}
|
||||
};
|
||||
|
||||
Self(TryLock::new(inner))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -119,48 +130,177 @@ impl RequestMonitoring {
|
||||
}
|
||||
|
||||
pub fn console_application_name(&self) -> String {
|
||||
let this = self.0.try_lock().expect("should not deadlock");
|
||||
format!(
|
||||
"{}/{}",
|
||||
self.application.as_deref().unwrap_or_default(),
|
||||
self.protocol
|
||||
this.application.as_deref().unwrap_or_default(),
|
||||
this.protocol
|
||||
)
|
||||
}
|
||||
|
||||
pub fn set_rejected(&mut self, rejected: bool) {
|
||||
self.rejected = Some(rejected);
|
||||
pub fn set_rejected(&self, rejected: bool) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.rejected = Some(rejected);
|
||||
}
|
||||
|
||||
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
||||
pub fn set_cold_start_info(&self, info: ColdStartInfo) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_cold_start_info(info);
|
||||
}
|
||||
|
||||
pub fn set_db_options(&self, options: StartupMessageParams) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.set_application(options.get("application_name").map(SmolStr::from));
|
||||
if let Some(user) = options.get("user") {
|
||||
this.set_user(user.into());
|
||||
}
|
||||
if let Some(dbname) = options.get("database") {
|
||||
this.set_dbname(dbname.into());
|
||||
}
|
||||
|
||||
this.pg_options = Some(options);
|
||||
}
|
||||
|
||||
pub fn set_project(&self, x: MetricsAuxInfo) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
if this.endpoint_id.is_none() {
|
||||
this.set_endpoint_id(x.endpoint_id.as_str().into())
|
||||
}
|
||||
this.branch = Some(x.branch_id);
|
||||
this.project = Some(x.project_id);
|
||||
this.set_cold_start_info(x.cold_start_info);
|
||||
}
|
||||
|
||||
pub fn set_project_id(&self, project_id: ProjectIdInt) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.project = Some(project_id);
|
||||
}
|
||||
|
||||
pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_endpoint_id(endpoint_id);
|
||||
}
|
||||
|
||||
pub fn set_dbname(&self, dbname: DbName) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_dbname(dbname);
|
||||
}
|
||||
|
||||
pub fn set_user(&self, user: RoleName) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_user(user);
|
||||
}
|
||||
|
||||
pub fn set_auth_method(&self, auth_method: AuthMethod) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.auth_method = Some(auth_method);
|
||||
}
|
||||
|
||||
pub fn has_private_peer_addr(&self) -> bool {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.has_private_peer_addr()
|
||||
}
|
||||
|
||||
pub fn set_error_kind(&self, kind: ErrorKind) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
// Do not record errors from the private address to metrics.
|
||||
if !this.has_private_peer_addr() {
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
}
|
||||
if let Some(ep) = &this.endpoint_id {
|
||||
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
|
||||
let label = metric.with_labels(kind);
|
||||
metric.get_metric(label).measure(ep);
|
||||
}
|
||||
this.error_kind = Some(kind);
|
||||
}
|
||||
|
||||
pub fn set_success(&self) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.success = true;
|
||||
}
|
||||
|
||||
pub fn log_connect(&self) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.log_connect();
|
||||
}
|
||||
|
||||
pub fn protocol(&self) -> Protocol {
|
||||
self.0.try_lock().expect("should not deadlock").protocol
|
||||
}
|
||||
|
||||
pub fn span(&self) -> Span {
|
||||
self.0.try_lock().expect("should not deadlock").span.clone()
|
||||
}
|
||||
|
||||
pub fn session_id(&self) -> Uuid {
|
||||
self.0.try_lock().expect("should not deadlock").session_id
|
||||
}
|
||||
|
||||
pub fn peer_addr(&self) -> IpAddr {
|
||||
self.0.try_lock().expect("should not deadlock").peer_addr
|
||||
}
|
||||
|
||||
pub fn cold_start_info(&self) -> ColdStartInfo {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.cold_start_info
|
||||
}
|
||||
|
||||
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
|
||||
LatencyTimerPause {
|
||||
ctx: self,
|
||||
start: tokio::time::Instant::now(),
|
||||
waiting_for,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn success(&self) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.latency_timer
|
||||
.success()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LatencyTimerPause<'a> {
|
||||
ctx: &'a RequestMonitoring,
|
||||
start: tokio::time::Instant,
|
||||
waiting_for: Waiting,
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimerPause<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.ctx
|
||||
.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.latency_timer
|
||||
.unpause(self.start, self.waiting_for);
|
||||
}
|
||||
}
|
||||
|
||||
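The LatencyTimerPause guard above is a classic RAII pause. An illustrative, self-contained version of the same idea (type and field names are made up): the guard records when the pause started and, on drop, adds the elapsed time to an accumulator so that time spent waiting on the client, compute, or control plane is excluded from the proxy's own latency.

use std::time::{Duration, Instant};

struct Timer {
    excluded: Duration,
}

struct Pause<'a> {
    timer: &'a mut Timer,
    start: Instant,
}

impl Timer {
    fn pause(&mut self) -> Pause<'_> {
        Pause { timer: self, start: Instant::now() }
    }
}

impl Drop for Pause<'_> {
    fn drop(&mut self) {
        self.timer.excluded += self.start.elapsed();
    }
}

fn main() {
    let mut timer = Timer { excluded: Duration::ZERO };
    {
        let _pause = timer.pause();
        std::thread::sleep(Duration::from_millis(5));
    }
    assert!(timer.excluded >= Duration::from_millis(5));
}
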
impl RequestMonitoringInner {
|
||||
fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
||||
self.cold_start_info = info;
|
||||
self.latency_timer.cold_start_info(info);
|
||||
}
|
||||
|
||||
pub fn set_db_options(&mut self, options: StartupMessageParams) {
|
||||
self.set_application(options.get("application_name").map(SmolStr::from));
|
||||
if let Some(user) = options.get("user") {
|
||||
self.set_user(user.into());
|
||||
}
|
||||
if let Some(dbname) = options.get("database") {
|
||||
self.set_dbname(dbname.into());
|
||||
}
|
||||
|
||||
self.pg_options = Some(options);
|
||||
}
|
||||
|
||||
pub fn set_project(&mut self, x: MetricsAuxInfo) {
|
||||
if self.endpoint_id.is_none() {
|
||||
self.set_endpoint_id(x.endpoint_id.as_str().into())
|
||||
}
|
||||
self.branch = Some(x.branch_id);
|
||||
self.project = Some(x.project_id);
|
||||
self.set_cold_start_info(x.cold_start_info);
|
||||
}
|
||||
|
||||
pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
|
||||
self.project = Some(project_id);
|
||||
}
|
||||
|
||||
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
|
||||
fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
|
||||
if self.endpoint_id.is_none() {
|
||||
self.span.record("ep", display(&endpoint_id));
|
||||
let metric = &Metrics::get().proxy.connecting_endpoints;
|
||||
@@ -176,44 +316,23 @@ impl RequestMonitoring {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_dbname(&mut self, dbname: DbName) {
|
||||
fn set_dbname(&mut self, dbname: DbName) {
|
||||
self.dbname = Some(dbname);
|
||||
}
|
||||
|
||||
pub fn set_user(&mut self, user: RoleName) {
|
||||
fn set_user(&mut self, user: RoleName) {
|
||||
self.span.record("role", display(&user));
|
||||
self.user = Some(user);
|
||||
}
|
||||
|
||||
pub fn set_auth_method(&mut self, auth_method: AuthMethod) {
|
||||
self.auth_method = Some(auth_method);
|
||||
}
|
||||
|
||||
pub fn has_private_peer_addr(&self) -> bool {
|
||||
fn has_private_peer_addr(&self) -> bool {
|
||||
match self.peer_addr {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_error_kind(&mut self, kind: ErrorKind) {
|
||||
// Do not record errors from the private address to metrics.
|
||||
if !self.has_private_peer_addr() {
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
}
|
||||
if let Some(ep) = &self.endpoint_id {
|
||||
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
|
||||
let label = metric.with_labels(kind);
|
||||
metric.get_metric(label).measure(ep);
|
||||
}
|
||||
self.error_kind = Some(kind);
|
||||
}
|
||||
|
||||
pub fn set_success(&mut self) {
|
||||
self.success = true;
|
||||
}
|
||||
|
||||
pub fn log_connect(&mut self) {
|
||||
fn log_connect(&mut self) {
|
||||
let outcome = if self.success {
|
||||
ConnectOutcome::Success
|
||||
} else {
|
||||
@@ -256,7 +375,7 @@ impl RequestMonitoring {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RequestMonitoring {
|
||||
impl Drop for RequestMonitoringInner {
|
||||
fn drop(&mut self) {
|
||||
if self.sender.is_some() {
|
||||
self.log_connect();
|
||||
@@ -23,7 +23,7 @@ use utils::backoff;
|
||||
|
||||
use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT};
|
||||
|
||||
use super::{RequestMonitoring, LOG_CHAN};
|
||||
use super::{RequestMonitoringInner, LOG_CHAN};
|
||||
|
||||
#[derive(clap::Args, Clone, Debug)]
|
||||
pub struct ParquetUploadArgs {
|
||||
@@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RequestMonitoring> for RequestData {
|
||||
fn from(value: &RequestMonitoring) -> Self {
|
||||
impl From<&RequestMonitoringInner> for RequestData {
|
||||
fn from(value: &RequestMonitoringInner) -> Self {
|
||||
Self {
|
||||
session_id: value.session_id,
|
||||
peer_addr: value.peer_addr.to_string(),
|
||||
@@ -6,6 +6,12 @@ pub mod health_server;
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use http_body_util::BodyExt;
|
||||
use hyper1::body::Body;
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
@@ -96,6 +102,33 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
|
||||
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
|
||||
limit: usize,
|
||||
) -> anyhow::Result<D> {
|
||||
// We could use `b.limited().collect().await.to_bytes()` here
|
||||
// but this ends up being slightly more efficient as far as I can tell.
|
||||
|
||||
// check the lower bound of the size hint.
|
||||
// in reqwest, this value is influenced by the Content-Length header.
|
||||
let lower_bound = match usize::try_from(b.size_hint().lower()) {
|
||||
Ok(bound) if bound <= limit => bound,
|
||||
_ => bail!("content length exceeds limit"),
|
||||
};
|
||||
let mut bytes = Vec::with_capacity(lower_bound);
|
||||
|
||||
while let Some(frame) = b.frame().await.transpose()? {
|
||||
if let Ok(data) = frame.into_data() {
|
||||
if bytes.len() + data.len() > limit {
|
||||
bail!("content length exceeds limit")
|
||||
}
|
||||
bytes.extend_from_slice(&data);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(serde_json::from_slice::<D>(&bytes)?)
|
||||
}
|
||||
|
||||
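A self-contained sketch of the same size guard applied to plain byte chunks instead of a hyper Body (function name and error strings are illustrative): reject early on the declared size hint, then keep checking the running total as chunks arrive so an understated Content-Length cannot force an oversized allocation.

fn collect_with_limit<'a>(
    size_hint: usize,
    chunks: impl IntoIterator<Item = &'a [u8]>,
    limit: usize,
) -> Result<Vec<u8>, &'static str> {
    // The size hint mirrors the Content-Length lower bound used above.
    if size_hint > limit {
        return Err("content length exceeds limit");
    }
    let mut bytes = Vec::with_capacity(size_hint);
    for chunk in chunks {
        if bytes.len() + chunk.len() > limit {
            return Err("content length exceeds limit");
        }
        bytes.extend_from_slice(chunk);
    }
    Ok(bytes)
}

fn main() {
    assert!(collect_with_limit(4, [b"1234".as_slice()], 8).is_ok());
    assert!(collect_with_limit(4, [b"1234".as_slice(), b"56789".as_slice()], 8).is_err());
}
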
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -21,13 +21,13 @@ pub mod intern;
|
||||
pub mod jemalloc;
|
||||
pub mod logging;
|
||||
pub mod metrics;
|
||||
pub mod parse;
|
||||
// pub mod parse;
|
||||
pub mod protocol2;
|
||||
pub mod proxy;
|
||||
pub mod rate_limiter;
|
||||
pub mod redis;
|
||||
pub mod sasl;
|
||||
pub mod scram;
|
||||
// pub mod sasl;
|
||||
// pub mod scram;
|
||||
pub mod serverless;
|
||||
pub mod stream;
|
||||
pub mod url;
|
||||
@@ -2,13 +2,14 @@ use std::sync::{Arc, OnceLock};

use lasso::ThreadedRodeo;
use measured::{
    label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
    label::StaticLabelSet,
    metric::{histogram::Thresholds, name::MetricName},
    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
    LabelGroup, MetricGroup,
    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
    MetricGroup,
};
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};

use proxy_sasl::scram::threadpool::ThreadPoolMetrics;
use tokio::time::{self, Instant};

use crate::console::messages::ColdStartInfo;
@@ -370,6 +371,7 @@ pub struct CancellationRequest {
    pub kind: CancellationOutcome,
}

#[derive(Clone, Copy)]
pub enum Waiting {
    Cplane,
    Client,
@@ -398,12 +400,6 @@ pub struct LatencyTimer {
    outcome: ConnectOutcome,
}

pub struct LatencyTimerPause<'a> {
    timer: &'a mut LatencyTimer,
    start: time::Instant,
    waiting_for: Waiting,
}

impl LatencyTimer {
    pub fn new(protocol: Protocol) -> Self {
        Self {
@@ -417,11 +413,13 @@ impl LatencyTimer {
        }
    }

    pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
        LatencyTimerPause {
            timer: self,
            start: Instant::now(),
            waiting_for,
    pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) {
        let dur = start.elapsed();
        match waiting_for {
            Waiting::Cplane => self.accumulated.cplane += dur,
            Waiting::Client => self.accumulated.client += dur,
            Waiting::Compute => self.accumulated.compute += dur,
            Waiting::RetryTimeout => self.accumulated.retry += dur,
        }
    }

@@ -438,18 +436,6 @@ impl LatencyTimer {
    }
}

impl Drop for LatencyTimerPause<'_> {
    fn drop(&mut self) {
        let dur = self.start.elapsed();
        match self.waiting_for {
            Waiting::Cplane => self.timer.accumulated.cplane += dur,
            Waiting::Client => self.timer.accumulated.client += dur,
            Waiting::Compute => self.timer.accumulated.compute += dur,
            Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
        }
    }
}

#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
pub enum ConnectOutcome {
    Success,
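The hunks above drop the `LatencyTimerPause<'_>` guard that borrowed `&mut LatencyTimer` and replace `pause` with an explicit `unpause(start, waiting_for)`, which fits a request context that is now shared rather than exclusively borrowed. A rough sketch of that shape, assuming a `Mutex`-wrapped accumulator and placeholder names rather than the proxy's actual types:

```rust
use std::sync::Mutex;
use std::time::{Duration, Instant};

#[derive(Clone, Copy)]
enum Waiting {
    Client,
    Compute,
}

#[derive(Default)]
struct Accumulated {
    client: Duration,
    compute: Duration,
}

/// Stand-in for the shared request context: the timer sits behind a lock.
struct Ctx {
    timer: Mutex<Accumulated>,
}

/// Guard returned by `latency_timer_pause`; it no longer borrows the timer mutably,
/// only remembering when the pause began and what it was waiting for.
struct Pause<'a> {
    ctx: &'a Ctx,
    start: Instant,
    waiting: Waiting,
}

impl Ctx {
    fn latency_timer_pause(&self, waiting: Waiting) -> Pause<'_> {
        Pause { ctx: self, start: Instant::now(), waiting }
    }

    // Mirrors the new `unpause`: the caller supplies when the pause began.
    fn unpause(&self, start: Instant, waiting: Waiting) {
        let dur = start.elapsed();
        let mut acc = self.timer.lock().unwrap();
        match waiting {
            Waiting::Client => acc.client += dur,
            Waiting::Compute => acc.compute += dur,
        }
    }
}

impl Drop for Pause<'_> {
    fn drop(&mut self) {
        // Feed the elapsed time back through the shared context on drop.
        self.ctx.unpause(self.start, self.waiting);
    }
}
```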
@@ -561,78 +547,3 @@ pub enum RedisEventsCount {
    PasswordUpdate,
    AllowedIpsUpdate,
}

pub struct ThreadPoolWorkers(usize);
pub struct ThreadPoolWorkerId(pub usize);

impl LabelValue for ThreadPoolWorkerId {
    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
        v.write_int(self.0 as i64)
    }
}

impl LabelGroup for ThreadPoolWorkerId {
    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
        v.write_value(LabelName::from_str("worker"), self);
    }
}

impl LabelGroupSet for ThreadPoolWorkers {
    type Group<'a> = ThreadPoolWorkerId;

    fn cardinality(&self) -> Option<usize> {
        Some(self.0)
    }

    fn encode_dense(&self, value: Self::Unique) -> Option<usize> {
        Some(value)
    }

    fn decode_dense(&self, value: usize) -> Self::Group<'_> {
        ThreadPoolWorkerId(value)
    }

    type Unique = usize;

    fn encode(&self, value: Self::Group<'_>) -> Option<Self::Unique> {
        Some(value.0)
    }

    fn decode(&self, value: &Self::Unique) -> Self::Group<'_> {
        ThreadPoolWorkerId(*value)
    }
}

impl LabelSet for ThreadPoolWorkers {
    type Value<'a> = ThreadPoolWorkerId;

    fn dynamic_cardinality(&self) -> Option<usize> {
        Some(self.0)
    }

    fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
        (value.0 < self.0).then_some(value.0)
    }

    fn decode(&self, value: usize) -> Self::Value<'_> {
        ThreadPoolWorkerId(value)
    }
}

impl FixedCardinalitySet for ThreadPoolWorkers {
    fn cardinality(&self) -> usize {
        self.0
    }
}

#[derive(MetricGroup)]
#[metric(new(workers: usize))]
pub struct ThreadPoolMetrics {
    pub injector_queue_depth: Gauge,
    #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
    pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
    pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
    pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
}
@@ -113,18 +113,18 @@ pub async fn task_main(
            }
        };

        let mut ctx = RequestMonitoring::new(
        let ctx = RequestMonitoring::new(
            session_id,
            peer_addr,
            crate::metrics::Protocol::Tcp,
            &config.region,
        );
        let span = ctx.span.clone();
        let span = ctx.span();

        let startup = Box::pin(
            handle_client(
                config,
                &mut ctx,
                &ctx,
                cancellation_handler,
                socket,
                ClientMode::Tcp,
@@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError {

pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
    ctx: &RequestMonitoring,
    cancellation_handler: Arc<CancellationHandlerMain>,
    stream: S,
    mode: ClientMode,
@@ -248,25 +248,25 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    conn_gauge: NumClientConnectionsGuard<'static>,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
    info!(
        protocol = %ctx.protocol,
        protocol = %ctx.protocol(),
        "handling interactive connection from client"
    );

    let metrics = &Metrics::get().proxy;
    let proto = ctx.protocol;
    let proto = ctx.protocol();
    let _request_gauge = metrics.connection_requests.guard(proto);

    let tls = config.tls_config.as_ref();

    let record_handshake_error = !ctx.has_private_peer_addr();
    let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
    let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error);
    let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
    let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
    let (mut stream, params) =
        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
            HandshakeData::Startup(stream, params) => (stream, params),
            HandshakeData::Cancel(cancel_key_data) => {
                return Ok(cancellation_handler
                    .cancel_session(cancel_key_data, ctx.session_id)
                    .cancel_session(cancel_key_data, ctx.session_id())
                    .await
                    .map(|()| None)?)
            }
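The call sites above stop threading `&mut RequestMonitoring` and stop reading public fields such as `ctx.protocol`, `ctx.session_id`, and `ctx.span`; they call accessors on a shared `&RequestMonitoring` instead. One plausible shape for such a wrapper is sketched below, with the `Mutex`-around-`RequestMonitoringInner` layout and the field types assumed for illustration rather than taken from the patch.

```rust
use std::sync::Mutex;

use tracing::Span;
use uuid::Uuid;

/// Mutable per-request state, kept behind a lock so the outer handle can be shared.
struct RequestMonitoringInner {
    session_id: Uuid,
    protocol: &'static str,
    span: Span,
}

pub struct RequestMonitoring(Mutex<RequestMonitoringInner>);

impl RequestMonitoring {
    pub fn session_id(&self) -> Uuid {
        self.0.lock().unwrap().session_id
    }

    pub fn protocol(&self) -> &'static str {
        self.0.lock().unwrap().protocol
    }

    pub fn span(&self) -> Span {
        // Spans are cheap to clone, so handing out a clone avoids holding the lock.
        self.0.lock().unwrap().span.clone()
    }
}
```

With accessors like these, `handle_client` can hold the context by shared reference and still observe up-to-date values.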
@@ -46,7 +46,7 @@ pub trait ConnectMechanism {
    type Error: From<Self::ConnectError>;
    async fn connect_once(
        &self,
        ctx: &mut RequestMonitoring,
        ctx: &RequestMonitoring,
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError>;
@@ -58,7 +58,7 @@ pub trait ConnectMechanism {
pub trait ComputeConnectBackend {
    async fn wake_compute(
        &self,
        ctx: &mut RequestMonitoring,
        ctx: &RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;

    fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
@@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
    async fn connect_once(
        &self,
        ctx: &mut RequestMonitoring,
        ctx: &RequestMonitoring,
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<PostgresConnection, Self::Error> {
@@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
/// Try to connect to the compute node, retrying if necessary.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
    ctx: &mut RequestMonitoring,
    ctx: &RequestMonitoring,
    mechanism: &M,
    user_info: &B,
    allow_self_signed_compute: bool,
@@ -126,7 +126,7 @@ where
    .await
    {
        Ok(res) => {
            ctx.latency_timer.success();
            ctx.success();
            Metrics::get().proxy.retries_metric.observe(
                RetriesMetricGroup {
                    outcome: ConnectOutcome::Success,
@@ -178,7 +178,7 @@ where
    .await
    {
        Ok(res) => {
            ctx.latency_timer.success();
            ctx.success();
            Metrics::get().proxy.retries_metric.observe(
                RetriesMetricGroup {
                    outcome: ConnectOutcome::Success,
@@ -209,9 +209,7 @@ where
        let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
        num_retries += 1;

        let pause = ctx
            .latency_timer
            .pause(crate::metrics::Waiting::RetryTimeout);
        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
        time::sleep(wait_duration).await;
        drop(pause);
    }
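The `connect_to_compute` hunks keep the retry loop's overall shape: try once, record success, otherwise back off and try again, with the latency timer paused around the sleep so backoff is not billed as connect time. A stripped-down sketch of that control flow follows; the backoff constants, error handling, and closure-based API are illustrative, not the proxy's real configuration.

```rust
use std::future::Future;
use std::time::Duration;

const BASE_DELAY: Duration = Duration::from_millis(100);
const MAX_RETRIES: u32 = 5;

/// Simple exponential backoff: 100ms, 200ms, 400ms, ... (illustrative formula).
fn retry_after(num_retries: u32) -> Duration {
    BASE_DELAY * 2u32.saturating_pow(num_retries)
}

async fn connect_with_retries<T, E, F, Fut>(mut connect_once: F) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut num_retries = 0;
    loop {
        match connect_once().await {
            Ok(conn) => return Ok(conn),
            Err(err) if num_retries >= MAX_RETRIES => return Err(err),
            Err(_) => {
                // The real code wraps this sleep in a latency-timer pause so
                // backoff time is not attributed to the compute connection.
                tokio::time::sleep(retry_after(num_retries)).await;
                num_retries += 1;
            }
        }
    }
}
```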
@@ -10,6 +10,7 @@ use tracing::{info, warn};
use crate::{
    auth::endpoint_sni,
    config::{TlsConfig, PG_ALPN_PROTOCOL},
    context::RequestMonitoring,
    error::ReportableError,
    metrics::Metrics,
    proxy::ERR_INSECURE_CONNECTION,
@@ -67,6 +68,7 @@ pub enum HandshakeData<S> {
/// we also take an extra care of propagating only the select handshake errors to client.
#[tracing::instrument(skip_all)]
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    ctx: &RequestMonitoring,
    stream: S,
    mut tls: Option<&TlsConfig>,
    record_handshake_error: bool,
@@ -80,8 +82,6 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    let mut stream = PqStream::new(Stream::from_raw(stream));
    loop {
        let msg = stream.read_startup_packet().await?;
        info!("received {msg:?}");

        use FeStartupPacket::*;
        match msg {
            SslRequest { direct } => match stream.get_ref() {
@@ -145,16 +145,20 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(

                    let conn_info = tls_stream.get_ref().1;

                    // try parse endpoint
                    let ep = conn_info
                        .server_name()
                        .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten());
                    if let Some(ep) = ep {
                        ctx.set_endpoint_id(ep);
                    }

                    // check the ALPN, if exists, as required.
                    match conn_info.alpn_protocol() {
                        None | Some(PG_ALPN_PROTOCOL) => {}
                        Some(other) => {
                            // try parse ep for better error
                            let ep = conn_info.server_name().and_then(|sni| {
                                endpoint_sni(sni, &tls.common_names).ok().flatten()
                            });
                            let alpn = String::from_utf8_lossy(other);
                            warn!(?ep, %alpn, "unexpected ALPN");
                            warn!(%alpn, "unexpected ALPN");
                            return Err(HandshakeError::ProtocolViolation);
                        }
                    }
@@ -198,7 +202,12 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                    .await?;
                }

                info!(?version, session_type = "normal", "successful handshake");
                info!(
                    ?version,
                    ?params,
                    session_type = "normal",
                    "successful handshake"
                );
                break Ok(HandshakeData::Startup(stream, params));
            }
            // downgrade protocol version
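With the context passed into `handshake`, the TLS branch above records the endpoint parsed from SNI on the shared ctx and then validates the negotiated ALPN. The sketch below shows those two checks in isolation, taking the SNI and ALPN values as plain arguments; the helper names, endpoint parsing rule, and protocol constant are placeholders rather than the proxy's real ones.

```rust
/// The ALPN value accepted for the Postgres protocol (placeholder constant).
const PG_ALPN: &[u8] = b"postgresql";

/// Illustrative endpoint extraction: take the first DNS label of the SNI,
/// e.g. "ep-cool-name-123456.region.example.com" -> "ep-cool-name-123456".
fn endpoint_from_sni(sni: &str) -> Option<String> {
    sni.split('.').next().map(str::to_owned)
}

/// Returns the endpoint (if any) when the connection metadata is acceptable,
/// or an error when the client negotiated an unexpected ALPN protocol.
fn check_tls_metadata(
    sni: Option<&str>,
    alpn: Option<&[u8]>,
) -> Result<Option<String>, &'static str> {
    let endpoint = sni.and_then(endpoint_from_sni);
    match alpn {
        // No ALPN at all, or the expected protocol, is fine.
        None | Some(PG_ALPN) => Ok(endpoint),
        Some(_) => Err("unexpected ALPN"),
    }
}
```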
Some files were not shown because too many files have changed in this diff.