Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-24 05:40:36 +00:00)

Compare commits: release-67...disable-ch (1 commit)

Commit 2efad33ab9
@@ -13,7 +13,6 @@
# Directories
!.cargo/
!.config/
!compute/
!compute_tools/
!control_plane/
!libs/
.github/workflows/_benchmarking_preparation.yml (vendored, 46 changed lines)
@@ -3,23 +3,19 @@ name: Prepare benchmarking databases by restoring dumps
on:
  workflow_call:
    # no inputs needed

defaults:
  run:
    shell: bash -euxo pipefail {0}

jobs:
  setup-databases:
    permissions:
      contents: write
      statuses: write
      id-token: write # aws-actions/configure-aws-credentials
    strategy:
      fail-fast: false
      matrix:
        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
        database: [ clickbench, tpch, userexample ]

    env:
      LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
      PLATFORM: ${{ matrix.platform }}

@@ -27,10 +23,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:

@@ -39,13 +32,13 @@ jobs:
        run: |
          case "${PLATFORM}" in
            neon)
              CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
              ;;
            aws-rds-postgres)
              CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
              ;;
            aws-aurora-serverless-v2-postgres)
              CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
              ;;
            *)
              echo >&2 "Unknown PLATFORM=${PLATFORM}"

@@ -53,17 +46,10 @@ jobs:
              ;;
          esac

          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

      - uses: actions/checkout@v4

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: eu-central-1
          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          role-duration-seconds: 18000 # 5 hours

      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:

@@ -71,23 +57,23 @@ jobs:
          path: /tmp/neon/
          prefix: latest

      # we create a table that has one row for each database that we want to restore with the status whether the restore is done
      - name: Create benchmark_restore_status table if it does not exist
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
          DATABASE_NAME: ${{ matrix.database }}
        # to avoid a race condition of multiple jobs trying to create the table at the same time,
        # we use an advisory lock
        run: |
          ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
            SELECT pg_advisory_lock(4711);
            CREATE TABLE IF NOT EXISTS benchmark_restore_status (
              databasename text primary key,
              restore_done boolean
            );
            SELECT pg_advisory_unlock(4711);
          "

      - name: Check if restore is already done
        id: check-restore-done
        env:

@@ -121,7 +107,7 @@ jobs:
          DATABASE_NAME: ${{ matrix.database }}
        run: |
          mkdir -p /tmp/dumps
          aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/

      - name: Replace database name in connection string
        if: steps.check-restore-done.outputs.skip != 'true'

@@ -140,17 +126,17 @@ jobs:
          else
            new_connstr="${base_connstr}/${DATABASE_NAME}"
          fi
          echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT

      - name: Restore dump
        if: steps.check-restore-done.outputs.skip != 'true'
        env:
          DATABASE_NAME: ${{ matrix.database }}
          DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
          # the following works only with larger computes:
          # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
        # we add the || true because:
        # the dumps were created with Neon and contain neon extensions that are not
        # available in RDS, so we will always report an error, but we can ignore it
        run: |
          ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
.github/workflows/_build-and-test-locally.yml (vendored, 14 changed lines)
@@ -236,7 +236,9 @@ jobs:

          # run pageserver tests with different settings
          for io_engine in std-fs tokio-epoll-uring ; do
            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
            for io_buffer_alignment in 0 1 512 ; do
              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
            done
          done

          # Run separate tests for real S3

@@ -255,15 +257,7 @@ jobs:
          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'

      - name: Install postgres binaries
        run: |
          # Use tar to copy files matching the pattern, preserving the paths in the destination
          tar c \
            pg_install/v* \
            pg_install/build/*/src/test/regress/*.so \
            pg_install/build/*/src/test/regress/pg_regress \
            pg_install/build/*/src/test/isolation/isolationtester \
            pg_install/build/*/src/test/isolation/pg_isolation_regress \
            | tar x -C /tmp/neon
        run: cp -a pg_install /tmp/neon/pg_install

      - name: Upload Neon artifact
        uses: ./.github/actions/upload
.github/workflows/benchmarking.yml (vendored, 119 changed lines)
@@ -12,6 +12,7 @@ on:
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
region_id:
|
||||
@@ -58,7 +59,7 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
id-token: write # Required for OIDC authentication in azure runners
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -67,10 +68,12 @@ jobs:
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "azure-staging"
|
||||
region_id: 'azure-eastus2'
|
||||
RUNNER: [ self-hosted, eastus2, x64 ]
|
||||
IMAGE: neondatabase/build-tools:pinned
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
@@ -83,10 +86,7 @@ jobs:
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: ${{ matrix.IMAGE }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -164,10 +164,6 @@ jobs:
|
||||
|
||||
replication-tests:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
@@ -178,21 +174,12 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -280,7 +267,7 @@ jobs:
|
||||
region_id_default=${{ env.DEFAULT_REGION_ID }}
|
||||
runner_default='["self-hosted", "us-east-2", "x64"]'
|
||||
runner_azure='["self-hosted", "eastus2", "x64"]'
|
||||
image_default="neondatabase/build-tools:pinned"
|
||||
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
|
||||
matrix='{
|
||||
"pg_version" : [
|
||||
16
|
||||
@@ -357,7 +344,7 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
id-token: write # Required for OIDC authentication in azure runners
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -384,7 +371,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
- name: Configure AWS credentials # necessary on Azure runners
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
@@ -505,15 +492,17 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
id-token: write # Required for OIDC authentication in azure runners
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neonvm-captest-pgvector"
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
RUNNER: [ self-hosted, eastus2, x64 ]
|
||||
IMAGE: neondatabase/build-tools:pinned
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
@@ -522,16 +511,13 @@ jobs:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
|
||||
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: ${{ matrix.IMAGE }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -541,26 +527,17 @@ jobs:
|
||||
# instead of using Neon artifacts containing pgbench
|
||||
- name: Install postgresql-16 where pytest expects it
|
||||
run: |
|
||||
# Just to make it easier to test things locally on macOS (with arm64)
|
||||
arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
|
||||
|
||||
cd /home/nonroot
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
|
||||
dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
|
||||
dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
|
||||
dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
|
||||
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb
|
||||
dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
|
||||
mkdir -p /tmp/neon/pg_install/v16/bin
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
|
||||
LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
|
||||
export LD_LIBRARY_PATH
|
||||
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
|
||||
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
/tmp/neon/pg_install/v16/bin/pgbench --version
|
||||
/tmp/neon/pg_install/v16/bin/psql --version
|
||||
|
||||
@@ -582,7 +559,7 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Configure AWS credentials
|
||||
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
@@ -643,10 +620,6 @@ jobs:
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
|
||||
|
||||
strategy:
|
||||
@@ -665,22 +638,12 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -751,10 +714,6 @@ jobs:
|
||||
#
|
||||
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
|
||||
|
||||
strategy:
|
||||
@@ -772,22 +731,12 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -857,10 +806,6 @@ jobs:
|
||||
|
||||
user-examples-compare:
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
|
||||
|
||||
strategy:
|
||||
@@ -877,22 +822,12 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
|
||||
.github/workflows/build_and_test.yml (vendored, 71 changed lines)
@@ -602,20 +602,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version:
|
||||
# Much data was already generated on old PG versions with bullseye's
|
||||
# libraries, the locales of which can cause data incompatibilities.
|
||||
# However, new PG versions should check if they can be built on newer
|
||||
# images, as that reduces the support burden of old and ancient
|
||||
# distros.
|
||||
- pg: v14
|
||||
debian: bullseye-slim
|
||||
- pg: v15
|
||||
debian: bullseye-slim
|
||||
- pg: v16
|
||||
debian: bullseye-slim
|
||||
- pg: v17
|
||||
debian: bookworm-slim
|
||||
version: [ v14, v15, v16, v17 ]
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
@@ -658,46 +645,41 @@ jobs:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version.pg }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
DEBIAN_FLAVOR=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: compute/Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build neon extensions test image
|
||||
if: matrix.version.pg == 'v16'
|
||||
if: matrix.version == 'v16'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version.pg }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
DEBIAN_FLAVOR=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: compute/Dockerfile.compute-node
|
||||
file: Dockerfile.compute-node
|
||||
target: neon-pg-ext-test
|
||||
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
|
||||
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
# We pick 16, because that builds on debian 11 with older glibc (and is
|
||||
# thus compatible with newer glibc), rather than 17 on Debian 12, as
|
||||
# that isn't guaranteed to be compatible with Debian 11
|
||||
if: matrix.version.pg == 'v16'
|
||||
if: matrix.version == 'v17'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
target: compute-tools-image
|
||||
@@ -706,11 +688,10 @@ jobs:
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
DEBIAN_FLAVOR=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: compute/Dockerfile.compute-node
|
||||
file: Dockerfile.compute-node
|
||||
tags: |
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||
|
||||
@@ -773,7 +754,7 @@ jobs:
|
||||
matrix:
|
||||
version: [ v14, v15, v16, v17 ]
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.35.0
|
||||
VM_BUILDER_VERSION: v0.29.3
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -798,7 +779,7 @@ jobs:
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-spec=compute/vm-image-spec.yaml \
|
||||
-spec=vm-image-spec.yaml \
|
||||
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
@@ -862,9 +843,6 @@ jobs:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
id-token: write # for `aws-actions/configure-aws-credentials`
|
||||
|
||||
env:
|
||||
VERSIONS: v14 v15 v16 v17
|
||||
|
||||
@@ -909,19 +887,13 @@ jobs:
|
||||
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
|
||||
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Configure AWS-prod credentials
|
||||
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
mask-aws-account-id: true
|
||||
role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}
|
||||
|
||||
- name: Login to prod ECR
|
||||
uses: docker/login-action@v3
|
||||
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
with:
|
||||
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
|
||||
password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
|
||||
|
||||
- name: Copy all images to prod ECR
|
||||
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
@@ -1190,9 +1162,10 @@ jobs:
|
||||
|
||||
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
||||
|
||||
for pg_version in v14 v15 v16 v17; do
|
||||
# TODO Add v17
|
||||
for pg_version in v14 v15 v16; do
|
||||
# We run less tests for debug builds, so we don't need to promote them
|
||||
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then
|
||||
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -1234,6 +1207,8 @@ jobs:
|
||||
# Usually we do `needs: [...]`
|
||||
needs:
|
||||
- build-and-test-locally
|
||||
# XXX: Temporarily disabled, while we investigate an unexpected failure with it
|
||||
#- check-submodules
|
||||
- check-codestyle-python
|
||||
- check-codestyle-rust
|
||||
- promote-images
|
||||
|
||||
.github/workflows/cloud-regress.yml (vendored, 102 changed lines)
@@ -1,102 +0,0 @@
|
||||
name: Cloud Regression Test
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '45 1 * * *' # run once a day, timezone is utc
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
regress:
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
runs-on: us-east-2
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Patch the test
|
||||
run: |
|
||||
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
|
||||
|
||||
- name: Generate a random password
|
||||
id: pwgen
|
||||
run: |
|
||||
set +x
|
||||
DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64)
|
||||
echo "::add-mask::${DBPASS//\//}"
|
||||
echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Change tests according to the generated password
|
||||
env:
|
||||
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
|
||||
run: |
|
||||
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
|
||||
for fname in sql/*.sql expected/*.out; do
|
||||
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
|
||||
done
|
||||
for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do
|
||||
USER=$(echo "${ph}" | cut -c 22-)
|
||||
MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}')
|
||||
sed -i.bak "s/${ph}/${MD5}/" expected/password.out
|
||||
done
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run the regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: cloud_regress
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
extra_params: -m remote_cluster
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # on-call-staging-stream
|
||||
slack-message: |
|
||||
Periodic pg_regress on staging: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
.github/workflows/trigger-e2e-tests.yml (vendored, 4 changed lines)
@@ -102,12 +102,12 @@ jobs:
          # Default set of platforms to run e2e tests on
          platforms='["docker", "k8s"]'

          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
                vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
Cargo.lock (generated, 300 changed lines)
@@ -90,9 +90,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.8"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
|
||||
checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
@@ -269,9 +269,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "aws-config"
|
||||
version = "1.5.1"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2ac9889352d632214df943e26740c46a0f3da6e329fbd28164fe7ae1b061da7b"
|
||||
checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -300,9 +300,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-credential-types"
|
||||
version = "1.2.1"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da"
|
||||
checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -312,16 +312,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-runtime"
|
||||
version = "1.4.3"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468"
|
||||
checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-sigv4",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"aws-types",
|
||||
@@ -329,7 +328,6 @@ dependencies = [
|
||||
"fastrand 2.0.0",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"tracing",
|
||||
@@ -338,9 +336,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-iam"
|
||||
version = "1.46.0"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "053df3024ea2ed0431359b3cddecc92dcfadeaedf71dd497292b39e37e597b46"
|
||||
checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -361,9 +359,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-s3"
|
||||
version = "1.52.0"
|
||||
version = "1.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f571deb0a80c20d21d9f3e8418c1712af9ff4bf399d057e5549a934eca4844e2"
|
||||
checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"aws-credential-types",
|
||||
@@ -396,9 +394,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sso"
|
||||
version = "1.30.0"
|
||||
version = "1.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebb97e44983752cf7e12968c5f569a5d7562dbbc67006755c331d9d9c99580ae"
|
||||
checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -418,9 +416,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-ssooidc"
|
||||
version = "1.31.0"
|
||||
version = "1.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad061d977235898e4a97ecbd5d882786cca41b4828943584dc792dcc35eb3d3c"
|
||||
checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -440,9 +438,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sts"
|
||||
version = "1.30.0"
|
||||
version = "1.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "300ce43d1f7f4eb023e57d38b0921d964e8e62bed7f82f6b7849e7eab7a14575"
|
||||
checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
@@ -463,9 +461,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sigv4"
|
||||
version = "1.2.4"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68"
|
||||
checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-eventstream",
|
||||
@@ -503,9 +501,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-checksums"
|
||||
version = "0.60.12"
|
||||
version = "0.60.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "598b1689d001c4d4dc3cb386adb07d37786783aee3ac4b324bcadac116bf3d23"
|
||||
checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f"
|
||||
dependencies = [
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-types",
|
||||
@@ -524,9 +522,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-eventstream"
|
||||
version = "0.60.5"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90"
|
||||
checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
@@ -535,9 +533,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-http"
|
||||
version = "0.60.11"
|
||||
version = "0.60.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6"
|
||||
checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08"
|
||||
dependencies = [
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -575,9 +573,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime"
|
||||
version = "1.7.1"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87"
|
||||
checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
@@ -589,7 +587,6 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body 1.0.0",
|
||||
"httparse",
|
||||
"hyper 0.14.30",
|
||||
"hyper-rustls 0.24.0",
|
||||
"once_cell",
|
||||
@@ -602,9 +599,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime-api"
|
||||
version = "1.7.2"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96"
|
||||
checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-types",
|
||||
@@ -619,9 +616,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-types"
|
||||
version = "1.2.7"
|
||||
version = "1.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b"
|
||||
checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1"
|
||||
dependencies = [
|
||||
"base64-simd",
|
||||
"bytes",
|
||||
@@ -645,23 +642,24 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-xml"
|
||||
version = "0.60.9"
|
||||
version = "0.60.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc"
|
||||
checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55"
|
||||
dependencies = [
|
||||
"xmlparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-types"
|
||||
version = "1.3.3"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef"
|
||||
checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"http 0.2.9",
|
||||
"rustc_version",
|
||||
"tracing",
|
||||
]
|
||||
@@ -1225,7 +1223,6 @@ dependencies = [
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"postgres",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -1324,7 +1321,7 @@ dependencies = [
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"futures",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
@@ -1879,9 +1876,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.2"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580"
|
||||
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
|
||||
dependencies = [
|
||||
"humantime",
|
||||
"is-terminal",
|
||||
@@ -3371,82 +3368,102 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.24.0"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96"
|
||||
checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.13.0"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab"
|
||||
checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"opentelemetry",
|
||||
"reqwest 0.12.4",
|
||||
"http 0.2.9",
|
||||
"opentelemetry_api",
|
||||
"reqwest 0.11.19",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.17.0"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727"
|
||||
checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-core",
|
||||
"http 1.1.0",
|
||||
"opentelemetry",
|
||||
"http 0.2.9",
|
||||
"opentelemetry-http",
|
||||
"opentelemetry-proto",
|
||||
"opentelemetry-semantic-conventions",
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
"prost 0.13.3",
|
||||
"reqwest 0.12.4",
|
||||
"prost",
|
||||
"reqwest 0.11.19",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tonic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.7.0"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9"
|
||||
checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
"prost 0.13.3",
|
||||
"tonic 0.12.2",
|
||||
"prost",
|
||||
"tonic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.16.0"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05"
|
||||
checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_api"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"indexmap 1.9.3",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.24.1"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df"
|
||||
checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"crossbeam-channel",
|
||||
"futures-channel",
|
||||
"futures-executor",
|
||||
"futures-util",
|
||||
"glob",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"opentelemetry_api",
|
||||
"ordered-float 3.9.2",
|
||||
"percent-encoding",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
@@ -3462,6 +3479,15 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-float"
|
||||
version = "3.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.3"
|
||||
@@ -3552,6 +3578,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"camino",
|
||||
"clap",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
@@ -3590,6 +3617,7 @@ dependencies = [
|
||||
"enumset",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
@@ -3709,6 +3737,7 @@ dependencies = [
|
||||
"clap",
|
||||
"criterion",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex-literal",
|
||||
"itertools 0.10.5",
|
||||
"once_cell",
|
||||
@@ -4204,17 +4233,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.11.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.13.3",
|
||||
"prost-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4231,7 +4250,7 @@ dependencies = [
|
||||
"multimap",
|
||||
"petgraph",
|
||||
"prettyplease 0.1.25",
|
||||
"prost 0.11.9",
|
||||
"prost",
|
||||
"prost-types",
|
||||
"regex",
|
||||
"syn 1.0.109",
|
||||
@@ -4252,26 +4271,13 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.12.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.11.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13"
|
||||
dependencies = [
|
||||
"prost 0.11.9",
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4294,7 +4300,6 @@ dependencies = [
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"dashmap",
|
||||
"ecdsa 0.16.9",
|
||||
@@ -4302,6 +4307,7 @@ dependencies = [
|
||||
"fallible-iterator",
|
||||
"framed-websockets",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.14.5",
|
||||
"hashlink",
|
||||
"hex",
|
||||
@@ -4368,6 +4374,7 @@ dependencies = [
|
||||
"tokio-tungstenite",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"try-lock",
|
||||
@@ -4812,9 +4819,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.5.3"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45"
|
||||
checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -5132,6 +5139,7 @@ dependencies = [
|
||||
"desim",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
@@ -5694,14 +5702,15 @@ dependencies = [
|
||||
"futures",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"prost 0.11.9",
|
||||
"prost",
|
||||
"tokio",
|
||||
"tonic 0.9.2",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
"utils",
|
||||
@@ -5721,6 +5730,7 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
@@ -5773,6 +5783,7 @@ dependencies = [
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"itertools 0.10.5",
|
||||
@@ -6025,7 +6036,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"integer-encoding",
|
||||
"ordered-float",
|
||||
"ordered-float 2.10.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6127,9 +6138,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.38.1"
|
||||
version = "1.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df"
|
||||
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
@@ -6171,9 +6182,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-macros"
|
||||
version = "2.3.0"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
|
||||
checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -6348,7 +6359,7 @@ dependencies = [
|
||||
"hyper-timeout",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.11.9",
|
||||
"prost",
|
||||
"rustls-native-certs 0.6.2",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"tokio",
|
||||
@@ -6360,27 +6371,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c6f6ba989e4b2c58ae83d862d3a3e27690b6e3ae630d0deb59f3697f32aa88ad"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.13.3",
|
||||
"tokio-stream",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.9.2"
|
||||
@@ -6428,10 +6418,11 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.40"
|
||||
version = "0.1.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
|
||||
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"log",
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
@@ -6451,9 +6442,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.27"
|
||||
version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
|
||||
checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -6462,9 +6453,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.32"
|
||||
version = "0.1.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
||||
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
@@ -6482,22 +6473,21 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.2.0"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
|
||||
checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"once_cell",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.25.0"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b"
|
||||
checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
@@ -6506,7 +6496,6 @@ dependencies = [
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6521,9 +6510,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.18"
|
||||
version = "0.3.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"once_cell",
|
||||
@@ -6547,7 +6536,6 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry-semantic-conventions",
|
||||
"opentelemetry_sdk",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
@@ -6727,7 +6715,6 @@ dependencies = [
|
||||
"criterion",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
@@ -7003,16 +6990,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.25.2"
|
||||
@@ -7282,6 +7259,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"crossbeam-utils",
|
||||
"crypto-bigint 0.5.5",
|
||||
"der 0.7.8",
|
||||
"deranged",
|
||||
@@ -7314,12 +7292,13 @@ dependencies = [
|
||||
"once_cell",
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
"prost 0.11.9",
|
||||
"prost",
|
||||
"quote",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"regex-automata 0.4.3",
|
||||
"regex-syntax 0.8.2",
|
||||
"reqwest 0.11.19",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.21.11",
|
||||
"scopeguard",
|
||||
@@ -7340,9 +7319,12 @@ dependencies = [
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"zeroize",
|
||||
|
||||
Cargo.toml
@@ -53,14 +53,14 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.5", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.52"
aws-sdk-iam = "1.46.0"
aws-config = { version = "1.3", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.26"
aws-sdk-iam = "1.15.0"
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.2"
aws-smithy-types = "1.1.9"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
aws-types = "1.2.0"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -116,10 +116,9 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.24"
opentelemetry_sdk = "0.24"
opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.16"
opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -132,7 +131,7 @@ rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
reqwest-middleware = "0.3.0"
reqwest-retry = "0.5"
routerify = "3"
@@ -178,8 +177,8 @@ toml_edit = "0.22"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2"
tracing-opentelemetry = "0.25"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }
@@ -13,9 +13,6 @@ RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]

# System deps
#
# 'gdb' is included so that we get backtraces of core dumps produced in
# regression tests
RUN set -e \
    && apt update \
    && apt install -y \
@@ -27,7 +24,6 @@ RUN set -e \
    cmake \
    curl \
    flex \
    gdb \
    git \
    gnupg \
    gzip \
@@ -3,34 +3,17 @@ ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_FLAVOR=bullseye-slim

#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:$DEBIAN_FLAVOR AS build-deps
ARG DEBIAN_FLAVOR

RUN case $DEBIAN_FLAVOR in \
    # Version-specific installs for Bullseye (PG14-PG16):
    # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
    # Install newer version (3.25) from backports.
    bullseye*) \
        echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
        VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \
    ;; \
    # Version-specific installs for Bookworm (PG17):
    bookworm*) \
        VERSION_INSTALLS="cmake"; \
    ;; \
    esac && \
    apt update && \
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
    $VERSION_INSTALLS
    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd

#########################################################################################
#
@@ -104,7 +87,7 @@ FROM build-deps AS postgis-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
    apt install -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
    apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
    libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
    libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
    protobuf-c-compiler xsltproc
@@ -215,6 +198,27 @@ FROM build-deps AS h3-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    case "$(uname -m)" in \
    "x86_64") \
        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
    ;; \
    "aarch64") \
        export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
    ;; \
    *) \
        echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
    ;; \
    esac && \
    wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
        -q -O /tmp/cmake-install.sh \
    && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
    && chmod u+x /tmp/cmake-install.sh \
    && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
    && rm /tmp/cmake-install.sh

RUN case "${PG_VERSION}" in "v17") \
    mkdir -p /h3/usr/ && \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -276,7 +280,7 @@ FROM build-deps AS vector-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

COPY compute/patches/pgvector.patch /pgvector.patch
COPY patches/pgvector.patch /pgvector.patch

# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
@@ -362,7 +366,7 @@ FROM build-deps AS rum-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

COPY compute/patches/rum.patch /rum.patch
COPY patches/rum.patch /rum.patch

RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -500,6 +504,8 @@ RUN case "${PG_VERSION}" in "v17") \
        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
    ;; \
    esac && \
    apt-get update && \
    apt-get install -y cmake && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
@@ -588,6 +594,7 @@ RUN case "${PG_VERSION}" in "v17") \
    esac && \
    apt-get update && \
    apt-get install -y \
    cmake \
    libboost-iostreams1.74-dev \
    libboost-regex1.74-dev \
    libboost-serialization1.74-dev \
@@ -752,7 +759,7 @@ ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

RUN apt-get update && \
    apt-get install -y curl libclang-dev && \
    apt-get install -y curl libclang-dev cmake && \
    useradd -ms /bin/bash nonroot -b /home

ENV HOME=/home/nonroot
@@ -862,28 +869,6 @@ RUN case "${PG_VERSION}" in "v17") \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

#########################################################################################
#
# Layer "pg-session-jwt-build"
# Compile "pg_session_jwt" extension
#
#########################################################################################

FROM rust-extensions-build AS pg-session-jwt-build
ARG PG_VERSION

RUN case "${PG_VERSION}" in "v17") \
    echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
    esac && \
    wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \
    echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release
    # it's needed to enable extension because it uses untrusted C language
    # sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_session_jwt.control && \
    # echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_session_jwt.control

#########################################################################################
#
# Layer "wal2json-build"
@@ -980,7 +965,6 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1043,47 +1027,10 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
#
#########################################################################################

FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
ARG DEBIAN_FLAVOR
FROM debian:bullseye-slim AS compute-tools-image

COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

#########################################################################################
#
# Layer "pgbouncer"
#
#########################################################################################

FROM debian:$DEBIAN_FLAVOR AS pgbouncer
ARG DEBIAN_FLAVOR
RUN set -e \
    && apt-get update \
    && apt-get install -y \
        build-essential \
        git \
        libevent-dev \
        libtool \
        pkg-config

# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
ENV PGBOUNCER_TAG=pgbouncer_1_22_1
RUN set -e \
    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
    && cd pgbouncer \
    && ./autogen.sh \
    && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
    && make -j $(nproc) dist_man_MANS= \
    && make install dist_man_MANS=

#########################################################################################
#
# Layers "postgres-exporter" and "sql-exporter"
#
#########################################################################################

FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter

#########################################################################################
#
# Clean up postgres folder before inclusion
@@ -1131,7 +1078,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY compute/patches/rum.patch /ext-src
COPY patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -1139,9 +1086,9 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY compute/patches/pg_hint_plan.patch /ext-src
COPY patches/pg_hint_plan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY compute/patches/pg_cron.patch /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
@@ -1150,7 +1097,7 @@ COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
COPY compute/patches/pg_anon.patch /ext-src
COPY patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN case "${PG_VERSION}" in "v17") \
@@ -1168,6 +1115,11 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
# cmake is required for the h3 test
RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    apt-get update && apt-get install -y cmake
RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
@@ -1192,8 +1144,7 @@ ENV PGDATABASE=postgres
# Put it all together into the final image
#
#########################################################################################
FROM debian:$DEBIAN_FLAVOR
ARG DEBIAN_FLAVOR
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
@@ -1209,50 +1160,23 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

# pgbouncer and its config
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini

# Metrics exporter binaries and configuration files
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter

COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml

# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions

# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libsfcgal1, and libprotobuf-c1 for PostGIS
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl

RUN apt update && \
    case $DEBIAN_FLAVOR in \
    # Version-specific installs for Bullseye (PG14-PG16):
    # libicu67, locales for collations (including ICU and plpgsql_check)
    # libgdal28, libproj19 for PostGIS
    bullseye*) \
        VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \
    ;; \
    # Version-specific installs for Bookworm (PG17):
    # libicu72, locales for collations (including ICU and plpgsql_check)
    # libgdal32, libproj25 for PostGIS
    bookworm*) \
        VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \
    ;; \
    esac && \
RUN apt update && \
    apt install --no-install-recommends -y \
        gdb \
        libicu67 \
        liblz4-1 \
        libreadline8 \
        libboost-iostreams1.74.0 \
@@ -1261,16 +1185,17 @@ RUN apt update && \
        libboost-system1.74.0 \
        libossp-uuid16 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
        libprotobuf-c1 \
        libsfcgal1 \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
        libcurl4 \
        libcurl4-openssl-dev \
        locales \
        procps \
        ca-certificates \
        $VERSION_INSTALLS && \
        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1,21 +0,0 @@
This directory contains files that are needed to build the compute
images, or included in the compute images.

Dockerfile.compute-node
    To build the compute image

vm-image-spec.yaml
    Instructions for vm-builder, to turn the compute-node image into
    corresponding vm-compute-node image.

etc/
    Configuration files included in /etc in the compute image

patches/
    Some extensions need to be patched to work with Neon. This
    directory contains such patches. They are applied to the extension
    sources in Dockerfile.compute-node

In addition to these, postgres itself, the neon postgres extension,
and compute_ctl are built and copied into the compute image by
Dockerfile.compute-node.
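[Editor's note: for orientation only, a minimal sketch of how the compute image described in the README above might be built locally. It assumes a standard Docker CLI and reuses the build args declared at the top of Dockerfile.compute-node (REPOSITORY, IMAGE, TAG, PG_VERSION); the PG_VERSION value and the output tag name are hypothetical, not taken from this commit.]

    # hypothetical local build of the compute image
    docker build \
        --build-arg REPOSITORY=neondatabase \
        --build-arg IMAGE=build-tools \
        --build-arg TAG=pinned \
        --build-arg PG_VERSION=v16 \
        -f Dockerfile.compute-node \
        -t compute-node:local .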
@@ -1,331 +0,0 @@
collector_name: neon_collector
metrics:
- metric_name: lfc_misses
  type: gauge
  help: 'lfc_misses'
  key_labels:
  values: [lfc_misses]
  query: |
    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';

- metric_name: lfc_used
  type: gauge
  help: 'LFC chunks used (chunk = 1MB)'
  key_labels:
  values: [lfc_used]
  query: |
    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';

- metric_name: lfc_hits
  type: gauge
  help: 'lfc_hits'
  key_labels:
  values: [lfc_hits]
  query: |
    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';

- metric_name: lfc_writes
  type: gauge
  help: 'lfc_writes'
  key_labels:
  values: [lfc_writes]
  query: |
    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';

- metric_name: lfc_cache_size_limit
  type: gauge
  help: 'LFC cache size limit in bytes'
  key_labels:
  values: [lfc_cache_size_limit]
  query: |
    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;

- metric_name: connection_counts
  type: gauge
  help: 'Connection counts'
  key_labels:
    - datname
    - state
  values: [count]
  query: |
    select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;

- metric_name: pg_stats_userdb
  type: gauge
  help: 'Stats for several oldest non-system dbs'
  key_labels:
    - datname
  value_label: kind
  values:
    - db_size
    - deadlocks
    # Rows
    - inserted
    - updated
    - deleted
  # We export stats for 10 non-system database. Without this limit
  # it is too easy to abuse the system by creating lots of databases.
  query: |
    select pg_database_size(datname) as db_size, deadlocks,
           tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
           datname
    from pg_stat_database
    where datname IN (
      select datname
      from pg_database
      where datname <> 'postgres' and not datistemplate
      order by oid
      limit 10
    );

- metric_name: max_cluster_size
  type: gauge
  help: 'neon.max_cluster_size setting'
  key_labels:
  values: [max_cluster_size]
  query: |
    select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';

- metric_name: db_total_size
  type: gauge
  help: 'Size of all databases'
  key_labels:
  values: [total]
  query: |
    select sum(pg_database_size(datname)) as total from pg_database;

- metric_name: getpage_wait_seconds_count
  type: counter
  help: 'Number of getpage requests'
  values: [getpage_wait_seconds_count]
  query_ref: neon_perf_counters

- metric_name: getpage_wait_seconds_sum
  type: counter
  help: 'Time spent in getpage requests'
  values: [getpage_wait_seconds_sum]
  query_ref: neon_perf_counters

- metric_name: getpage_prefetch_requests_total
  type: counter
  help: 'Number of getpage issued for prefetching'
  values: [getpage_prefetch_requests_total]
  query_ref: neon_perf_counters

- metric_name: getpage_sync_requests_total
  type: counter
  help: 'Number of synchronous getpage issued'
  values: [getpage_sync_requests_total]
  query_ref: neon_perf_counters

- metric_name: getpage_prefetch_misses_total
  type: counter
  help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
  values: [getpage_prefetch_misses_total]
  query_ref: neon_perf_counters

- metric_name: getpage_prefetch_discards_total
  type: counter
  help: 'Number of prefetch responses issued but not used'
  values: [getpage_prefetch_discards_total]
  query_ref: neon_perf_counters

- metric_name: pageserver_requests_sent_total
  type: counter
  help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
  values: [pageserver_requests_sent_total]
  query_ref: neon_perf_counters

- metric_name: pageserver_disconnects_total
  type: counter
  help: 'Number of times that the connection to the pageserver was lost'
  values: [pageserver_disconnects_total]
  query_ref: neon_perf_counters

- metric_name: pageserver_send_flushes_total
  type: counter
  help: 'Number of flushes to the pageserver connection'
  values: [pageserver_send_flushes_total]
  query_ref: neon_perf_counters

- metric_name: getpage_wait_seconds_buckets
  type: counter
  help: 'Histogram buckets of getpage request latency'
  key_labels:
    - bucket_le
  values: [value]
  query_ref: getpage_wait_seconds_buckets

# DEPRECATED
- metric_name: lfc_approximate_working_set_size
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels:
  values: [approximate_working_set_size]
  query: |
    select neon.approximate_working_set_size(false) as approximate_working_set_size;

- metric_name: lfc_approximate_working_set_size_windows
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels: [duration]
  values: [size]
  # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
  # of durations in a pretty-printed form.
  query: |
    select
      x as duration,
      neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
    from
      (values ('5m'),('15m'),('1h')) as t (x);

- metric_name: compute_current_lsn
  type: gauge
  help: 'Current LSN of the database'
  key_labels:
  values: [lsn]
  query: |
    select
      case
        when pg_catalog.pg_is_in_recovery()
        then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
        else (pg_current_wal_lsn() - '0/0')::FLOAT8
      end as lsn;

- metric_name: compute_receive_lsn
  type: gauge
  help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
  key_labels:
  values: [lsn]
  query: |
    SELECT
      CASE
        WHEN pg_catalog.pg_is_in_recovery()
        THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
        ELSE 0
      END AS lsn;

- metric_name: replication_delay_bytes
  type: gauge
  help: 'Bytes between received and replayed LSN'
  key_labels:
  values: [replication_delay_bytes]
  # We use a GREATEST call here because this calculation can be negative.
  # The calculation is not atomic, meaning after we've gotten the receive
  # LSN, the replay LSN may have advanced past the receive LSN we
  # are using for the calculation.
  query: |
    SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;

- metric_name: replication_delay_seconds
  type: gauge
  help: 'Time since last LSN was replayed'
  key_labels:
  values: [replication_delay_seconds]
  query: |
    SELECT
      CASE
        WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
        ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
      END AS replication_delay_seconds;

- metric_name: checkpoints_req
  type: gauge
  help: 'Number of requested checkpoints'
  key_labels:
  values: [checkpoints_req]
  query: |
    SELECT checkpoints_req FROM pg_stat_bgwriter;

- metric_name: checkpoints_timed
  type: gauge
  help: 'Number of scheduled checkpoints'
  key_labels:
  values: [checkpoints_timed]
  query: |
    SELECT checkpoints_timed FROM pg_stat_bgwriter;

- metric_name: compute_logical_snapshot_files
  type: gauge
  help: 'Number of snapshot files in pg_logical/snapshot'
  key_labels:
    - timeline_id
  values: [num_logical_snapshot_files]
  query: |
    SELECT
      (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
      -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
      -- temporary snapshot files are renamed to the actual snapshot files after they are
      -- completely built. We only WAL-log the completely built snapshot files.
      (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;

# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.

# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
- metric_name: logical_slot_restart_lsn
  type: gauge
  help: 'restart_lsn of logical slots'
  key_labels:
    - slot_name
  values: [restart_lsn]
  query: |
    select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
    from pg_replication_slots
    where slot_type = 'logical';

- metric_name: compute_subscriptions_count
  type: gauge
  help: 'Number of logical replication subscriptions grouped by enabled/disabled'
  key_labels:
    - enabled
  values: [subscriptions_count]
  query: |
    select subenabled::text as enabled, count(*) as subscriptions_count
    from pg_subscription
    group by subenabled;

- metric_name: retained_wal
  type: gauge
  help: 'Retained WAL in inactive replication slots'
  key_labels:
    - slot_name
  values: [retained_wal]
  query: |
    SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
    FROM pg_replication_slots
    WHERE active = false;

- metric_name: wal_is_lost
  type: gauge
  help: 'Whether or not the replication slot wal_status is lost'
  key_labels:
    - slot_name
  values: [wal_is_lost]
  query: |
    SELECT slot_name,
           CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
    FROM pg_replication_slots;

queries:
- query_name: neon_perf_counters
  query: |
    WITH c AS (
      SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
    )
    SELECT d.*
    FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
      getpage_wait_seconds_count numeric,
      getpage_wait_seconds_sum numeric,
      getpage_prefetch_requests_total numeric,
      getpage_sync_requests_total numeric,
      getpage_prefetch_misses_total numeric,
      getpage_prefetch_discards_total numeric,
      pageserver_requests_sent_total numeric,
      pageserver_disconnects_total numeric,
      pageserver_send_flushes_total numeric
    );

- query_name: getpage_wait_seconds_buckets
  query: |
    SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
@@ -1,55 +0,0 @@
collector_name: neon_collector_autoscaling
metrics:
- metric_name: lfc_misses
  type: gauge
  help: 'lfc_misses'
  key_labels:
  values: [lfc_misses]
  query: |
    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';

- metric_name: lfc_used
  type: gauge
  help: 'LFC chunks used (chunk = 1MB)'
  key_labels:
  values: [lfc_used]
  query: |
    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';

- metric_name: lfc_hits
  type: gauge
  help: 'lfc_hits'
  key_labels:
  values: [lfc_hits]
  query: |
    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';

- metric_name: lfc_writes
  type: gauge
  help: 'lfc_writes'
  key_labels:
  values: [lfc_writes]
  query: |
    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';

- metric_name: lfc_cache_size_limit
  type: gauge
  help: 'LFC cache size limit in bytes'
  key_labels:
  values: [lfc_cache_size_limit]
  query: |
    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;

- metric_name: lfc_approximate_working_set_size_windows
  type: gauge
  help: 'Approximate working set size in pages of 8192 bytes'
  key_labels: [duration_seconds]
  values: [size]
  # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
  # size looking back 1..60 minutes, labeled with the number of minutes.
  query: |
    select
      x::text as duration_seconds,
      neon.approximate_working_set_size_seconds(x) as size
    from
      (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
@@ -1,17 +0,0 @@
[databases]
*=host=localhost port=5432 auth_user=cloud_admin
[pgbouncer]
listen_port=6432
listen_addr=0.0.0.0
auth_type=scram-sha-256
auth_user=cloud_admin
auth_dbname=postgres
client_tls_sslmode=disable
server_tls_sslmode=disable
pool_mode=transaction
max_client_conn=10000
default_pool_size=64
max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
@@ -1,33 +0,0 @@
# Configuration for sql_exporter
# Global defaults.
global:
  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
  scrape_timeout: 10s
  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
  scrape_timeout_offset: 500ms
  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
  min_interval: 0s
  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
  # as will concurrent scrapes.
  max_connections: 1
  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
  # always be the same as max_connections.
  max_idle_connections: 1
  # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
  # If 0, connections are not closed due to a connection's age.
  max_connection_lifetime: 5m

# The target to monitor and the collectors to execute on it.
target:
  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
  # the schema gets dropped or replaced to match the driver expected DSN format.
  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'

  # Collectors (referenced by name) to execute on the target.
  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
  collectors: [neon_collector]

# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
  - "neon_collector.yml"
@@ -1,33 +0,0 @@
# Configuration for sql_exporter for autoscaling-agent
# Global defaults.
global:
  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
  scrape_timeout: 10s
  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
  scrape_timeout_offset: 500ms
  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
  min_interval: 0s
  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
  # as will concurrent scrapes.
  max_connections: 1
  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
  # always be the same as max_connections.
  max_idle_connections: 1
  # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
  # If 0, connections are not closed due to a connection's age.
  max_connection_lifetime: 5m

# The target to monitor and the collectors to execute on it.
target:
  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
  # the schema gets dropped or replaced to match the driver expected DSN format.
  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'

  # Collectors (referenced by name) to execute on the target.
  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
  collectors: [neon_collector_autoscaling]

# Collector files specifies a list of globs. One collector definition is read from each matching file.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
collector_files:
  - "neon_collector_autoscaling.yml"
File diff suppressed because it is too large
@@ -1,117 +0,0 @@
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
---
commands:
  - name: cgconfigparser
    user: root
    sysvInitAction: sysinit
    shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
  # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
  # running it as root.
  - name: chmod-resize-swap
    user: root
    sysvInitAction: sysinit
    shell: 'chmod 711 /neonvm/bin/resize-swap'
  - name: chmod-set-disk-quota
    user: root
    sysvInitAction: sysinit
    shell: 'chmod 711 /neonvm/bin/set-disk-quota'
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
  - name: sql-exporter-autoscaling
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
shutdownHook: |
  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
  - filename: compute_ctl-sudoers
    content: |
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
      group neon-postgres {
          perm {
              admin {
                  uid = postgres;
              }
              task {
                  gid = users;
              }
          }
          memory {}
      }
build: |
  # Build cgroup-tools
  #
  # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
  # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
  # requires cgroup v2, so we'll build cgroup-tools ourselves.
  FROM debian:bullseye-slim as libcgroup-builder
  ENV LIBCGROUP_VERSION=v2.0.3

  RUN set -exu \
      && apt update \
      && apt install --no-install-recommends -y \
          git \
          ca-certificates \
          automake \
          cmake \
          make \
          gcc \
          byacc \
          flex \
          libtool \
          libpam0g-dev \
      && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
      && INSTALL_DIR="/libcgroup-install" \
      && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
      && cd libcgroup \
      # extracted from bootstrap.sh, with modified flags:
      && (test -d m4 || mkdir m4) \
      && autoreconf -fi \
      && rm -rf autom4te.cache \
      && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
      # actually build the thing...
      && make install
merge: |
  # tweak nofile limits
  RUN set -e \
      && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
      && test ! -e /etc/security || ( \
          echo '* - nofile 1048576' >>/etc/security/limits.conf \
          && echo 'root - nofile 1048576' >>/etc/security/limits.conf \
      )

  # Allow postgres user (compute_ctl) to run swap resizer.
  # Need to install sudo in order to allow this.
  #
  # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
  RUN set -e \
      && apt update \
      && apt install --no-install-recommends -y \
          sudo \
      && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
  COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers

  COPY cgconfig.conf /etc/cgconfig.conf

  RUN set -e \
      && chmod 0644 /etc/cgconfig.conf

  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
@@ -21,7 +21,6 @@ nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
postgres.workspace = true
regex.workspace = true
serde_json.workspace = true
@@ -44,7 +44,6 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use compute_tools::disk_quota::set_disk_quota;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
@@ -152,7 +151,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");

    Ok(ProcessCliResult {
        connstr,
@@ -163,7 +161,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
        spec_json,
        spec_path,
        resize_swap_on_bind,
        set_disk_quota_for_fs,
    })
}

@@ -176,7 +173,6 @@ struct ProcessCliResult<'clap> {
    spec_json: Option<&'clap String>,
    spec_path: Option<&'clap String>,
    resize_swap_on_bind: bool,
    set_disk_quota_for_fs: Option<&'clap String>,
}

fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -218,7 +214,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    }
    if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry_sdk::propagation::TraceContextPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
            .extract(&startup_tracing_carrier)
            .attach();
@@ -297,7 +293,6 @@ fn wait_spec(
        pgbin,
        ext_remote_storage,
        resize_swap_on_bind,
        set_disk_quota_for_fs,
        http_port,
        ..
    }: ProcessCliResult,
@@ -378,7 +373,6 @@ fn wait_spec(
        compute,
        http_port,
        resize_swap_on_bind,
        set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
    })
}

@@ -387,7 +381,6 @@ struct WaitSpecResult {
    // passed through from ProcessCliResult
    http_port: u16,
    resize_swap_on_bind: bool,
    set_disk_quota_for_fs: Option<String>,
}

fn start_postgres(
@@ -397,7 +390,6 @@ fn start_postgres(
        compute,
        http_port,
        resize_swap_on_bind,
        set_disk_quota_for_fs,
    }: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
@@ -411,7 +403,6 @@ fn start_postgres(
    );
    // before we release the mutex, fetch the swap size (if any) for later.
    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
    let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
    drop(state);

    // Launch remaining service threads
@@ -431,8 +422,8 @@ fn start_postgres(
        // OOM-killed during startup because swap wasn't available yet.
        match resize_swap(size_bytes) {
            Ok(()) => {
                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
                info!(%size_bytes, %size_mib, "resized swap");
                let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
                info!(%size_bytes, %size_gib, "resized swap");
            }
            Err(err) => {
                let err = err.context("failed to resize swap");
@@ -441,29 +432,10 @@ fn start_postgres(
                // Mark compute startup as failed; don't try to start postgres, and report this
                // error to the control plane when it next asks.
                prestartup_failed = true;
                compute.set_failed_status(err);
                delay_exit = true;
            }
        }
    }

    // Set disk quota if the compute spec says so
    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
        (disk_quota_bytes, set_disk_quota_for_fs)
    {
        match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
            Ok(()) => {
                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
                info!(%disk_quota_bytes, %size_mib, "set disk quota");
            }
            Err(err) => {
                let err = err.context("failed to set disk quota");
                error!("{err:#}");

                // Mark compute startup as failed; don't try to start postgres, and report this
                // error to the control plane when it next asks.
                prestartup_failed = true;
                compute.set_failed_status(err);
                let mut state = compute.state.lock().unwrap();
                state.error = Some(format!("{err:?}"));
                state.status = ComputeStatus::Failed;
                compute.state_changed.notify_all();
                delay_exit = true;
            }
        }
@@ -478,7 +450,16 @@ fn start_postgres(
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:#}", err);
            compute.set_failed_status(err);
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
            // Notify others that Postgres failed to start. In case of configuring the
            // empty compute, it's likely that API handler is still waiting for compute
            // state change. With this we will notify it that compute is in Failed state,
            // so control plane will know about it earlier and record proper error instead
            // of timeout.
            compute.state_changed.notify_all();
            drop(state); // unlock
            delay_exit = true;
            None
        }
@@ -769,11 +750,6 @@ fn cli() -> clap::Command {
                .long("resize-swap-on-bind")
                .action(clap::ArgAction::SetTrue),
        )
        .arg(
            Arg::new("set-disk-quota-for-fs")
                .long("set-disk-quota-for-fs")
                .value_name("SET_DISK_QUOTA_FOR_FS")
        )
}

/// When compute_ctl is killed, send also termination signal to sync-safekeepers
@@ -10,7 +10,6 @@ use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::{Condvar, Mutex, RwLock};
use std::thread;
use std::time::Duration;
use std::time::Instant;

use anyhow::{Context, Result};
@@ -306,13 +305,6 @@ impl ComputeNode {
        self.state_changed.notify_all();
    }

    pub fn set_failed_status(&self, err: anyhow::Error) {
        let mut state = self.state.lock().unwrap();
        state.error = Some(format!("{err:?}"));
        state.status = ComputeStatus::Failed;
        self.state_changed.notify_all();
    }

    pub fn get_status(&self) -> ComputeStatus {
        self.state.lock().unwrap().status
    }
@@ -718,7 +710,7 @@ impl ComputeNode {
        info!("running initdb");
        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
        Command::new(initdb_bin)
            .args(["--pgdata", pgdata])
            .args(["-D", pgdata])
            .output()
            .expect("cannot start initdb process");

@@ -1060,26 +1052,19 @@ impl ComputeNode {
        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary {
            if !pspec.spec.skip_pg_catalog_updates {
                let pgdata_path = Path::new(&self.pgdata);
                // temporarily reset max_cluster_size in config
                // to avoid the possibility of hitting the limit, while we are applying config:
                // creating new extensions, roles, etc...
                config::with_compute_ctl_tmp_override(
                    pgdata_path,
                    "neon.max_cluster_size=-1",
                    || {
                        self.pg_reload_conf()?;

                        self.apply_config(&compute_state)?;

                        Ok(())
                    },
                )?;
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
            let pgdata_path = Path::new(&self.pgdata);
            // temporarily reset max_cluster_size in config
            // to avoid the possibility of hitting the limit, while we are applying config:
            // creating new extensions, roles, etc...
            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
                self.pg_reload_conf()?;
            }
            self.post_apply_config()?;

                self.apply_config(&compute_state)?;

                Ok(())
            })?;
            self.pg_reload_conf()?;
        }

        let startup_end_time = Utc::now();
@@ -1138,9 +1123,6 @@ impl ComputeNode {
    //
    // Use that as a default location and pattern, except macos where core dumps are written
    // to /cores/ directory by default.
    //
    // With default Linux settings, the core dump file is called just "core", so check for
    // that too.
    pub fn check_for_core_dumps(&self) -> Result<()> {
        let core_dump_dir = match std::env::consts::OS {
            "macos" => Path::new("/cores/"),
@@ -1152,17 +1134,8 @@ impl ComputeNode {
        let files = fs::read_dir(core_dump_dir)?;
        let cores = files.filter_map(|entry| {
            let entry = entry.ok()?;

            let is_core_dump = match entry.file_name().to_str()? {
                n if n.starts_with("core.") => true,
                "core" => true,
                _ => false,
            };
            if is_core_dump {
                Some(entry.path())
            } else {
                None
            }
            let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
            Some(entry.path())
        });

        // Print backtrace for each core dump
@@ -1413,36 +1386,6 @@ LIMIT 100",
        }
        Ok(remote_ext_metrics)
    }

    /// Waits until current thread receives a state changed notification and
    /// the pageserver connection strings has changed.
    ///
    /// The operation will time out after a specified duration.
    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
        let state = self.state.lock().unwrap();
        let old_pageserver_connstr = state
            .pspec
            .as_ref()
            .expect("spec must be set")
            .pageserver_connstr
            .clone();
        let mut unchanged = true;
        let _ = self
            .state_changed
            .wait_timeout_while(state, duration, |s| {
                let pageserver_connstr = &s
                    .pspec
                    .as_ref()
                    .expect("spec must be set")
                    .pageserver_connstr;
                unchanged = pageserver_connstr == &old_pageserver_connstr;
                unchanged
            })
            .unwrap();
        if !unchanged {
            info!("Pageserver config changed");
        }
    }
}

pub fn forward_termination_signal() {
@@ -11,17 +11,9 @@ use crate::compute::ComputeNode;
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
        let mut state = compute.state.lock().unwrap();
        let state = compute.state.lock().unwrap();
        let mut state = compute.state_changed.wait(state).unwrap();

        // We have to re-check the status after re-acquiring the lock because it could be that
        // the status has changed while we were waiting for the lock, and we might not need to
        // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
        // we are waiting for a condition variable that will never be signaled.
        if state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
        }

        // Re-check the status after waking up
        if state.status == ComputeStatus::ConfigurationPending {
            info!("got configuration request");
            state.status = ComputeStatus::Configuration;
@@ -1,25 +0,0 @@
use anyhow::Context;

pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";

/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
    let size_kb = size_bytes / 1024;
    // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
    let child_result = std::process::Command::new("/usr/bin/sudo")
        .arg(DISK_QUOTA_BIN)
        .arg(size_kb.to_string())
        .arg(fs_mountpoint)
        .spawn();

    child_result
        .context("spawn() failed")
        .and_then(|mut child| child.wait().context("wait() failed"))
        .and_then(|status| match status.success() {
            true => Ok(()),
            false => Err(anyhow::anyhow!("process exited with {status}")),
        })
        // wrap any prior error with the overall context that we couldn't run the command
        .with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
}
@@ -10,7 +10,6 @@ pub mod http;
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod lsn_lease;
|
||||
mod migration;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
@@ -22,7 +23,8 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
.with_writer(std::io::stderr);
|
||||
|
||||
// Initialize OpenTelemetry
|
||||
let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");
|
||||
let otlp_layer =
|
||||
tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
|
||||
|
||||
// Put it all together
|
||||
tracing_subscriber::registry()
|
||||
|
||||
@@ -57,10 +57,10 @@ fn lsn_lease_bg_task(
|
||||
.max(valid_duration / 2);
|
||||
|
||||
info!(
|
||||
"Request succeeded, sleeping for {} seconds",
|
||||
"Succeeded, sleeping for {} seconds",
|
||||
sleep_duration.as_secs()
|
||||
);
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
|
||||
thread::sleep(sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,7 +89,10 @@ fn acquire_lsn_lease_with_retry(
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token.clone());
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
config
|
||||
})
|
||||
@@ -105,11 +108,9 @@ fn acquire_lsn_lease_with_retry(
|
||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
||||
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
|
||||
retry_period_ms as u64,
|
||||
));
|
||||
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,94 +0,0 @@
|
||||
//! Branch mappings for convenience
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user.
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default, deny_unknown_fields)]
|
||||
pub struct BranchMappings {
|
||||
/// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||
/// --tenant_id is not explicitly specified. This comes from the branches.
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
|
||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
pub mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
impl BranchMappings {
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let existing_values = self.mappings.entry(branch_name.clone()).or_default();
|
||||
|
||||
let existing_ids = existing_values
|
||||
.iter()
|
||||
.find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
|
||||
|
||||
if let Some((_, old_timeline_id)) = existing_ids {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_branch_timeline_id(
|
||||
&self,
|
||||
branch_name: &str,
|
||||
tenant_id: TenantId,
|
||||
) -> Option<TimelineId> {
|
||||
// If it looks like a timeline ID, return it as it is
|
||||
if let Ok(timeline_id) = branch_name.parse::<TimelineId>() {
|
||||
return Some(timeline_id);
|
||||
}
|
||||
|
||||
self.mappings
|
||||
.get(branch_name)?
|
||||
.iter()
|
||||
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
|
||||
.map(|&(_, timeline_id)| timeline_id)
|
||||
.map(TimelineId::from)
|
||||
}
|
||||
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
|
||||
self.mappings
|
||||
.iter()
|
||||
.flat_map(|(name, tenant_timelines)| {
|
||||
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
|
||||
(TenantTimelineId::new(tenant_id, timeline_id), name.clone())
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn persist(&self, path: &Path) -> anyhow::Result<()> {
|
||||
let content = &toml::to_string_pretty(self)?;
|
||||
fs::write(path, content).with_context(|| {
|
||||
format!(
|
||||
"Failed to write branch information into path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load(path: &Path) -> anyhow::Result<BranchMappings> {
|
||||
let branches_file_contents = fs::read_to_string(path)?;
|
||||
Ok(toml::from_str(branches_file_contents.as_str())?)
|
||||
}
|
||||
}
|
||||
@@ -561,7 +561,6 @@ impl Endpoint {
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
|
||||
@@ -113,7 +113,7 @@ impl SafekeeperNode {
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
extra_opts: &[String],
|
||||
extra_opts: Vec<String>,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
print!(
|
||||
@@ -196,7 +196,7 @@ impl SafekeeperNode {
|
||||
]);
|
||||
}
|
||||
|
||||
args.extend_from_slice(extra_opts);
|
||||
args.extend(extra_opts);
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
|
||||
@@ -346,14 +346,7 @@ impl StorageController {
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
let initdb_args = [
|
||||
"--pgdata",
|
||||
pg_data_path.as_ref(),
|
||||
"--username",
|
||||
&username(),
|
||||
"--no-sync",
|
||||
"--no-instructions",
|
||||
];
|
||||
let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
|
||||
tracing::info!(
|
||||
"Initializing storage controller database with args: {:?}",
|
||||
initdb_args
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id: AvailabilityZone(availability_zone_id),
|
||||
availability_zone_id,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -27,10 +27,6 @@ yanked = "warn"
|
||||
id = "RUSTSEC-2023-0071"
|
||||
reason = "the marvin attack only affects private key decryption, not public key signature verification"
|
||||
|
||||
[[advisories.ignore]]
|
||||
id = "RUSTSEC-2024-0376"
|
||||
reason = "gRPC endpoints in Neon are not exposed externally"
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a miniature Neon system, use `cargo neon` rather than container images.
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a minature Neon system, use `cargo neon` rather than container images.
|
||||
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
# AUX file v2
|
||||
|
||||
## Summary
|
||||
|
||||
This is a retrospective RFC describing a new storage strategy for AUX files.
|
||||
|
||||
## Motivation
|
||||
|
||||
The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`. Every time the compute node streams a `neon-file` record to the pageserver, it updates the aux file hash map and writes the serialized hash map back into that key. This creates serious space bloat. A later fix logged delta records (i.e., updates to a single key in the hash map) to the aux file key, so the pageserver only stores the deltas at each LSN. However, this improved v1 storage strategy still requires us to keep everything in an in-memory aux file cache, because we cannot fetch a single key (or file) from the compound `AUX_FILES_KEY`.
|
||||
|
||||
### Prior art
|
||||
|
||||
For storing a large number of small files, we can use a key-value store where the key is the filename and the value is the file content.
|
||||
|
||||
## Requirements
|
||||
|
||||
- No space bloat, fixed space amplification.
|
||||
- No write bloat, fixed write amplification.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver
|
||||
|
||||
## Sparse Keyspace
|
||||
|
||||
In the pageserver, we had assumed that keyspaces are always contiguous: for example, if the keyspace 0x0000-0xFFFF exists in the pageserver, every single key in that range exists in the storage. Based on that assumption, there is code that traverses the keyspace by iterating over every single key.
|
||||
|
||||
```rust
|
||||
loop {
|
||||
// do something
|
||||
key = key.next();
|
||||
}
|
||||
```
|
||||
|
||||
If a keyspace is very large, for example containing `2^64` keys, this loop will effectively never finish. Therefore, this RFC introduces the concept of a sparse keyspace. In a sparse keyspace, not every key in the range exists. Developers should not attempt to iterate over every single key in the keyspace; instead, they should fetch all the layer files in the key range and merge them, as sketched below.
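The following is a minimal, self-contained sketch of that idea, not the pageserver implementation: layers are modeled as plain `BTreeMap`s, and a range scan merges whatever keys actually exist instead of stepping through every possible key.

```rust
use std::collections::BTreeMap;
use std::ops::Range;

type Key = u128;

/// Merge all entries within `range` across layers, newest layer first.
/// Only keys that actually exist are visited, so the cost is proportional
/// to the number of stored entries, not to the size of the key range.
fn scan_sparse_range(
    layers_newest_first: &[BTreeMap<Key, Vec<u8>>],
    range: Range<Key>,
) -> BTreeMap<Key, Vec<u8>> {
    let mut merged = BTreeMap::new();
    for layer in layers_newest_first {
        for (key, value) in layer.range(range.clone()) {
            // A newer layer wins; older layers only fill in missing keys.
            merged.entry(*key).or_insert_with(|| value.clone());
        }
    }
    merged
}
```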
|
||||
|
||||
In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`.
|
||||
|
||||
## AUX v2 Keyspace and Key Mapping
|
||||
|
||||
Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the
|
||||
keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash
|
||||
of the filename for the rest bits of the key. The encoding scheme is defined in `encode_aux_file_key`.
|
||||
|
||||
For example, `pg_logical/mappings/test1` will be encoded as:
|
||||
|
||||
```
|
||||
62 0000 01 01 7F8B83D94F7081693471ABF91C
|
||||
^ aux prefix
|
||||
^ assigned prefix of pg_logical/
|
||||
^ assigned prefix of mappings/
|
||||
^ 13B FNV hash of test1
|
||||
^ not used due to key representation
|
||||
```
|
||||
|
||||
A directory prefix should be assigned in `aux_file.rs` every time we add a new type of aux file to the storage. Files in directories without an assigned prefix are put into the `0xFFFF` keyspace.
|
||||
|
||||
Note that inside pageserver, there are two representations of the keys: the 18B full key representation
|
||||
and the 16B compact key representation. For the 18B representation, some fields have restricted ranges
|
||||
of values. Therefore, the aux keys only use the 16B compact portion of the full key.
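As a rough illustration of the mapping (the authoritative scheme is `encode_aux_file_key` in `aux_file.rs`), the sketch below assigns one-byte directory prefixes and hashes the filename with FNV-1a. The field widths and positions are schematic, and the function and prefix assignments are hypothetical: the real key uses a 13-byte hash inside the compact representation, while this sketch uses a 64-bit FNV-1a just to stay short.

```rust
/// FNV-1a, 64-bit variant (the real encoding uses a wider FNV hash).
fn fnv1a_64(data: &[u8]) -> u64 {
    let mut hash: u64 = 0xcbf2_9ce4_8422_2325;
    for &byte in data {
        hash ^= u64::from(byte);
        hash = hash.wrapping_mul(0x0000_0100_0000_01b3);
    }
    hash
}

/// Hypothetical one-byte prefixes; real assignments live in `aux_file.rs`.
fn dir_prefix(dir: &str) -> u8 {
    match dir {
        "pg_logical/" => 0x01,
        "mappings/" => 0x01,
        _ => 0xFF, // directories without an assigned prefix
    }
}

/// Build a schematic 16-byte compact aux key:
/// [aux prefix][dir prefix][subdir prefix][...][filename hash].
fn encode_aux_key_sketch(dir: &str, subdir: &str, file_name: &str) -> [u8; 16] {
    let mut key = [0u8; 16];
    key[0] = 0x62; // AUX_KEY_PREFIX
    key[1] = dir_prefix(dir);
    key[2] = dir_prefix(subdir);
    key[8..16].copy_from_slice(&fnv1a_64(file_name.as_bytes()).to_be_bytes());
    key
}
```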
|
||||
|
||||
It is possible that two files get mapped to the same key due to hash collision. Therefore, the value of
|
||||
each of the aux key is an array that contains all filenames and file content that should be stored in
|
||||
this key.
|
||||
|
||||
We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before, and we do not need additional code to support reconstructing the value; we simply get the latest image from the storage.
|
||||
|
||||
## Inbound Logical Replication Key Mapping
|
||||
|
||||
For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store its data. This file is not stored directly in the pageserver using the aux v2 mechanism; it is constructed during basebackup generation by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
|
||||
|
||||
## Sparse Keyspace Read Path
|
||||
|
||||
There are two places where we need to read the aux files from the pageserver:

* On the write path, when the compute node adds an aux file to the pageserver, we retrieve the key from the storage, append the file to the hashed key, and write it back (see the sketch after this list). The current `get` API already supports that.
* We use the vectored get API to retrieve all aux files while generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path: the vectored API would normally attempt to retrieve every single key within the requested key range, so we changed it so that keys within `NON_INHERITED_SPARSE_RANGE` do not trigger a missing key error.
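A minimal sketch of that write-path read-modify-write, with toy types and a hypothetical helper standing in for the real pageserver storage (the actual code re-serializes the value and writes it back as a `Value::Image`):

```rust
use std::collections::BTreeMap;

type AuxKey = [u8; 16];
/// Each aux key stores every (filename, content) pair that hashed to it,
/// which is how hash collisions are tolerated.
type AuxValue = Vec<(String, Vec<u8>)>;

fn append_aux_file(
    store: &mut BTreeMap<AuxKey, AuxValue>,
    key: AuxKey,
    file_name: &str,
    content: Vec<u8>,
) {
    let entry = store.entry(key).or_default();
    if let Some(existing) = entry.iter_mut().find(|(name, _)| name.as_str() == file_name) {
        // The file already exists under this key: overwrite its content.
        existing.1 = content;
    } else {
        // New file (or a collision with a different filename): append it.
        entry.push((file_name.to_string(), content));
    }
    // In the real pageserver, `entry` would now be re-serialized and written
    // back as a `Value::Image` at the current LSN.
}
```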
|
||||
|
||||
## Compaction and Image Layer Generation
|
||||
|
||||
With the addition of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage.

* L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes.
* Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. For sparse keyspaces, image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated. The created image layer always covers the full aux key range for now, and could be optimized later.
|
||||
|
||||
## Migration
|
||||
|
||||
We decided not to make the new aux storage strategy (v2) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while keeping old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore we do manual migrations for users on aux v1 with the [migration script](https://github.com/neondatabase/aux_v2_migration).
|
||||
|
||||
During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached
|
||||
with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2).
|
||||
|
||||
If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for them. Otherwise, the default aux file policy for new timelines is aux v2. Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. Users who tried setting up inbound replication (which was not supported at that time) may also create some file entries in aux v1 store, even if they did not enroll in the logical replication testing program.
|
||||
|
||||
The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For each of these projects, it puts the computes into maintenance mode (suspending all of them), calls the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restarts all the computes.
|
||||
@@ -1,343 +0,0 @@
|
||||
# Independent compute release
|
||||
|
||||
Created at: 2024-08-30. Author: Alexey Kondratov (@ololobus)
|
||||
|
||||
## Summary
|
||||
|
||||
This document proposes an approach to a fully independent compute release flow. It attempts to cover the following features:
|
||||
|
||||
- Process is automated as much as possible to minimize human errors.
|
||||
- Compute<->storage protocol compatibility is ensured.
|
||||
- A transparent release history is available with an easy rollback strategy.
|
||||
- Although not in the scope of this document, there is a viable way to extend the proposed release
|
||||
flow to achieve the canary and/or blue-green deployment strategies.
|
||||
|
||||
## Motivation
|
||||
|
||||
Previously, the compute release was tightly coupled to the storage release. This meant that once some storage nodes got restarted with a newer version, all new compute starts using these nodes automatically got the new version. Thus, two releases happened in parallel, which increased the blast radius and made ownership fuzzy.
|
||||
|
||||
Now, we practice a manual v0 independent compute release flow -- after getting a new compute release image and tag, we pin it region by region using the Admin UI. It's better, but it still has its own flaws:
|
||||
|
||||
1. It's a simple but fairly manual process, as you need to click through a few pages.
|
||||
2. It's prone to human errors, e.g., you could mistype or copy the wrong compute tag.
|
||||
3. We now require an additional approval in the Admin UI, which partially solves 2., but also makes the whole process pretty annoying, as two people constantly need to go back and forth.
|
||||
|
||||
## Non-goals
|
||||
|
||||
It's not the goal of this document to propose a design for some general-purpose release tool like Helm.
|
||||
The document considers how the current compute fleet is orchestrated at Neon. Even if we later
|
||||
decide to split the control plane further (e.g., introduce a separate compute controller), the proposed
|
||||
release process shouldn't change much, i.e., the releases table and API will reside in
|
||||
one of the parts.
|
||||
|
||||
Achieving the canary and/or blue-green deploy strategies is out of the scope of this document. They
|
||||
were kept in mind, though, so it's expected that the proposed approach will lay down the foundation
|
||||
for implementing them in future iterations.
|
||||
|
||||
## Impacted components
|
||||
|
||||
Compute, control plane, CI, observability (some Grafana dashboards may require changes).
|
||||
|
||||
## Prior art
|
||||
|
||||
One of the very close examples is how Helm tracks [releases history](https://helm.sh/docs/helm/helm_history/).
|
||||
|
||||
In the code:
|
||||
|
||||
- [Release](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/release.go#L20-L43)
|
||||
- [Release info](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/info.go#L24-L40)
|
||||
- [Release status](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/status.go#L18-L42)
|
||||
|
||||
TL;DR it has several important attributes:
|
||||
|
||||
- Revision -- unique release ID/primary key. It is not the same as the application version,
|
||||
because the same version can be deployed several times, e.g., after a newer version rollback.
|
||||
- App version -- version of the application chart/code.
|
||||
- Config -- set of overrides to the default config of the application.
|
||||
- Status -- current status of the release in the history.
|
||||
- Timestamps -- tracks when a release was created and deployed.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### Separate release branch
|
||||
|
||||
We will use a separate release branch, `release-compute`, to have a clean history for releases and commits.
|
||||
In order to avoid confusion with storage releases, we will use a different prefix for compute [git release
|
||||
tags](https://github.com/neondatabase/neon/releases) -- `release-compute-XXXX`. We will use the same tag for
|
||||
Docker images as well. The `neondatabase/compute-node-v16:release-compute-XXXX` looks longer and a bit redundant,
|
||||
but it's better to have image and git tags in sync.
|
||||
|
||||
Currently, the control plane relies on the numeric compute and storage release versions to decide on compute->storage compatibility. Once we implement this proposal, we should drop this code, as release numbers will be completely independent. The only constraint we want is that the version must increase monotonically within the same release branch.
|
||||
|
||||
### Compute config/settings manifest
|
||||
|
||||
We will create a new sub-directory `compute` and a file `compute/manifest.yaml` with the following structure:
|
||||
|
||||
```yaml
|
||||
pg_settings:
|
||||
# Common settings for primaries and secondaries of all versions.
|
||||
common:
|
||||
wal_log_hints: "off"
|
||||
max_wal_size: "1024"
|
||||
|
||||
per_version:
|
||||
14:
|
||||
# Common settings for both replica and primary of version PG 14
|
||||
common:
|
||||
shared_preload_libraries: "neon,pg_stat_statements,extension_x"
|
||||
15:
|
||||
common:
|
||||
shared_preload_libraries: "neon,pg_stat_statements,extension_x"
|
||||
# Settings that should be applied only to replicas
|
||||
replica:
|
||||
# Available only starting with Postgres 15
|
||||
recovery_prefetch: "off"
|
||||
# ...
|
||||
17:
|
||||
common:
|
||||
# For example, if third-party `extension_x` is not yet available for PG 17
|
||||
shared_preload_libraries: "neon,pg_stat_statements"
|
||||
replica:
|
||||
recovery_prefetch: "off"
|
||||
```
|
||||
|
||||
**N.B.** Setting value should be a string with `on|off` for booleans and a number (as a string)
|
||||
without units for all numeric settings. That's how the control plane currently operates.
|
||||
|
||||
The priority of settings will be (a higher number is a higher priority; see the sketch after this list):

1. Any static and hard-coded settings in the control plane
2. `pg_settings->common`
3. Per-version `common`
4. Per-version `replica`
5. Any per-user/project/endpoint overrides in the control plane
6. Any dynamic setting calculated based on the compute size
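A sketch of that merge order (illustrative only; the control plane's actual implementation may differ): each layer is a plain map, applied from lowest to highest priority so that later layers override earlier ones.

```rust
use std::collections::HashMap;

/// Merge settings layers given in ascending priority order:
/// a key in a later (higher-priority) layer overrides the same key
/// from any earlier layer.
fn merge_settings(layers_low_to_high: &[HashMap<String, String>]) -> HashMap<String, String> {
    let mut merged = HashMap::new();
    for layer in layers_low_to_high {
        for (name, value) in layer {
            merged.insert(name.clone(), value.clone());
        }
    }
    merged
}
```

For example, a per-version `replica` layer would override `recovery_prefetch` from the per-version `common` layer, and any per-endpoint override in the control plane would win over both.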
|
||||
|
||||
**N.B.** For simplicity, we do not do any custom logic for `shared_preload_libraries`, so it's completely
|
||||
overridden if specified on some level. Make sure that you include all necessary extensions in it when you
|
||||
do any overrides.
|
||||
|
||||
**N.B.** There is a tricky question about what to do with the custom compute image pinning we sometimes do for particular projects and customers. That's usually ad-hoc work, and such images are based on the latest compute image, so it's relatively safe to assume that we can use settings from the latest compute release. If for some reason that's not true and further overrides are needed, they can also be applied at the project level together with pinning the image, so it's the on-call/engineer/support responsibility to ensure that compute starts with the specified custom image. The only real risk is that the pinned compute image gets stale and settings from new releases drift away, so eventually it receives something incompatible, but i) this is an operational issue, as we do not want stale images anyway, and ii) the base settings change so rarely that the chance of this happening is very low. If we want to solve it completely, then together with pinning the image we could also pin the matching release revision in the control plane.
|
||||
|
||||
The compute team will own the content of `compute/manifest.yaml`.
|
||||
|
||||
### Control plane: releases table
|
||||
|
||||
In order to store information about releases, the control plane will use a table `compute_releases` with the following
|
||||
schema:
|
||||
|
||||
```sql
|
||||
CREATE TABLE compute_releases (
|
||||
-- Unique release ID
|
||||
-- N.B. Revision won't be synchronized across all regions, because all control planes are technically independent
|
||||
-- services. We have the same situation with Helm releases as well because they could be deployed and rolled back
|
||||
-- independently in different clusters.
|
||||
revision BIGSERIAL PRIMARY KEY,
|
||||
-- Numeric version of the compute image, e.g. 9057
|
||||
version BIGINT NOT NULL,
|
||||
-- Compute image tag, e.g. `release-9057`
|
||||
tag TEXT NOT NULL,
|
||||
-- Current release status. Currently, it will be a simple enum
|
||||
-- * `deployed` -- release is deployed and used for new compute starts.
|
||||
-- Exactly one release can have this status at a time.
|
||||
-- * `superseded` -- release has been replaced by a newer one.
|
||||
-- But we can always extend it in the future when we need more statuses
|
||||
-- for more complex deployment strategies.
|
||||
status TEXT NOT NULL,
|
||||
-- Any additional metadata for compute in the corresponding release
|
||||
manifest JSONB NOT NULL,
|
||||
-- Timestamp when release record was created in the control plane database
|
||||
created_at TIMESTAMP NOT NULL DEFAULT now(),
|
||||
-- Timestamp when release deployment was finished
|
||||
deployed_at TIMESTAMP
|
||||
);
|
||||
```
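As an illustration of the intended lifecycle (not the control plane's actual queries), recording a new deployment could supersede the current release and insert the new one in a single transaction, preserving the invariant that exactly one release has the `deployed` status:

```sql
BEGIN;

-- Exactly one release may be 'deployed' at a time, so retire the current one.
UPDATE compute_releases
   SET status = 'superseded'
 WHERE status = 'deployed';

-- Record the new release; `revision` and `created_at` fill in automatically.
INSERT INTO compute_releases (version, tag, status, manifest, deployed_at)
VALUES (9057, 'release-9057', 'deployed', '{}'::jsonb, now());

COMMIT;
```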
|
||||
|
||||
We keep track of the old releases not only for the sake of auditing, but also because we usually have ~30% of computes still running an image from one of the previous releases. When users want to reconfigure such computes without restarting them, the control plane needs to know which settings are applicable to them, so information about the previous releases must be readily available. There could be other auxiliary info needed as well: supported extensions, compute flags, etc.
|
||||
|
||||
**N.B.** Here, we can end up in an ambiguous situation when the same compute image is deployed twice, e.g.,
|
||||
it was deployed once, then rolled back, and then deployed again, potentially with a different manifest. Yet,
|
||||
we could've started some computes with the first deployment and some with the second. Thus, when we need to
|
||||
look up the manifest for the compute by its image tag, we will see two records in the table with the same tag,
|
||||
but different revision numbers. We can assume that this could happen only in case of rollbacks, so we
|
||||
can just take the latest revision for the given tag.
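For instance, a hypothetical lookup by tag would simply pick the newest revision:

```sql
-- Sketch: if the same tag was released more than once, the latest revision wins.
SELECT *
  FROM compute_releases
 WHERE tag = 'release-9057'
 ORDER BY revision DESC
 LIMIT 1;
```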
|
||||
|
||||
### Control plane: management API
|
||||
|
||||
The control plane will implement new API methods to manage releases:
|
||||
|
||||
1. `POST /management/api/v2/compute_releases` to create a new release. With payload
|
||||
|
||||
```json
|
||||
{
|
||||
"version": 9057,
|
||||
"tag": "release-9057",
|
||||
"manifest": {}
|
||||
}
|
||||
```
|
||||
|
||||
and response
|
||||
|
||||
```json
|
||||
{
|
||||
"revision": 53,
|
||||
"version": 9057,
|
||||
"tag": "release-9057",
|
||||
"status": "deployed",
|
||||
"manifest": {},
|
||||
"created_at": "2024-08-15T15:52:01.0000Z",
|
||||
"deployed_at": "2024-08-15T15:52:01.0000Z",
|
||||
}
|
||||
```
|
||||
|
||||
Here, we can also mix custom (remote) extensions metadata into the `manifest`, so that the control plane gets information about all available extensions that are not bundled into the compute image. The corresponding workflow in `neondatabase/build-custom-extensions` should produce it as an artifact and make it accessible to the workflow in `neondatabase/infra`. See the complete release flow below. Doing that, we add a constraint that a new custom extension requires a new compute release, which is good for safety but not exactly what we want operationally (we want to be able to deploy new extensions without new images). Yet, it can be solved incrementally: v0 -- do not do anything with extensions at all; v1 -- put them into the same manifest; v2 -- make them separate entities with their own lifecycle.
|
||||
|
||||
**N.B.** This method is intended to be used in CI workflows, and CI/network can be flaky. It's reasonable to assume that we could retry the request several times, even though it has already succeeded. Although it's not a big deal to create several identical releases one by one, it's better to avoid it, so the control plane should check whether the latest release is identical and just return `304 Not Modified` in this case.
|
||||
|
||||
2. `POST /management/api/v2/compute_releases/rollback` to roll back to any previously deployed release, with a payload containing the revision of the release to roll back to:
|
||||
|
||||
```json
|
||||
{
|
||||
"revision": 52
|
||||
}
|
||||
```
|
||||
|
||||
Rollback marks the current release as `superseded` and creates a new release with all the same data as the
|
||||
requested revision, but with a new revision number.
|
||||
|
||||
This rollback API is not strictly needed, as we can just use the `infra` repo workflow to deploy any available tag. It's still nice to have for on-call and urgent matters, for example if we need to roll back while GitHub is down. It's much easier to specify only the revision number than to craft all the necessary data for a new release payload.
|
||||
|
||||
### Compute->storage compatibility tests
|
||||
|
||||
In order to safely release new compute versions independently from storage, we need to ensure that the currently
|
||||
deployed storage is compatible with the new compute version. Currently, we maintain backward compatibility
|
||||
in storage, but newer computes may require a newer storage version.
|
||||
|
||||
Remote end-to-end (e2e) tests [already accept](https://github.com/neondatabase/cloud/blob/e3468d433e0d73d02b7d7e738d027f509b522408/.github/workflows/testing.yml#L43-L48)
|
||||
`storage_image_tag` and `compute_image_tag` as separate inputs. That means that we could reuse e2e tests to ensure
|
||||
compatibility between storage and compute:
|
||||
|
||||
1. Pick the latest storage release tag and use it as `storage_image_tag`.
|
||||
2. Pick a new compute tag built in the current compute release PR and use it as `compute_image_tag`.
|
||||
Here, we should use a temporary ECR image tag, because the final tag will be known only after the release PR is merged.
|
||||
3. Trigger e2e tests as usual.
|
||||
|
||||
### Release flow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
|
||||
actor oncall as Compute on-call person
|
||||
participant neon as neondatabase/neon
|
||||
|
||||
box private
|
||||
participant cloud as neondatabase/cloud
|
||||
participant exts as neondatabase/build-custom-extensions
|
||||
participant infra as neondatabase/infra
|
||||
end
|
||||
|
||||
box cloud
|
||||
participant preprod as Pre-prod control plane
|
||||
participant prod as Production control plane
|
||||
participant k8s as Compute k8s
|
||||
end
|
||||
|
||||
oncall ->> neon: Open release PR into release-compute
|
||||
|
||||
activate neon
|
||||
neon ->> cloud: CI: trigger e2e compatibility tests
|
||||
activate cloud
|
||||
cloud -->> neon: CI: e2e tests pass
|
||||
deactivate cloud
|
||||
neon ->> neon: CI: pass PR checks, get approvals
|
||||
deactivate neon
|
||||
|
||||
oncall ->> neon: Merge release PR into release-compute
|
||||
|
||||
activate neon
|
||||
neon ->> neon: CI: pass checks, build and push images
|
||||
neon ->> exts: CI: trigger extensions build
|
||||
activate exts
|
||||
exts -->> neon: CI: extensions are ready
|
||||
deactivate exts
|
||||
neon ->> neon: CI: create release tag
|
||||
neon ->> infra: Trigger release workflow using the produced tag
|
||||
deactivate neon
|
||||
|
||||
activate infra
|
||||
infra ->> infra: CI: pass checks
|
||||
infra ->> preprod: Release new compute image to pre-prod automatically <br/> POST /management/api/v2/compute_releases
|
||||
activate preprod
|
||||
preprod -->> infra: 200 OK
|
||||
deactivate preprod
|
||||
|
||||
infra ->> infra: CI: wait for per-region production deploy approvals
|
||||
oncall ->> infra: CI: approve deploys region by region
|
||||
infra ->> k8s: Prewarm new compute image
|
||||
infra ->> prod: POST /management/api/v2/compute_releases
|
||||
activate prod
|
||||
prod -->> infra: 200 OK
|
||||
deactivate prod
|
||||
deactivate infra
|
||||
```
|
||||
|
||||
## Further work
|
||||
|
||||
As briefly mentioned in other sections, eventually, we would like to use more complex deployment strategies.
|
||||
For example, we can pass a fraction of the total compute starts that should use the new release. Then we can
|
||||
mark the release as `partial` or `canary` and monitor its performance. If everything is fine, we can promote it
|
||||
to `deployed` status. If not, we can roll back to the previous one.
|
||||
|
||||
## Alternatives
|
||||
|
||||
In theory, we can try using Helm as-is:
|
||||
|
||||
1. Write a compute Helm chart. It would really contain only a config map, which the control plane can access and read.
   N.B. We could reuse the control plane chart as well, but then it's not a fully independent release again, and ownership gets even more fuzzy.
|
||||
2. The control plane will read it and start using the new compute version for new starts.
|
||||
|
||||
Drawbacks:
|
||||
|
||||
1. Helm releases work best if the workload is controlled by the Helm chart itself. Then you can have different deployment strategies like rolling updates, canary, or blue/green deployments. At Neon, compute starts are controlled by the control plane, which makes this much more tricky.
|
||||
2. Release visibility will suffer, i.e. instead of a nice table in the control plane and Admin UI, we would need to use the `helm` CLI and/or K8s UIs like K8sLens.
|
||||
3. We do not restart all computes shortly after a new version is released. This means that for some features and for compatibility purposes (see above), the control plane may need some auxiliary info from the previous releases.
|
||||
@@ -50,16 +50,6 @@ pub struct ComputeSpec {
|
||||
#[serde(default)]
|
||||
pub swap_size_bytes: Option<u64>,
|
||||
|
||||
/// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs
|
||||
/// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the
|
||||
/// spec is first received.
|
||||
///
|
||||
/// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's
|
||||
/// spec generation doesn't need to be aware of the actual compute it's running on, while
|
||||
/// guaranteeing gradual rollout of disk quota.
|
||||
#[serde(default)]
|
||||
pub disk_quota_bytes: Option<u64>,
|
||||
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
@@ -278,22 +268,6 @@ pub struct GenericOption {
|
||||
/// declare a `trait` on it.
|
||||
pub type GenericOptions = Option<Vec<GenericOption>>;
|
||||
|
||||
/// Configured the local-proxy application with the relevant JWKS and roles it should
|
||||
/// use for authorizing connect requests using JWT.
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct LocalProxySpec {
|
||||
pub jwks: Vec<JwksSettings>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct JwksSettings {
|
||||
pub id: String,
|
||||
pub role_names: Vec<String>,
|
||||
pub jwks_url: String,
|
||||
pub provider_name: String,
|
||||
pub jwt_audience: Option<String>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt::Display;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -58,7 +57,7 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub availability_zone_id: AvailabilityZone,
|
||||
pub availability_zone_id: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -75,19 +74,10 @@ pub struct TenantPolicyRequest {
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct AvailabilityZone(pub String);
|
||||
|
||||
impl Display for AvailabilityZone {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
|
||||
pub preferred_az_ids: HashMap<TenantShardId, String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -37,11 +37,14 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
/// ```mermaid
|
||||
/// stateDiagram-v2
|
||||
///
|
||||
/// [*] --> Loading: spawn_load()
|
||||
/// [*] --> Attaching: spawn_attach()
|
||||
///
|
||||
/// Loading --> Activating: activate()
|
||||
/// Attaching --> Activating: activate()
|
||||
/// Activating --> Active: infallible
|
||||
///
|
||||
/// Loading --> Broken: load() failure
|
||||
/// Attaching --> Broken: attach() failure
|
||||
///
|
||||
/// Active --> Stopping: set_stopping(), part of shutdown & detach
|
||||
@@ -65,6 +68,10 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
)]
|
||||
#[serde(tag = "slug", content = "data")]
|
||||
pub enum TenantState {
|
||||
/// This tenant is being loaded from local disk.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
Loading,
|
||||
/// This tenant is being attached to the pageserver.
|
||||
///
|
||||
/// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
|
||||
@@ -114,6 +121,8 @@ impl TenantState {
|
||||
// But, our attach task might still be fetching the remote timelines, etc.
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call attachment status Attached.
|
||||
Self::Active => Attached,
|
||||
@@ -182,11 +191,10 @@ impl LsnLease {
|
||||
}
|
||||
|
||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||
///
|
||||
/// XXX: We used to have more variants here, but now it's just one, which makes this rather
|
||||
/// useless. Remove, once we've checked that there's no client code left that looks at this.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum ActivatingFrom {
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
|
||||
Loading,
|
||||
/// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
|
||||
Attaching,
|
||||
}
|
||||
@@ -1554,8 +1562,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn tenantstatus_activating_serde() {
|
||||
let states = [TenantState::Activating(ActivatingFrom::Attaching)];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
let states = [
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
];
|
||||
let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
|
||||
|
||||
let actual = serde_json::to_string(&states).unwrap();
|
||||
|
||||
@@ -1570,7 +1581,13 @@ mod tests {
|
||||
fn tenantstatus_activating_strum() {
|
||||
// tests added, because we use these for metrics
|
||||
let examples = [
|
||||
(line!(), TenantState::Loading, "Loading"),
|
||||
(line!(), TenantState::Attaching, "Attaching"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Loading),
|
||||
"Activating",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Activating(ActivatingFrom::Attaching),
|
||||
|
||||
@@ -984,7 +984,6 @@ pub fn short_error(e: &QueryError) -> String {
|
||||
}
|
||||
|
||||
fn log_query_error(query: &str, e: &QueryError) {
|
||||
// If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`.
|
||||
match e {
|
||||
QueryError::Disconnected(ConnectionError::Io(io_error)) => {
|
||||
if is_expected_io_error(io_error) {
|
||||
|
||||
@@ -93,9 +93,9 @@ impl Conf {
|
||||
);
|
||||
let output = self
|
||||
.new_pg_command("initdb")?
|
||||
.arg("--pgdata")
|
||||
.arg("-D")
|
||||
.arg(&self.datadir)
|
||||
.args(["--username", "postgres", "--no-instructions", "--no-sync"])
|
||||
.args(["-U", "postgres", "--no-instructions", "--no-sync"])
|
||||
.output()?;
|
||||
debug!("initdb output: {:?}", output);
|
||||
ensure!(
|
||||
|
||||
@@ -6,14 +6,12 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
hyper.workspace = true
|
||||
opentelemetry = { workspace = true, features = ["trace"] }
|
||||
opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
|
||||
opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry = { workspace = true, features=["rt-tokio"] }
|
||||
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions.workspace = true
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tracing-subscriber.workspace = true # For examples in docs
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//!
|
||||
//! ```rust,no_run
|
||||
//! use tracing_subscriber::prelude::*;
|
||||
//! use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
//!
|
||||
//! #[tokio::main]
|
||||
//! async fn main() {
|
||||
@@ -21,7 +22,7 @@
|
||||
//! .with_writer(std::io::stderr);
|
||||
//!
|
||||
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
|
||||
//! let otlp_layer = tracing_utils::init_tracing("my_application").await;
|
||||
//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
|
||||
//!
|
||||
//! // Put it all together
|
||||
//! tracing_subscriber::registry()
|
||||
@@ -34,14 +35,14 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod http;
|
||||
|
||||
use opentelemetry::trace::TracerProvider;
|
||||
use opentelemetry::sdk::Resource;
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry_sdk::Resource;
|
||||
use tracing::Subscriber;
|
||||
use tracing_subscriber::registry::LookupSpan;
|
||||
use tracing_subscriber::Layer;
|
||||
use opentelemetry_otlp::WithExportConfig;
|
||||
use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
|
||||
|
||||
pub use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
|
||||
pub mod http;
|
||||
|
||||
/// Set up OpenTelemetry exporter, using configuration from environment variables.
|
||||
///
|
||||
@@ -70,10 +71,7 @@ use tracing_subscriber::Layer;
|
||||
///
|
||||
/// This doesn't block, but is marked as 'async' to hint that this must be called in
|
||||
/// asynchronous execution context.
|
||||
pub async fn init_tracing<S>(service_name: &str) -> Option<impl Layer<S>>
|
||||
where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
|
||||
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
|
||||
return None;
|
||||
};
|
||||
@@ -82,10 +80,9 @@ where
|
||||
|
||||
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
|
||||
/// tasks.
|
||||
pub fn init_tracing_without_runtime<S>(service_name: &str) -> Option<impl Layer<S>>
|
||||
where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
pub fn init_tracing_without_runtime(
|
||||
service_name: &str,
|
||||
) -> Option<opentelemetry::sdk::trace::Tracer> {
|
||||
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
|
||||
return None;
|
||||
};
|
||||
@@ -116,36 +113,54 @@ where
|
||||
Some(init_tracing_internal(service_name.to_string()))
|
||||
}
|
||||
|
||||
fn init_tracing_internal<S>(service_name: String) -> impl Layer<S>
|
||||
where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
|
||||
let exporter = opentelemetry_otlp::new_exporter().http();
|
||||
fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
|
||||
// Set up exporter from the OTEL_EXPORTER_* environment variables
|
||||
let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();
|
||||
|
||||
// TODO: opentelemetry::global::set_error_handler() with custom handler that
|
||||
// bypasses default tracing layers, but logs regular looking log
|
||||
// messages.
|
||||
// XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
|
||||
// OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
|
||||
// OpenTelemetry spec at
|
||||
// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#endpoint-urls-for-otlphttp>,
|
||||
// the full exporter URL is formed by appending "/v1/traces" to the value
|
||||
// of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
|
||||
// that with the grpc-tonic exporter. Other exporters, like the HTTP
|
||||
// exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
|
||||
// appending "/v1/traces".
|
||||
//
|
||||
// See https://github.com/open-telemetry/opentelemetry-rust/pull/950
|
||||
//
|
||||
// Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
|
||||
// the endpoint url with the "/v1/traces" path ourselves. If the bug is
|
||||
// fixed in a later version, we can remove this code. But if we don't
|
||||
// remember to remove this, it won't do any harm either, as the crate will
|
||||
// just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
|
||||
// is set directly with `with_endpoint`.
|
||||
if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
|
||||
if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
|
||||
if !endpoint.ends_with('/') {
|
||||
endpoint.push('/');
|
||||
}
|
||||
endpoint.push_str("v1/traces");
|
||||
exporter = exporter.with_endpoint(endpoint);
|
||||
}
|
||||
}
|
||||
|
||||
// Propagate trace information in the standard W3C TraceContext format.
|
||||
opentelemetry::global::set_text_map_propagator(
|
||||
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
|
||||
opentelemetry::sdk::propagation::TraceContextPropagator::new(),
|
||||
);
|
||||
|
||||
let tracer = opentelemetry_otlp::new_pipeline()
|
||||
opentelemetry_otlp::new_pipeline()
|
||||
.tracing()
|
||||
.with_exporter(exporter)
|
||||
.with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
|
||||
Resource::new(vec![KeyValue::new(
|
||||
.with_trace_config(
|
||||
opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
|
||||
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
|
||||
service_name,
|
||||
)]),
|
||||
))
|
||||
.install_batch(opentelemetry_sdk::runtime::Tokio)
|
||||
)])),
|
||||
)
|
||||
.install_batch(opentelemetry::runtime::Tokio)
|
||||
.expect("could not initialize opentelemetry exporter")
|
||||
.tracer("global");
|
||||
|
||||
tracing_opentelemetry::layer().with_tracer(tracer)
|
||||
}
|
||||
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
|
||||
@@ -19,7 +19,6 @@ bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
git-version.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
|
||||
@@ -92,10 +92,6 @@ pub mod toml_edit_ext;
|
||||
|
||||
pub mod circuit_breaker;
|
||||
|
||||
// Re-export used in macro. Avoids adding git-version as dep in target crates.
|
||||
#[doc(hidden)]
|
||||
pub use git_version;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
@@ -135,7 +131,7 @@ macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
// this should try GIT_VERSION first only then git_version::git_version!
|
||||
const $const_identifier: &::core::primitive::str = {
|
||||
const __COMMIT_FROM_GIT: &::core::primitive::str = $crate::git_version::git_version! {
|
||||
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
|
||||
prefix = "",
|
||||
fallback = "unknown",
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
|
||||
@@ -27,6 +27,7 @@ crc32c.workspace = true
|
||||
either.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
|
||||
@@ -736,22 +736,4 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_init_lsn_lease(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<LsnLease> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
|
||||
self.request(Method::POST, &uri, LsnLeaseRequest { lsn })
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ anyhow.workspace = true
|
||||
async-stream.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
itertools.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -10,6 +10,7 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command};
|
||||
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::config::PageserverIdentity;
|
||||
use pageserver::controller_upcall_client::ControllerUpcallClient;
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
|
||||
@@ -396,7 +396,7 @@ fn start_pageserver(
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
remote_storage.clone(),
|
||||
ControllerUpcallClient::new(conf, &shutdown_pageserver),
|
||||
ControlPlaneClient::new(conf, &shutdown_pageserver),
|
||||
conf,
|
||||
);
|
||||
if let Some(deletion_workers) = deletion_workers {
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::{
|
||||
controller_api::{AvailabilityZone, NodeRegisterRequest},
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
@@ -17,12 +17,9 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||
use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
|
||||
/// The Pageserver's client for using the storage controller upcall API: this is a small API
|
||||
/// for dealing with generations (see docs/rfcs/025-generation-numbers.md).
|
||||
///
|
||||
/// The server presenting this API may either be the storage controller or some other
|
||||
/// service (such as the Neon control plane) providing a store of generation numbers.
|
||||
pub struct ControllerUpcallClient {
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
pub struct ControlPlaneClient {
|
||||
http_client: reqwest::Client,
|
||||
base_url: Url,
|
||||
node_id: NodeId,
|
||||
@@ -48,7 +45,7 @@ pub trait ControlPlaneGenerationsApi {
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
|
||||
}
|
||||
|
||||
impl ControllerUpcallClient {
|
||||
impl ControlPlaneClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
|
||||
@@ -117,7 +114,7 @@ impl ControllerUpcallClient {
|
||||
}
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(
|
||||
&self,
|
||||
@@ -151,10 +148,10 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
||||
|
||||
match az_id_from_metadata {
|
||||
Some(az_id) => Some(AvailabilityZone(az_id)),
|
||||
Some(az_id) => Some(az_id),
|
||||
None => {
|
||||
tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
|
||||
conf.availability_zone.clone().map(AvailabilityZone)
|
||||
conf.availability_zone.clone()
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -219,38 +216,29 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
.join("validate")
|
||||
.expect("Failed to build validate path");
|
||||
|
||||
// When sending validate requests, break them up into chunks so that we
// avoid possible edge cases of generating any HTTP requests that
// require database I/O across many thousands of tenants.
let mut result: HashMap<TenantShardId, bool> = HashMap::with_capacity(tenants.len());
for tenant_chunk in (tenants).chunks(128) {
let request = ValidateRequest {
tenants: tenant_chunk
.iter()
.map(|(id, generation)| ValidateRequestTenant {
id: *id,
gen: (*generation).into().expect(
"Generation should always be valid for a Tenant doing deletions",
),
})
.collect(),
};
let request = ValidateRequest {
|
||||
tenants: tenants
|
||||
.into_iter()
|
||||
.map(|(id, gen)| ValidateRequestTenant {
|
||||
id,
|
||||
gen: gen
|
||||
.into()
|
||||
.expect("Generation should always be valid for a Tenant doing deletions"),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"control-plane-client-validate-sleep",
|
||||
&self.cancel
|
||||
);
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
}
|
||||
|
||||
let response: ValidateResponse =
|
||||
self.retry_http_forever(&re_attach_path, request).await?;
|
||||
for rt in response.tenants {
|
||||
result.insert(rt.id, rt.valid);
|
||||
}
|
||||
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
}
|
||||
|
||||
Ok(result.into_iter().collect())
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|rt| (rt.id, rt.valid))
|
||||
.collect())
|
||||
}
|
||||
}
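The comment above is the substance of this hunk: the newer side sends validate requests in batches of 128 tenants and merges the per-batch responses into one map, instead of putting every tenant into a single request. A simplified, self-contained sketch of the pattern, with the HTTP call stubbed out and illustrative types:

use std::collections::HashMap;

type TenantId = u64;
type Generation = u32;

struct ValidateRequest {
    tenants: Vec<(TenantId, Generation)>,
}

// Stand-in for the real HTTP call; validates one batch and returns per-tenant results.
fn send_validate(request: &ValidateRequest) -> Vec<(TenantId, bool)> {
    request.tenants.iter().map(|&(id, _gen)| (id, true)).collect()
}

fn validate_chunked(tenants: &[(TenantId, Generation)]) -> HashMap<TenantId, bool> {
    let mut result = HashMap::with_capacity(tenants.len());
    for chunk in tenants.chunks(128) {
        let request = ValidateRequest {
            tenants: chunk.to_vec(),
        };
        // Merge this batch's answers into the overall result map.
        for (id, valid) in send_validate(&request) {
            result.insert(id, valid);
        }
    }
    result
}

Bounding the batch size keeps any single request's server-side database work small, at the cost of a few extra round trips for very large tenant counts.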
|
||||
@@ -6,7 +6,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
@@ -622,7 +622,7 @@ impl DeletionQueue {
|
||||
/// If remote_storage is None, then the returned workers will also be None.
|
||||
pub fn new<C>(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
controller_upcall_client: Option<C>,
|
||||
control_plane_client: Option<C>,
|
||||
conf: &'static PageServerConf,
|
||||
) -> (Self, Option<DeletionQueueWorkers<C>>)
|
||||
where
|
||||
@@ -662,7 +662,7 @@ impl DeletionQueue {
|
||||
conf,
|
||||
backend_rx,
|
||||
executor_tx,
|
||||
controller_upcall_client,
|
||||
control_plane_client,
|
||||
lsn_table.clone(),
|
||||
cancel.clone(),
|
||||
),
|
||||
@@ -704,7 +704,7 @@ mod test {
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
use crate::{
|
||||
controller_upcall_client::RetryForeverError,
|
||||
control_plane_client::RetryForeverError,
|
||||
repository::Key,
|
||||
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
|
||||
};
|
||||
|
||||
@@ -25,8 +25,8 @@ use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
|
||||
use crate::controller_upcall_client::RetryForeverError;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::RetryForeverError;
|
||||
use crate::metrics;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
@@ -61,7 +61,7 @@ where
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
|
||||
// Client for calling into control plane API for validation of deletes
|
||||
controller_upcall_client: Option<C>,
|
||||
control_plane_client: Option<C>,
|
||||
|
||||
// DeletionLists which are waiting generation validation. Not safe to
|
||||
// execute until [`validate`] has processed them.
|
||||
@@ -94,7 +94,7 @@ where
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
controller_upcall_client: Option<C>,
|
||||
control_plane_client: Option<C>,
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
@@ -102,7 +102,7 @@ where
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
controller_upcall_client,
|
||||
control_plane_client,
|
||||
lsn_table,
|
||||
pending_lists: Vec::new(),
|
||||
validated_lists: Vec::new(),
|
||||
@@ -145,8 +145,8 @@ where
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
|
||||
match controller_upcall_client
|
||||
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
|
||||
match control_plane_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
|
||||
@@ -56,7 +56,6 @@ use utils::http::endpoint::request_span;
|
||||
use utils::http::request::must_parse_query_param;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
@@ -81,6 +80,7 @@ use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
||||
@@ -589,10 +589,6 @@ async fn timeline_create_handler(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
HttpErrorBody::from_msg(e.to_string()),
|
||||
),
|
||||
Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(e.to_string()),
|
||||
),
|
||||
Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
HttpErrorBody::from_msg("tenant shutting down".to_string()),
|
||||
@@ -824,7 +820,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
|
||||
let lease = if with_lease {
|
||||
timeline
|
||||
.init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
|
||||
.inspect_err(|_| {
|
||||
warn!("fail to grant a lease to {}", lsn);
|
||||
})
|
||||
@@ -1692,18 +1688,9 @@ async fn lsn_lease_handler(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let result = async {
|
||||
timeline
|
||||
.init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(
|
||||
e.context(format!("invalid lsn lease request at {lsn}")),
|
||||
)
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
@@ -1719,13 +1706,8 @@ async fn timeline_gc_handler(
|
||||
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let gc_result = state
|
||||
.tenant_manager
|
||||
.immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)
|
||||
.await?;
|
||||
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
||||
|
||||
json_response(StatusCode::OK, gc_result)
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
pub mod controller_upcall_client;
|
||||
pub mod control_plane_client;
|
||||
pub mod deletion_queue;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
|
||||
@@ -8,8 +8,6 @@ use metrics::{
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use strum::{EnumCount, VariantNames};
|
||||
use strum_macros::{IntoStaticStr, VariantNames};
|
||||
use tracing::warn;
|
||||
@@ -1385,7 +1383,7 @@ impl SmgrQueryTimePerTimeline {
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
ctx: &'c RequestContext,
|
||||
) -> Option<impl Drop + 'a> {
|
||||
) -> Option<impl Drop + '_> {
|
||||
let start = Instant::now();
|
||||
|
||||
self.global_started[op as usize].inc();
|
||||
@@ -1510,7 +1508,6 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
pub(crate) struct BasebackupQueryTime {
|
||||
ok: Histogram,
|
||||
error: Histogram,
|
||||
client_error: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
@@ -1524,7 +1521,6 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
BasebackupQueryTime {
|
||||
ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
|
||||
error: vec.get_metric_with_label_values(&["error"]).unwrap(),
|
||||
client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1538,7 +1534,7 @@ impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
|
||||
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
@@ -1561,7 +1557,7 @@ impl BasebackupQueryTime {
|
||||
}
|
||||
|
||||
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
|
||||
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
@@ -1580,15 +1576,10 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
// If you want to change the categorization of a specific error, also change it in `log_query_error`.
let metric = match res {
Ok(_) => &self.parent.ok,
Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
if is_expected_io_error(io_error) =>
{
&self.parent.client_error
}
Err(_) => &self.parent.error,
let metric = if res.is_ok() {
&self.parent.ok
} else {
&self.parent.error
};
metric.observe(ex_throttled.as_secs_f64());
}
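The match that is removed above gave basebackup queries three buckets instead of two: ok, an expected client-side disconnect (recorded as client_error), and everything else. A simplified sketch of that categorization; the assumption that "expected" means disconnect-style IO errors mirrors is_expected_io_error, but the error types here are stand-ins, not the real QueryError/ConnectionError:

use std::io;

enum QueryOutcomeBucket {
    Ok,
    ClientError,
    Error,
}

fn is_expected_io_error(e: &io::Error) -> bool {
    // Assumption: ordinary disconnect-style errors are the ones we do not want to alert on.
    matches!(
        e.kind(),
        io::ErrorKind::ConnectionReset | io::ErrorKind::BrokenPipe | io::ErrorKind::UnexpectedEof
    )
}

fn categorize<T>(res: &Result<T, io::Error>) -> QueryOutcomeBucket {
    match res {
        Ok(_) => QueryOutcomeBucket::Ok,
        Err(e) if is_expected_io_error(e) => QueryOutcomeBucket::ClientError,
        Err(_) => QueryOutcomeBucket::Error,
    }
}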
@@ -3217,38 +3208,45 @@ pub(crate) mod tenant_throttling {
|
||||
|
||||
impl TimelineGet {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
|
||||
let per_tenant_label_values = &[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
];
|
||||
TimelineGet {
|
||||
count_accounted_start: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
|
||||
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]),
|
||||
}
|
||||
},
|
||||
count_accounted_finish: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
|
||||
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]),
|
||||
}
|
||||
},
|
||||
wait_time: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: WAIT_USECS.with_label_values(&[KIND]),
|
||||
per_tenant: WAIT_USECS_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]),
|
||||
}
|
||||
},
|
||||
count_throttled: {
|
||||
GlobalAndPerTenantIntCounter {
|
||||
global: WAIT_COUNT.with_label_values(&[KIND]),
|
||||
per_tenant: WAIT_COUNT_PER_TENANT
|
||||
.with_label_values(per_tenant_label_values),
|
||||
per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
|
||||
KIND,
|
||||
&tenant_shard_id.tenant_id.to_string(),
|
||||
&tenant_shard_id.shard_slug().to_string(),
|
||||
]),
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
@@ -273,20 +273,10 @@ async fn page_service_conn_main(
|
||||
info!("Postgres client disconnected ({io_error})");
|
||||
Ok(())
|
||||
} else {
|
||||
let tenant_id = conn_handler.timeline_handles.tenant_id();
|
||||
Err(io_error).context(format!(
|
||||
"Postgres connection error for tenant_id={:?} client at peer_addr={}",
|
||||
tenant_id, peer_addr
|
||||
))
|
||||
Err(io_error).context("Postgres connection error")
|
||||
}
|
||||
}
|
||||
other => {
|
||||
let tenant_id = conn_handler.timeline_handles.tenant_id();
|
||||
other.context(format!(
|
||||
"Postgres query error for tenant_id={:?} client peer_addr={}",
|
||||
tenant_id, peer_addr
|
||||
))
|
||||
}
|
||||
other => other.context("Postgres query error"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -350,10 +340,6 @@ impl TimelineHandles {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn tenant_id(&self) -> Option<TenantId> {
|
||||
self.wrapper.tenant_id.get().copied()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct TenantManagerWrapper {
|
||||
@@ -833,7 +819,7 @@ impl PageServerHandler {
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
|
||||
let lease = timeline
|
||||
.renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||
.inspect_err(|e| {
|
||||
warn!("{e}");
|
||||
})
|
||||
|
||||
@@ -840,36 +840,6 @@ impl Timeline {
|
||||
Ok(total_size * BLCKSZ as u64)
|
||||
}
|
||||
|
||||
/// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
/// for gc-compaction.
///
/// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
/// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
/// be kept only for a specific range of LSN.
///
/// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
/// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
/// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
/// determine which keys to retain/drop for gc-compaction.
///
/// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
/// to be retained for each of the branch LSN.
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_gc_compaction_keyspace(
|
||||
&self,
|
||||
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
|
||||
let metadata_key_begin = Key::metadata_key_range().start;
|
||||
let aux_v1_key = AUX_FILES_KEY;
|
||||
let dense_keyspace = KeySpace {
|
||||
ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
|
||||
};
|
||||
Ok((
|
||||
dense_keyspace,
|
||||
SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
|
||||
))
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed may be removed from the underlying storage (from
|
||||
|
||||
@@ -21,7 +21,6 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
@@ -183,54 +182,27 @@ pub struct TenantSharedResources {
|
||||
pub(super) struct AttachedTenantConf {
|
||||
tenant_conf: TenantConfOpt,
|
||||
location: AttachedLocationConfig,
|
||||
/// The deadline before which we are blocked from GC so that
|
||||
/// leases have a chance to be renewed.
|
||||
lsn_lease_deadline: Option<tokio::time::Instant>,
|
||||
}
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
|
||||
// Sets a deadline before which we cannot proceed to GC due to lsn lease.
|
||||
//
|
||||
// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
|
||||
// length, we guarantee that all the leases we granted before will have a chance to renew
|
||||
// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
|
||||
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
|
||||
Some(
|
||||
tokio::time::Instant::now()
|
||||
+ tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(LsnLease::DEFAULT_LENGTH),
|
||||
)
|
||||
} else {
|
||||
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
|
||||
// because we don't do GC in these modes.
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
tenant_conf,
|
||||
location,
|
||||
lsn_lease_deadline,
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
|
||||
}
|
||||
LocationMode::Attached(attach_conf) => Ok(Self {
|
||||
tenant_conf: location_conf.tenant_conf,
|
||||
location: *attach_conf,
|
||||
}),
|
||||
LocationMode::Secondary(_) => {
|
||||
anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
|
||||
self.lsn_lease_deadline
|
||||
.map(|d| tokio::time::Instant::now() < d)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
struct TimelinePreload {
|
||||
timeline_id: TimelineId,
|
||||
@@ -591,8 +563,6 @@ pub enum CreateTimelineError {
|
||||
AncestorLsn(anyhow::Error),
|
||||
#[error("ancestor timeline is not active")]
|
||||
AncestorNotActive,
|
||||
#[error("ancestor timeline is archived")]
|
||||
AncestorArchived,
|
||||
#[error("tenant shutting down")]
|
||||
ShuttingDown,
|
||||
#[error(transparent)]
|
||||
@@ -1728,11 +1698,6 @@ impl Tenant {
|
||||
return Err(CreateTimelineError::AncestorNotActive);
|
||||
}
|
||||
|
||||
if ancestor_timeline.is_archived() == Some(true) {
|
||||
info!("tried to branch archived timeline");
|
||||
return Err(CreateTimelineError::AncestorArchived);
|
||||
}
|
||||
|
||||
if let Some(lsn) = ancestor_start_lsn.as_mut() {
|
||||
*lsn = lsn.align();
|
||||
|
||||
@@ -1850,11 +1815,6 @@ impl Tenant {
|
||||
info!("Skipping GC in location state {:?}", conf.location);
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
|
||||
if conf.is_gc_blocked_by_lsn_lease_deadline() {
|
||||
info!("Skipping GC because lsn lease deadline is not reached");
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
}
|
||||
|
||||
let _guard = match self.gc_block.start().await {
|
||||
@@ -2008,6 +1968,9 @@ impl Tenant {
|
||||
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
|
||||
}
|
||||
TenantState::Loading => {
|
||||
*current_state = TenantState::Activating(ActivatingFrom::Loading);
|
||||
}
|
||||
TenantState::Attaching => {
|
||||
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
|
||||
}
|
||||
@@ -2188,7 +2151,7 @@ impl Tenant {
|
||||
async fn set_stopping(
|
||||
&self,
|
||||
progress: completion::Barrier,
|
||||
_allow_transition_from_loading: bool,
|
||||
allow_transition_from_loading: bool,
|
||||
allow_transition_from_attaching: bool,
|
||||
) -> Result<(), SetStoppingError> {
|
||||
let mut rx = self.state.subscribe();
|
||||
@@ -2203,6 +2166,7 @@ impl Tenant {
|
||||
);
|
||||
false
|
||||
}
|
||||
TenantState::Loading => allow_transition_from_loading,
|
||||
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
|
||||
})
|
||||
.await
|
||||
@@ -2221,6 +2185,13 @@ impl Tenant {
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
}
|
||||
TenantState::Loading => {
|
||||
if !allow_transition_from_loading {
|
||||
unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
|
||||
};
|
||||
*current_state = TenantState::Stopping { progress };
|
||||
true
|
||||
}
|
||||
TenantState::Active => {
|
||||
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
|
||||
// are created after the transition to Stopping. That's harmless, as the Timelines
|
||||
@@ -2276,7 +2247,7 @@ impl Tenant {
|
||||
// The load & attach routines own the tenant state until it has reached `Active`.
|
||||
// So, wait until it's done.
|
||||
rx.wait_for(|state| match state {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
info!(
|
||||
"waiting for {} to turn Active|Broken|Stopping",
|
||||
<&'static str>::from(state)
|
||||
@@ -2296,7 +2267,7 @@ impl Tenant {
|
||||
let reason = reason.to_string();
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Activating(_) | TenantState::Attaching => {
|
||||
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
|
||||
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
|
||||
}
|
||||
TenantState::Active => {
|
||||
@@ -2340,7 +2311,7 @@ impl Tenant {
|
||||
loop {
|
||||
let current_state = receiver.borrow_and_update().clone();
|
||||
match current_state {
|
||||
TenantState::Attaching | TenantState::Activating(_) => {
|
||||
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
|
||||
// in these states, there's a chance that we can reach ::Active
|
||||
self.activate_now();
|
||||
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
|
||||
@@ -2663,8 +2634,6 @@ impl Tenant {
|
||||
Arc::new(AttachedTenantConf {
|
||||
tenant_conf: new_tenant_conf.clone(),
|
||||
location: inner.location,
|
||||
// Attached location is not changed, no need to update lsn lease deadline.
|
||||
lsn_lease_deadline: inner.lsn_lease_deadline,
|
||||
})
|
||||
});
|
||||
|
||||
@@ -3658,7 +3627,7 @@ impl Tenant {
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<UninitializedTimeline<'a>> {
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let resources = self.build_timeline_resources(new_timeline_id);
|
||||
@@ -3922,9 +3891,9 @@ async fn run_initdb(
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
|
||||
.args(["--pgdata", initdb_target_dir.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.args(["-D", initdb_target_dir.as_ref()])
|
||||
.args(["-U", &conf.superuser])
|
||||
.args(["-E", "utf8"])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
@@ -4175,7 +4144,7 @@ pub(crate) mod harness {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
TenantState::Attaching,
|
||||
TenantState::Loading,
|
||||
self.conf,
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt::from(self.tenant_conf.clone()),
|
||||
@@ -4496,17 +4465,13 @@ mod tests {
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -7283,17 +7248,9 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
#[tokio::test]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
|
||||
.await
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
@@ -7321,33 +7278,24 @@ mod tests {
|
||||
|
||||
let leased_lsns = [0x30, 0x50, 0x70];
|
||||
let mut leases = Vec::new();
|
||||
leased_lsns.iter().for_each(|n| {
|
||||
leases.push(
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect("lease request should succeed"),
|
||||
);
|
||||
let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
|
||||
leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
|
||||
Ok(())
|
||||
});
|
||||
|
||||
let updated_lease_0 = timeline
|
||||
.renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)
|
||||
.expect("lease renewal should succeed");
|
||||
assert_eq!(
|
||||
updated_lease_0.valid_until, leases[0].valid_until,
|
||||
" Renewing with shorter lease should not change the lease."
|
||||
);
|
||||
// Renewing with shorter lease should not change the lease.
|
||||
let updated_lease_0 =
|
||||
timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
|
||||
assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
|
||||
|
||||
let updated_lease_1 = timeline
|
||||
.renew_lsn_lease(
|
||||
Lsn(leased_lsns[1]),
|
||||
timeline.get_lsn_lease_length() * 2,
|
||||
&ctx,
|
||||
)
|
||||
.expect("lease renewal should succeed");
|
||||
assert!(
|
||||
updated_lease_1.valid_until > leases[1].valid_until,
|
||||
"Renewing with a long lease should renew lease with later expiration time."
|
||||
);
|
||||
// Renewing with a long lease should renew lease with later expiration time.
|
||||
let updated_lease_1 = timeline.make_lsn_lease(
|
||||
Lsn(leased_lsns[1]),
|
||||
timeline.get_lsn_lease_length() * 2,
|
||||
&ctx,
|
||||
)?;
|
||||
|
||||
assert!(updated_lease_1.valid_until > leases[1].valid_until);
|
||||
|
||||
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
|
||||
info!(
|
||||
@@ -7364,8 +7312,7 @@ mod tests {
|
||||
&CancellationToken::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
.await?;
|
||||
|
||||
// Keeping everything <= Lsn(0x80) b/c leases:
|
||||
// 0/10: initdb layer
|
||||
@@ -7379,16 +7326,13 @@ mod tests {
|
||||
// Make lease on a already GC-ed LSN.
|
||||
// 0/80 does not have a valid lease + is below latest_gc_cutoff
|
||||
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect_err("lease request on GC-ed LSN should fail");
|
||||
let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
|
||||
assert!(res.is_err());
|
||||
|
||||
// Should still be able to renew a currently valid lease
|
||||
// Assumption: the original lease is still valid for 0/50.
|
||||
// (use `Timeline::init_lsn_lease` for testing so it always does validation)
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect("lease renewal with validation should succeed");
|
||||
let _ =
|
||||
timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ use itertools::Itertools;
|
||||
use super::storage_layer::LayerName;
|
||||
|
||||
/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
|
||||
///
|
||||
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
|
||||
///
|
||||
/// ```plain
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
//! We cannot use global or default config instead, because wrong settings
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use anyhow::bail;
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
@@ -440,6 +441,29 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<toml_edit::Item> for TenantConfOpt {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
|
||||
match item {
|
||||
toml_edit::Item::Value(value) => {
|
||||
let d = value.into_deserializer();
|
||||
return serde_path_to_error::deserialize(d)
|
||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
||||
}
|
||||
toml_edit::Item::Table(table) => {
|
||||
let deserializer =
|
||||
toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
|
||||
return serde_path_to_error::deserialize(deserializer)
|
||||
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
|
||||
}
|
||||
_ => {
|
||||
bail!("expected non-inline table but found {item}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a conversion from our internal tenant config object to the one used
|
||||
/// in external APIs.
|
||||
impl From<TenantConfOpt> for models::TenantConfig {
|
||||
|
||||
@@ -1,12 +1,29 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
use tokio::time::Instant;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
|
||||
/// GcBlock provides persistent (per-timeline) gc blocking.
|
||||
#[derive(Default)]
|
||||
struct Storage {
|
||||
timelines_blocked: TimelinesBlocked,
|
||||
/// The deadline before which we are blocked from GC so that
|
||||
/// leases have a chance to be renewed.
|
||||
lsn_lease_deadline: Option<Instant>,
|
||||
}
|
||||
|
||||
impl Storage {
|
||||
fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
|
||||
self.lsn_lease_deadline
|
||||
.map(|d| Instant::now() < d)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
|
||||
/// blocking.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcBlock {
|
||||
/// The timelines which have current reasons to block gc.
|
||||
@@ -49,6 +66,17 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets a deadline before which we cannot proceed to GC due to lsn lease.
///
/// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
/// length, we guarantee that all the leases we granted before will have a chance to renew
/// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
let deadline = Instant::now() + lsn_lease_length;
let mut g = self.reasons.lock().unwrap();
g.lsn_lease_deadline = Some(deadline);
}
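Because lease grants live only in memory, delaying the first GC after attach by one lease length gives every lease granted before a restart (or an AttachedMulti to AttachedSingle transition) a chance to be renewed. A small standalone sketch of that gate, using std::time instead of the tokio clock and GcBlock state used above:

use std::time::{Duration, Instant};

#[derive(Default)]
struct LeaseGate {
    lsn_lease_deadline: Option<Instant>,
}

impl LeaseGate {
    /// Called on attach: no GC until one full lease length has elapsed.
    fn set_lsn_lease_deadline(&mut self, lsn_lease_length: Duration) {
        self.lsn_lease_deadline = Some(Instant::now() + lsn_lease_length);
    }

    /// The GC loop asks this before doing any work.
    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
        self.lsn_lease_deadline
            .map(|d| Instant::now() < d)
            .unwrap_or(false)
    }
}

fn main() {
    let mut gate = LeaseGate::default();
    gate.set_lsn_lease_deadline(Duration::from_secs(600));
    assert!(gate.is_blocked_by_lsn_lease_deadline());
}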
|
||||
/// Describe the current gc blocking reasons.
|
||||
///
|
||||
/// TODO: make this json serializable.
|
||||
@@ -74,7 +102,7 @@ impl GcBlock {
|
||||
) -> anyhow::Result<bool> {
|
||||
let (added, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
let set = g.entry(timeline.timeline_id).or_default();
|
||||
let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
|
||||
let added = set.insert(reason);
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock, see self.reasons.
|
||||
@@ -105,7 +133,7 @@ impl GcBlock {
|
||||
|
||||
let (remaining_blocks, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
match g.entry(timeline.timeline_id) {
|
||||
match g.timelines_blocked.entry(timeline.timeline_id) {
|
||||
Entry::Occupied(mut oe) => {
|
||||
let set = oe.get_mut();
|
||||
set.remove(reason);
|
||||
@@ -119,7 +147,7 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
let remaining_blocks = g.len();
|
||||
let remaining_blocks = g.timelines_blocked.len();
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
|
||||
let uploaded = timeline
|
||||
@@ -144,11 +172,11 @@ impl GcBlock {
|
||||
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
|
||||
let unblocked = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
if g.is_empty() {
|
||||
if g.timelines_blocked.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
g.remove(&timeline.timeline_id);
|
||||
g.timelines_blocked.remove(&timeline.timeline_id);
|
||||
|
||||
BlockingReasons::clean_and_summarize(g).is_none()
|
||||
};
|
||||
@@ -159,10 +187,11 @@ impl GcBlock {
|
||||
}
|
||||
|
||||
/// Initialize with the non-deleted timelines of this tenant.
|
||||
pub(crate) fn set_scanned(&self, scanned: Storage) {
|
||||
pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
assert!(g.is_empty());
|
||||
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
assert!(g.timelines_blocked.is_empty());
|
||||
g.timelines_blocked
|
||||
.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
|
||||
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
|
||||
tracing::info!(summary=?reasons, "initialized with gc blocked");
|
||||
@@ -176,6 +205,7 @@ pub(super) struct Guard<'a> {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct BlockingReasons {
|
||||
tenant_blocked_by_lsn_lease_deadline: bool,
|
||||
timelines: usize,
|
||||
reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
@@ -184,8 +214,8 @@ impl std::fmt::Display for BlockingReasons {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{} timelines block for {:?}",
|
||||
self.timelines, self.reasons
|
||||
"tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
|
||||
self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -193,13 +223,15 @@ impl std::fmt::Display for BlockingReasons {
|
||||
impl BlockingReasons {
|
||||
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let mut reasons = enumset::EnumSet::empty();
|
||||
g.retain(|_key, value| {
|
||||
g.timelines_blocked.retain(|_key, value| {
|
||||
reasons = reasons.union(*value);
|
||||
!value.is_empty()
|
||||
});
|
||||
if !g.is_empty() {
|
||||
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
|
||||
if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
|
||||
timelines: g.timelines_blocked.len(),
|
||||
reasons,
|
||||
})
|
||||
} else {
|
||||
@@ -208,14 +240,17 @@ impl BlockingReasons {
|
||||
}
|
||||
|
||||
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
if g.is_empty() {
|
||||
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
|
||||
if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
|
||||
None
|
||||
} else {
|
||||
let reasons = g
|
||||
.timelines_blocked
|
||||
.values()
|
||||
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
|
||||
timelines: g.timelines_blocked.len(),
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::controller_upcall_client::{
|
||||
ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError,
|
||||
use crate::control_plane_client::{
|
||||
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
@@ -122,7 +122,7 @@ pub(crate) enum ShardSelector {
|
||||
Known(ShardIndex),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControllerUpcallClient function: rather
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
/// than the serializable struct, we build this enum that encapsulates
|
||||
/// the invariant that attached tenants always have generations.
|
||||
///
|
||||
@@ -219,11 +219,7 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
+ TEMP_FILE_SUFFIX;
|
||||
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
|
||||
fs::rename(path.as_ref(), &tmp_path).await?;
|
||||
fs::File::open(parent)
|
||||
.await?
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("safe_rename_tenant_dir")?;
|
||||
fs::File::open(parent).await?.sync_all().await?;
|
||||
Ok(tmp_path)
|
||||
}
|
||||
|
||||
@@ -345,7 +341,7 @@ async fn init_load_generations(
|
||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||
);
|
||||
emergency_generations(tenant_confs)
|
||||
} else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
|
||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
@@ -953,6 +949,12 @@ impl TenantManager {
|
||||
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
|
||||
match attach_conf.generation.cmp(&tenant.generation) {
|
||||
Ordering::Equal => {
|
||||
if attach_conf.attach_mode == AttachmentMode::Single {
|
||||
tenant
|
||||
.gc_block
|
||||
.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
|
||||
}
|
||||
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
@@ -2197,82 +2199,6 @@ impl TenantManager {
|
||||
|
||||
Ok((wanted_bytes, shard_count as u32))
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn immediate_gc(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<GcResult, ApiError> {
|
||||
let tenant = {
|
||||
let guard = self.tenants.read().unwrap();
|
||||
guard
|
||||
.get(&tenant_shard_id)
|
||||
.cloned()
|
||||
.with_context(|| format!("tenant {tenant_shard_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?
|
||||
};
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx: RequestContext =
|
||||
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
|
||||
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
|
||||
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
// we need to synchronize with drop completion for python tests without polling for
|
||||
// log messages
|
||||
if let Ok(result) = result.as_mut() {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for layer in std::mem::take(&mut result.doomed_layers) {
|
||||
js.spawn(layer.wait_drop());
|
||||
}
|
||||
tracing::info!(
|
||||
total = js.len(),
|
||||
"starting to wait for the gc'd layers to be dropped"
|
||||
);
|
||||
while let Some(res) = js.join_next().await {
|
||||
res.expect("wait_drop should not panic");
|
||||
}
|
||||
}
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false).ok();
|
||||
let rtc = timeline.as_ref().map(|x| &x.remote_client);
|
||||
|
||||
if let Some(rtc) = rtc {
|
||||
// layer drops schedule actions on remote timeline client to actually do the
|
||||
// deletions; don't care about the shutdown error, just exit fast
|
||||
drop(rtc.wait_completion().await);
|
||||
}
|
||||
}
|
||||
|
||||
result.map_err(|e| match e {
|
||||
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
|
||||
GcError::TimelineNotFound => {
|
||||
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
|
||||
}
|
||||
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -2417,7 +2343,7 @@ enum TenantSlotDropError {
|
||||
/// Errors that can happen any time we are walking the tenant map to try and acquire
|
||||
/// the TenantSlot for a particular tenant.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum TenantMapError {
|
||||
pub enum TenantMapError {
|
||||
// Tried to read while initializing
|
||||
#[error("tenant map is still initializing")]
|
||||
StillInitializing,
|
||||
@@ -2447,7 +2373,7 @@ pub(crate) enum TenantMapError {
|
||||
/// The `old_value` may be dropped before the SlotGuard is dropped, by calling
|
||||
/// `drop_old_value`. It is an error to call this without shutting down
|
||||
/// the contents of `old_value`.
|
||||
pub(crate) struct SlotGuard {
|
||||
pub struct SlotGuard {
|
||||
tenant_shard_id: TenantShardId,
|
||||
old_value: Option<TenantSlot>,
|
||||
upserted: bool,
|
||||
@@ -2840,6 +2766,81 @@ use {
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn immediate_gc(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<GcResult, ApiError> {
|
||||
let tenant = {
|
||||
let guard = TENANTS.read().unwrap();
|
||||
guard
|
||||
.get(&tenant_shard_id)
|
||||
.cloned()
|
||||
.with_context(|| format!("tenant {tenant_shard_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?
|
||||
};
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx: RequestContext =
|
||||
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
|
||||
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
|
||||
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
// we need to synchronize with drop completion for python tests without polling for
|
||||
// log messages
|
||||
if let Ok(result) = result.as_mut() {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for layer in std::mem::take(&mut result.doomed_layers) {
|
||||
js.spawn(layer.wait_drop());
|
||||
}
|
||||
tracing::info!(
|
||||
total = js.len(),
|
||||
"starting to wait for the gc'd layers to be dropped"
|
||||
);
|
||||
while let Some(res) = js.join_next().await {
|
||||
res.expect("wait_drop should not panic");
|
||||
}
|
||||
}
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false).ok();
|
||||
let rtc = timeline.as_ref().map(|x| &x.remote_client);
|
||||
|
||||
if let Some(rtc) = rtc {
|
||||
// layer drops schedule actions on remote timeline client to actually do the
|
||||
// deletions; don't care about the shutdown error, just exit fast
|
||||
drop(rtc.wait_completion().await);
|
||||
}
|
||||
}
|
||||
|
||||
result.map_err(|e| match e {
|
||||
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
|
||||
GcError::TimelineNotFound => {
|
||||
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
|
||||
}
|
||||
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
|
||||
})
|
||||
}
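In the testing-only block above, immediate_gc waits for every doomed layer's asynchronous drop to finish before returning, so the Python tests do not have to poll log output. That wait is a plain tokio JoinSet fan-in; a minimal sketch with the layer drop stubbed out:

// Requires the tokio runtime with the "rt", "macros" and "time" features.
use std::time::Duration;
use tokio::task::JoinSet;

async fn wait_drop(layer_id: usize) {
    // Stand-in for Layer::wait_drop(): resolves once the layer is really gone.
    tokio::time::sleep(Duration::from_millis(10 * layer_id as u64)).await;
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let mut js = JoinSet::new();
    for layer_id in 0..4 {
        js.spawn(wait_drop(layer_id));
    }
    println!("starting to wait for {} gc'd layers to be dropped", js.len());
    while let Some(res) = js.join_next().await {
        res.expect("wait_drop should not panic");
    }
}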
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
@@ -178,7 +178,6 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("flush source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -186,7 +185,6 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -234,7 +232,6 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
//! Common traits and structs for layers
|
||||
|
||||
pub mod delta_layer;
|
||||
pub mod filter_iterator;
|
||||
pub mod image_layer;
|
||||
pub mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
|
||||
@@ -39,12 +39,12 @@ use crate::tenant::disk_btree::{
|
||||
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
@@ -589,9 +589,7 @@ impl DeltaLayerWriterInner {
|
||||
);
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("delta_layer sync_all")?;
|
||||
file.sync_all().await?;
|
||||
|
||||
trace!("created delta layer {}", self.path);
|
||||
|
||||
@@ -1023,30 +1021,13 @@ impl DeltaLayerInner {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
|
||||
for meta in blobs_buf.blobs.iter().rev() {
|
||||
if Some(meta.meta.key) == ignore_key_with_err {
|
||||
continue;
|
||||
}
|
||||
let blob_read = meta.read(&view).await;
|
||||
let blob_read = match blob_read {
|
||||
Ok(buf) => buf,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
);
|
||||
|
||||
ignore_key_with_err = Some(meta.meta.key);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let value = Value::des(&blob_read);
|
||||
|
||||
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
|
||||
let value = match value {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
@@ -1135,7 +1116,7 @@ impl DeltaLayerInner {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended,
|
||||
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
|
||||
};
|
||||
use futures::stream::TryStreamExt;
|
||||
|
||||
@@ -1185,8 +1166,8 @@ impl DeltaLayerInner {
|
||||
|
||||
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
|
||||
|
||||
let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mut read_builder: Option<VectoredReadBuilder> = None;
|
||||
let read_mode = VectoredReadCoalesceMode::get();
|
||||
|
||||
let max_read_size = self
|
||||
.max_vectored_read_bytes
|
||||
@@ -1230,12 +1211,12 @@ impl DeltaLayerInner {
|
||||
{
|
||||
None
|
||||
} else {
|
||||
read_builder.replace(ChunkedVectoredReadBuilder::new(
|
||||
read_builder.replace(VectoredReadBuilder::new(
|
||||
offsets.start.pos(),
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
max_read_size,
|
||||
align,
|
||||
read_mode,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -1262,21 +1243,21 @@ impl DeltaLayerInner {
|
||||
buf.reserve(read.size());
|
||||
let res = reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let view = BufView::new_slice(&res.buf);
|
||||
|
||||
for blob in res.blobs {
|
||||
let key = blob.meta.key;
|
||||
let lsn = blob.meta.lsn;
|
||||
|
||||
let data = blob.read(&view).await?;
|
||||
let data = &res.buf[blob.start..blob.end];
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
Value::des(&data)
|
||||
Value::des(data)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"blob failed to deserialize for {}: {:?}",
|
||||
blob,
|
||||
utils::Hex(&data)
|
||||
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
|
||||
blob.meta.key,
|
||||
blob.meta.lsn,
|
||||
blob.start,
|
||||
blob.end,
|
||||
utils::Hex(data)
|
||||
)
|
||||
})
|
||||
.unwrap();
|
||||
@@ -1284,15 +1265,15 @@ impl DeltaLayerInner {
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = crate::repository::ValueBytes::will_init(&data)
|
||||
let will_init = crate::repository::ValueBytes::will_init(data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
})
|
||||
.unwrap_or(false);
|
||||
|
||||
per_blob_copy.clear();
|
||||
per_blob_copy.extend_from_slice(&data);
|
||||
per_blob_copy.extend_from_slice(data);
|
||||
|
||||
let (tmp, res) = writer
|
||||
.put_value_bytes(
|
||||
@@ -1557,11 +1538,8 @@ impl<'a> DeltaLayerIterator<'a> {
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let blob_read = meta.read(&view).await?;
|
||||
let value = Value::des(&blob_read)?;
|
||||
|
||||
let value = Value::des(&frozen_buf[meta.start..meta.end])?;
|
||||
next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
@@ -1938,13 +1916,9 @@ pub(crate) mod test {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
|
||||
.await?;
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = meta.read(&view).await?;
|
||||
assert_eq!(
|
||||
&value[..],
|
||||
&entries_meta.index[&(meta.meta.key, meta.meta.lsn)]
|
||||
);
|
||||
let value = &blobs_buf.buf[meta.start..meta.end];
|
||||
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
|
||||
@@ -1,205 +0,0 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
keyspace::{KeySpace, SparseKeySpace},
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::repository::Value;
|
||||
|
||||
use super::merge_iterator::MergeIterator;
|
||||
|
||||
/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
|
||||
///
|
||||
/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
|
||||
/// to be retained.
|
||||
pub struct FilterIterator<'a> {
|
||||
inner: MergeIterator<'a>,
|
||||
retain_key_filters: Vec<Range<Key>>,
|
||||
current_filter_idx: usize,
|
||||
}
|
||||
|
||||
impl<'a> FilterIterator<'a> {
|
||||
pub fn create(
|
||||
inner: MergeIterator<'a>,
|
||||
dense_keyspace: KeySpace,
|
||||
sparse_keyspace: SparseKeySpace,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut retain_key_filters = Vec::new();
|
||||
retain_key_filters.extend(dense_keyspace.ranges);
|
||||
retain_key_filters.extend(sparse_keyspace.0.ranges);
|
||||
retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start));
|
||||
// Verify key filters are non-overlapping and sorted
|
||||
for window in retain_key_filters.windows(2) {
|
||||
if window[0].end > window[1].start {
|
||||
bail!(
|
||||
"Key filters are overlapping: {:?} and {:?}",
|
||||
window[0],
|
||||
window[1]
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(Self {
|
||||
inner,
|
||||
retain_key_filters,
|
||||
current_filter_idx: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
while let Some(item) = self.inner.next().await? {
|
||||
while self.current_filter_idx < self.retain_key_filters.len()
|
||||
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
|
||||
{
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
self.current_filter_idx += 1;
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
}
|
||||
if self.current_filter_idx >= self.retain_key_filters.len() {
|
||||
// We already exhausted all filters, so we should return now
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter (nothing)
|
||||
return Ok(None);
|
||||
}
|
||||
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
return Ok(Some(item));
|
||||
}
|
||||
// If the key is not contained in the key retaining filters, continue to the next item.
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
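To make the range-filtering logic in `next()` above easier to follow, here is a minimal standalone sketch of the same skip/retain loop. It is not the pageserver code: it assumes plain `u64` keys and a pre-sorted, non-overlapping list of retain ranges, exactly the invariant that `create()` verifies.

```rust
// Standalone sketch of FilterIterator::next, assuming plain u64 keys and
// sorted, non-overlapping retain ranges (the invariant checked in create()).
fn filter_sorted(items: impl Iterator<Item = u64>, filters: &[std::ops::Range<u64>]) -> Vec<u64> {
    let mut idx = 0;
    let mut out = Vec::new();
    for item in items {
        // Skip filter ranges that end at or before the current item.
        while idx < filters.len() && item >= filters[idx].end {
            idx += 1;
        }
        if idx >= filters.len() {
            break; // all filters exhausted: nothing later can match
        }
        if filters[idx].contains(&item) {
            out.push(item); // item falls inside the current retain range
        }
        // Otherwise the item sits in a gap before the current range; drop it.
    }
    out
}

fn main() {
    let filters = [5..10, 20..30];
    let kept = filter_sorted(0..40u64, &filters);
    assert_eq!(kept.len(), 15); // 5 keys from 5..10 plus 10 keys from 20..30
}
```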
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::produce_delta_layer,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
async fn assert_filter_iter_equal(
|
||||
filter_iter: &mut FilterIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = filter_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
assert_eq!(o1.is_some(), o2.is_some());
|
||||
if o1.is_none() && o2.is_none() {
|
||||
break;
|
||||
}
|
||||
let (k1, l1, v1) = o1.unwrap();
|
||||
let (k2, l2, v2) = o2.unwrap();
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, *l2);
|
||||
assert_eq!(&v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 100;
|
||||
let test_deltas1 = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32),
|
||||
Lsn(0x20 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
|
||||
let mut filter_iter = FilterIterator::create(
|
||||
merge_iter,
|
||||
KeySpace {
|
||||
ranges: vec![
|
||||
get_key(5)..get_key(10),
|
||||
get_key(20)..get_key(30),
|
||||
get_key(90)..get_key(110),
|
||||
get_key(1000)..get_key(2000),
|
||||
],
|
||||
},
|
||||
SparseKeySpace(KeySpace::default()),
|
||||
)
|
||||
.unwrap();
|
||||
let mut result = Vec::new();
|
||||
result.extend(test_deltas1[5..10].iter().cloned());
|
||||
result.extend(test_deltas1[20..30].iter().cloned());
|
||||
result.extend(test_deltas1[90..100].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
|
||||
let mut filter_iter = FilterIterator::create(
|
||||
merge_iter,
|
||||
KeySpace {
|
||||
ranges: vec![
|
||||
get_key(0)..get_key(10),
|
||||
get_key(20)..get_key(30),
|
||||
get_key(90)..get_key(95),
|
||||
],
|
||||
},
|
||||
SparseKeySpace(KeySpace::default()),
|
||||
)
|
||||
.unwrap();
|
||||
let mut result = Vec::new();
|
||||
result.extend(test_deltas1[0..10].iter().cloned());
|
||||
result.extend(test_deltas1[20..30].iter().cloned());
|
||||
result.extend(test_deltas1[90..95].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
}
|
||||
}
|
||||
@@ -36,12 +36,11 @@ use crate::tenant::disk_btree::{
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
@@ -548,15 +547,15 @@ impl ImageLayerInner {
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
|
||||
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await?;
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
|
||||
key_count += 1;
|
||||
writer
|
||||
.put_image(meta.meta.key, img_buf.into_bytes(), ctx)
|
||||
.put_image(meta.meta.key, img_buf, ctx)
|
||||
.await
|
||||
.context(format!("Storing key {}", meta.meta.key))?;
|
||||
}
|
||||
@@ -603,28 +602,13 @@ impl ImageLayerInner {
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await;
|
||||
|
||||
let img_buf = match img_buf {
|
||||
Ok(img_buf) => img_buf,
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path,
|
||||
))),
|
||||
);
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
reconstruct_state.update_key(
|
||||
&meta.meta.key,
|
||||
self.lsn,
|
||||
Value::Image(img_buf.into_bytes()),
|
||||
Value::Image(img_buf),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -889,9 +873,7 @@ impl ImageLayerWriterInner {
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("image_layer sync_all")?;
|
||||
file.sync_all().await?;
|
||||
|
||||
trace!("created image layer {}", self.path);
|
||||
|
||||
@@ -1043,15 +1025,10 @@ impl<'a> ImageLayerIterator<'a> {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
let frozen_buf: Bytes = blobs_buf.buf.freeze();
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await?;
|
||||
next_batch.push_back((
|
||||
meta.meta.key,
|
||||
self.image_layer.lsn,
|
||||
Value::Image(img_buf.into_bytes()),
|
||||
));
|
||||
let img_buf = frozen_buf.slice(meta.start..meta.end);
|
||||
next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
|
||||
}
|
||||
self.key_values_batch = next_batch;
|
||||
Ok(())
|
||||
|
||||
@@ -330,6 +330,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
|
||||
let mut first = true;
|
||||
tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
@@ -480,7 +481,8 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
|
||||
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
|
||||
let delta = now - prev;
|
||||
info!(
|
||||
n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
|
||||
n_seconds=%format_args!("{:.3}",
|
||||
delta.as_secs_f64()),
|
||||
count_accounted = count_accounted_finish, // don't break existing log scraping
|
||||
count_throttled,
|
||||
sum_throttled_usecs,
|
||||
|
||||
@@ -66,7 +66,6 @@ use std::{
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
|
||||
@@ -113,7 +112,7 @@ use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE};
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use utils::{
|
||||
completion,
|
||||
generation::Generation,
|
||||
@@ -1325,53 +1324,27 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`.
|
||||
pub(crate) fn init_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
self.make_lsn_lease(lsn, length, true, ctx)
|
||||
}
|
||||
|
||||
/// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period.
|
||||
pub(crate) fn renew_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
self.make_lsn_lease(lsn, length, false, ctx)
|
||||
}
|
||||
|
||||
/// Obtains a temporary lease blocking garbage collection for the given LSN.
|
||||
///
|
||||
/// If we are in `AttachedSingle` mode and not blocked by the lsn lease deadline, this function will error
|
||||
/// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present.
|
||||
///
|
||||
/// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease.
|
||||
/// The returned lease is therefore the maximum between the existing lease and the requesting lease.
|
||||
fn make_lsn_lease(
|
||||
/// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also
|
||||
/// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if
|
||||
/// the request extends the lease. The returned lease is therefore the maximum between the existing lease and
|
||||
/// the requesting lease.
|
||||
pub(crate) fn make_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
init: bool,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
let lease = {
|
||||
// Normalize the requested LSN to be aligned, and move to the first record
|
||||
// if it points to the beginning of the page (header).
|
||||
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
|
||||
|
||||
let mut gc_info = self.gc_info.write().unwrap();
|
||||
|
||||
let valid_until = SystemTime::now() + length;
|
||||
|
||||
let entry = gc_info.leases.entry(lsn);
|
||||
|
||||
match entry {
|
||||
Entry::Occupied(mut occupied) => {
|
||||
let lease = {
|
||||
if let Entry::Occupied(mut occupied) = entry {
|
||||
let existing_lease = occupied.get_mut();
|
||||
if valid_until > existing_lease.valid_until {
|
||||
existing_lease.valid_until = valid_until;
|
||||
@@ -1383,28 +1356,20 @@ impl Timeline {
|
||||
}
|
||||
|
||||
existing_lease.clone()
|
||||
}
|
||||
Entry::Vacant(vacant) => {
|
||||
// Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
|
||||
// not blocked by the lsn lease deadline.
|
||||
let validate = {
|
||||
let conf = self.tenant_conf.load();
|
||||
conf.location.attach_mode == AttachmentMode::Single
|
||||
&& !conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
};
|
||||
|
||||
if init || validate {
|
||||
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
}
|
||||
} else {
|
||||
// Reject already GC-ed LSN (lsn < latest_gc_cutoff)
|
||||
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
}
|
||||
|
||||
let dt: DateTime<Utc> = valid_until.into();
|
||||
info!("lease created, valid until {}", dt);
|
||||
vacant.insert(LsnLease { valid_until }).clone()
|
||||
entry.or_insert(LsnLease { valid_until }).clone()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
lease
|
||||
};
|
||||
|
||||
Ok(lease)
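The renewal rule documented above (an existing lease is only ever extended, never shortened, and the caller always gets back the longer of the two) can be sketched in isolation as follows. This is a hedged simplification: `Lsn` is reduced to a `u64` alias and `upsert_lease` is a free function, whereas the real code keeps the lease map inside `GcInfo` and also applies the GC-cutoff validation shown above.

```rust
use std::collections::btree_map::{BTreeMap, Entry};
use std::time::{Duration, SystemTime};

// Simplified stand-ins: the real Lsn and LsnLease types live in the pageserver crates.
type Lsn = u64;

#[derive(Clone)]
struct LsnLease {
    valid_until: SystemTime,
}

/// Sketch of the "renew only if the request extends the lease" rule.
fn upsert_lease(leases: &mut BTreeMap<Lsn, LsnLease>, lsn: Lsn, length: Duration) -> LsnLease {
    let valid_until = SystemTime::now() + length;
    match leases.entry(lsn) {
        Entry::Occupied(mut occupied) => {
            let existing = occupied.get_mut();
            if valid_until > existing.valid_until {
                existing.valid_until = valid_until; // the request extends the lease
            }
            existing.clone() // otherwise the longer existing lease wins
        }
        Entry::Vacant(vacant) => vacant.insert(LsnLease { valid_until }).clone(),
    }
}

fn main() {
    let mut leases = BTreeMap::new();
    let long = upsert_lease(&mut leases, 0x10, Duration::from_secs(600));
    let short = upsert_lease(&mut leases, 0x10, Duration::from_secs(10));
    // The shorter follow-up request does not shrink the existing lease.
    assert_eq!(long.valid_until, short.valid_until);
}
```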
|
||||
@@ -1981,6 +1946,8 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
// TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
|
||||
#[allow(unused)]
|
||||
pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -3630,7 +3597,7 @@ impl Timeline {
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
@@ -3869,20 +3836,16 @@ impl Timeline {
|
||||
partition_size: u64,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
|
||||
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
|
||||
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
|
||||
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
|
||||
// The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
|
||||
// and hence before the compaction task starts.
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called concurrently, this should not happen"
|
||||
)));
|
||||
anyhow::bail!("repartition() called concurrently, this should not happen");
|
||||
};
|
||||
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
|
||||
if lsn < *partition_lsn {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"repartition() called with LSN going backwards, this should not happen"
|
||||
)));
|
||||
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
|
||||
}
|
||||
|
||||
let distance = lsn.0 - partition_lsn.0;
|
||||
@@ -4484,12 +4447,6 @@ pub(crate) enum CompactionError {
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl CompactionError {
|
||||
pub fn is_cancelled(&self) -> bool {
|
||||
matches!(self, CompactionError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CollectKeySpaceError> for CompactionError {
|
||||
fn from(err: CollectKeySpaceError) -> Self {
|
||||
match err {
|
||||
|
||||
@@ -31,7 +31,6 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
|
||||
use crate::page_cache;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||
@@ -390,7 +389,7 @@ impl Timeline {
|
||||
// error but continue.
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
if !self.cancel.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
(1, false)
|
||||
@@ -1773,7 +1772,6 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
lowest_retain_lsn
|
||||
);
|
||||
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
@@ -1822,12 +1820,7 @@ impl Timeline {
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
|
||||
let mut merge_iter = FilterIterator::create(
|
||||
MergeIterator::create(&delta_layers, &image_layers, ctx),
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
|
||||
@@ -30,8 +30,8 @@ use crate::{
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
|
||||
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
|
||||
storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -557,8 +557,6 @@ impl Timeline {
|
||||
gather_result = gather => {
|
||||
match gather_result {
|
||||
Ok(_) => {},
|
||||
// It can happen sometimes that we hit this instead of the cancellation token firing above
|
||||
Err(CalculateSyntheticSizeError::Cancelled) => {}
|
||||
Err(e) => {
|
||||
// We don't care about the result, but, if it failed, we should log it,
|
||||
// since consumption metric might be hitting the cached value and
|
||||
|
||||
@@ -16,9 +16,8 @@
|
||||
//! Note that the vectored blob api does *not* go through the page cache.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
@@ -36,123 +35,11 @@ pub struct BlobMeta {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
/// A view into the vectored blobs read buffer.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum BufView<'a> {
|
||||
Slice(&'a [u8]),
|
||||
Bytes(bytes::Bytes),
|
||||
}
|
||||
|
||||
impl<'a> BufView<'a> {
|
||||
/// Creates a new slice-based view on the blob.
|
||||
pub fn new_slice(slice: &'a [u8]) -> Self {
|
||||
Self::Slice(slice)
|
||||
}
|
||||
|
||||
/// Creates a new [`bytes::Bytes`]-based view on the blob.
|
||||
pub fn new_bytes(bytes: bytes::Bytes) -> Self {
|
||||
Self::Bytes(bytes)
|
||||
}
|
||||
|
||||
/// Convert the view into `Bytes`.
|
||||
///
|
||||
/// If using slice as the underlying storage, the copy will be an O(n) operation.
|
||||
pub fn into_bytes(self) -> Bytes {
|
||||
match self {
|
||||
BufView::Slice(slice) => Bytes::copy_from_slice(slice),
|
||||
BufView::Bytes(bytes) => bytes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a sub-view of the blob based on the range.
|
||||
fn view(&self, range: std::ops::Range<usize>) -> Self {
|
||||
match self {
|
||||
BufView::Slice(slice) => BufView::Slice(&slice[range]),
|
||||
BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for BufView<'a> {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BufView::Slice(slice) => slice,
|
||||
BufView::Bytes(bytes) => bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> AsRef<[u8]> for BufView<'a> {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
BufView::Slice(slice) => slice,
|
||||
BufView::Bytes(bytes) => bytes.as_ref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for BufView<'a> {
|
||||
fn from(value: &'a [u8]) -> Self {
|
||||
Self::new_slice(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Bytes> for BufView<'_> {
|
||||
fn from(value: Bytes) -> Self {
|
||||
Self::new_bytes(value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte range is potentially compressed,
|
||||
/// subject to [`VectoredBlob::compression_bits`].
|
||||
/// Blob offsets into [`VectoredBlobsBuf::buf`]
|
||||
pub struct VectoredBlob {
|
||||
/// Blob metadata.
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
pub meta: BlobMeta,
|
||||
/// Start offset.
|
||||
start: usize,
|
||||
/// End offset.
|
||||
end: usize,
|
||||
/// Compression used on the blob.
|
||||
compression_bits: u8,
|
||||
}
|
||||
|
||||
impl VectoredBlob {
|
||||
/// Reads a decompressed view of the blob.
|
||||
pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
|
||||
let view = buf.view(self.start..self.end);
|
||||
|
||||
match self.compression_bits {
|
||||
BYTE_UNCOMPRESSED => Ok(view),
|
||||
BYTE_ZSTD => {
|
||||
let mut decompressed_vec = Vec::new();
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder.write_all(&view).await?;
|
||||
decoder.flush().await?;
|
||||
// Zero-copy conversion from `Vec` to `Bytes`
|
||||
Ok(BufView::new_bytes(Bytes::from(decompressed_vec)))
|
||||
}
|
||||
bits => {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end),
|
||||
);
|
||||
Err(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for VectoredBlob {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}@{}, {}..{}",
|
||||
self.meta.key, self.meta.lsn, self.start, self.end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Return type of [`VectoredBlobReader::read_blobs`]
|
||||
@@ -185,7 +72,171 @@ pub(crate) enum VectoredReadExtended {
|
||||
No,
|
||||
}
|
||||
|
||||
/// A vectored read builder that tries to coalesce all reads that fits in a chunk.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum VectoredReadCoalesceMode {
|
||||
/// Only coalesce exactly adjacent reads.
|
||||
AdjacentOnly,
|
||||
/// In addition to adjacent reads, also consider reads whose corresponding
|
||||
/// `end` and `start` offsets reside at the same chunk.
|
||||
Chunked(usize),
|
||||
}
|
||||
|
||||
impl VectoredReadCoalesceMode {
|
||||
/// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
|
||||
/// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
|
||||
pub(crate) fn get() -> Self {
|
||||
let align = virtual_file::get_io_buffer_alignment_raw();
|
||||
if align == 0 {
|
||||
VectoredReadCoalesceMode::AdjacentOnly
|
||||
} else {
|
||||
VectoredReadCoalesceMode::Chunked(align)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum VectoredReadBuilder {
|
||||
Adjacent(AdjacentVectoredReadBuilder),
|
||||
Chunked(ChunkedVectoredReadBuilder),
|
||||
}
|
||||
|
||||
impl VectoredReadBuilder {
|
||||
fn new_impl(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
match mode {
|
||||
VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
|
||||
AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
|
||||
),
|
||||
VectoredReadCoalesceMode::Chunked(chunk_size) => {
|
||||
Self::Chunked(ChunkedVectoredReadBuilder::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
meta,
|
||||
max_read_size,
|
||||
chunk_size,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
|
||||
}
|
||||
|
||||
pub(crate) fn new_streaming(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, None, mode)
|
||||
}
|
||||
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.build(),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.build(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.size(),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.size(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct AdjacentVectoredReadBuilder {
|
||||
/// Start offset of the read.
|
||||
start: u64,
|
||||
// End offset of the read.
|
||||
end: u64,
|
||||
/// Start offset and metadata for each blob in this read
|
||||
blobs_at: VecMap<u64, BlobMeta>,
|
||||
max_read_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl AdjacentVectoredReadBuilder {
|
||||
/// Start building a new vectored read.
|
||||
///
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
/// support reading larger blobs than the configuration value. The builder is,
/// however, single-use after that.
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, meta)
|
||||
.expect("First insertion always succeeds");
|
||||
|
||||
Self {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size,
|
||||
}
|
||||
}
|
||||
/// Attempt to extend the current read with a new blob if the start
|
||||
/// offset matches with the current end of the vectored read
|
||||
/// and the resulting size is below the max read size
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
tracing::trace!(start, end, "trying to extend");
|
||||
let size = (end - start) as usize;
|
||||
let not_limited_by_max_read_size = {
|
||||
if let Some(max_read_size) = self.max_read_size {
|
||||
self.size() + size <= max_read_size
|
||||
} else {
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
if self.end == start && not_limited_by_max_read_size {
|
||||
self.end = end;
|
||||
self.blobs_at
|
||||
.append(start, meta)
|
||||
.expect("LSNs are ordered within vectored reads");
|
||||
|
||||
return VectoredReadExtended::Yes;
|
||||
}
|
||||
|
||||
VectoredReadExtended::No
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
VectoredRead {
|
||||
start: self.start,
|
||||
end: self.end,
|
||||
blobs_at: self.blobs_at,
|
||||
}
|
||||
}
|
||||
}
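As a concrete illustration of the adjacent-only rule, here is a hedged, number-only reduction of the same `extend` decision: the read may grow only when the next blob starts exactly at the current end offset and, if a max read size is configured, the grown read stays within it.

```rust
// Reduced sketch of AdjacentVectoredReadBuilder::extend with plain offsets.
struct AdjacentRead {
    start: u64,
    end: u64,
    max_read_size: Option<usize>,
}

impl AdjacentRead {
    fn size(&self) -> usize {
        (self.end - self.start) as usize
    }

    fn extend(&mut self, start: u64, end: u64) -> bool {
        let size = (end - start) as usize;
        // Streaming reads have no cap; planned reads must stay under max_read_size.
        let fits = self.max_read_size.map_or(true, |max| self.size() + size <= max);
        if self.end == start && fits {
            self.end = end; // the new blob is exactly adjacent: grow the read
            return true;
        }
        false
    }
}

fn main() {
    let mut read = AdjacentRead { start: 0, end: 100, max_read_size: Some(150) };
    assert!(read.extend(100, 150));  // adjacent and under the cap
    assert!(!read.extend(150, 300)); // would exceed max_read_size
    assert!(!read.extend(400, 450)); // not adjacent: a new read must be started
}
```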
|
||||
|
||||
pub(crate) struct ChunkedVectoredReadBuilder {
|
||||
/// Start block number
|
||||
start_blk_no: usize,
|
||||
@@ -209,7 +260,7 @@ impl ChunkedVectoredReadBuilder {
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
/// support reading larger blobs than the configuration value. The builder is,
/// however, single-use after that.
|
||||
fn new_impl(
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
@@ -232,25 +283,6 @@ impl ChunkedVectoredReadBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
align: usize,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align)
|
||||
}
|
||||
|
||||
pub(crate) fn new_streaming(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
align: usize,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, None, align)
|
||||
}
|
||||
|
||||
/// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
|
||||
///
|
||||
/// The resulting size also must be below the max read size.
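A hedged sketch of the chunk-membership test this comment describes, with `chunk_size` standing in for the configured IO buffer alignment. It deliberately leaves out the block-number bookkeeping and the max-read-size check that the real builder also performs.

```rust
/// Simplified chunk-membership check; `chunk_size` stands in for the configured
/// IO buffer alignment. The real `ChunkedVectoredReadBuilder` also enforces the
/// max read size and tracks start/end block numbers.
fn may_coalesce(prev_end: u64, next_start: u64, chunk_size: u64) -> bool {
    let prev_chunk = prev_end / chunk_size;
    let next_chunk = next_start / chunk_size;
    // Same chunk, or the new blob starts in the immediately following chunk.
    next_chunk == prev_chunk || next_chunk == prev_chunk + 1
}

fn main() {
    assert!(may_coalesce(4000, 4090, 512));  // both offsets fall in chunk 7 (3584..4096)
    assert!(may_coalesce(4000, 4200, 512));  // next blob starts in chunk 8, still mergeable
    assert!(!may_coalesce(4000, 9000, 512)); // chunks 7 vs 17: start a new read
}
```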
|
||||
@@ -329,17 +361,17 @@ pub struct VectoredReadPlanner {
|
||||
|
||||
max_read_size: usize,
|
||||
|
||||
align: usize,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
}
|
||||
|
||||
impl VectoredReadPlanner {
|
||||
pub fn new(max_read_size: usize) -> Self {
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size,
|
||||
align,
|
||||
mode,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -400,7 +432,7 @@ impl VectoredReadPlanner {
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Vec<VectoredRead> {
|
||||
let mut current_read_builder: Option<ChunkedVectoredReadBuilder> = None;
|
||||
let mut current_read_builder: Option<VectoredReadBuilder> = None;
|
||||
let mut reads = Vec::new();
|
||||
|
||||
for (key, blobs_for_key) in self.blobs {
|
||||
@@ -413,12 +445,12 @@ impl VectoredReadPlanner {
|
||||
};
|
||||
|
||||
if extended == VectoredReadExtended::No {
|
||||
let next_read_builder = ChunkedVectoredReadBuilder::new(
|
||||
let next_read_builder = VectoredReadBuilder::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.max_read_size,
|
||||
self.align,
|
||||
self.mode,
|
||||
);
|
||||
|
||||
let prev_read_builder = current_read_builder.replace(next_read_builder);
|
||||
@@ -482,7 +514,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
);
|
||||
}
|
||||
|
||||
let buf = self
|
||||
let mut buf = self
|
||||
.file
|
||||
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
|
||||
.await?
|
||||
@@ -497,6 +529,9 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
// of a blob is implicit: the start of the next blob if one exists
|
||||
// or the end of the read.
|
||||
|
||||
// Some scratch space, put here for reusing the allocation
|
||||
let mut decompressed_vec = Vec::new();
|
||||
|
||||
for (blob_start, meta) in blobs_at {
|
||||
let blob_start_in_buf = blob_start - start_offset;
|
||||
let first_len_byte = buf[blob_start_in_buf as usize];
|
||||
@@ -522,14 +557,35 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
)
|
||||
};
|
||||
|
||||
let start = (blob_start_in_buf + size_length) as usize;
|
||||
let end = start + blob_size as usize;
|
||||
let start_raw = blob_start_in_buf + size_length;
|
||||
let end_raw = start_raw + blob_size;
|
||||
let (start, end);
|
||||
if compression_bits == BYTE_UNCOMPRESSED {
|
||||
start = start_raw as usize;
|
||||
end = end_raw as usize;
|
||||
} else if compression_bits == BYTE_ZSTD {
|
||||
let mut decoder =
|
||||
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
|
||||
decoder
|
||||
.write_all(&buf[start_raw as usize..end_raw as usize])
|
||||
.await?;
|
||||
decoder.flush().await?;
|
||||
start = buf.len();
|
||||
buf.extend_from_slice(&decompressed_vec);
|
||||
end = buf.len();
|
||||
decompressed_vec.clear();
|
||||
} else {
|
||||
let error = std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidData,
|
||||
format!("invalid compression byte {compression_bits:x}"),
|
||||
);
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
metas.push(VectoredBlob {
|
||||
start,
|
||||
end,
|
||||
meta: *meta,
|
||||
compression_bits,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -543,7 +599,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
/// `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// max_cnt constraints.
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
read_builder: Option<ChunkedVectoredReadBuilder>,
|
||||
read_builder: Option<VectoredReadBuilder>,
|
||||
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64)>,
|
||||
/// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
|
||||
@@ -554,21 +610,21 @@ pub struct StreamingVectoredReadPlanner {
|
||||
/// Size of the current batch
|
||||
cnt: usize,
|
||||
|
||||
align: usize,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
}
|
||||
|
||||
impl StreamingVectoredReadPlanner {
|
||||
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
|
||||
assert!(max_cnt > 0);
|
||||
assert!(max_read_size > 0);
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
Self {
|
||||
read_builder: None,
|
||||
prev: None,
|
||||
max_cnt,
|
||||
max_read_size,
|
||||
cnt: 0,
|
||||
align,
|
||||
mode,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -617,11 +673,11 @@ impl StreamingVectoredReadPlanner {
|
||||
}
|
||||
None => {
|
||||
self.read_builder = {
|
||||
Some(ChunkedVectoredReadBuilder::new_streaming(
|
||||
Some(VectoredReadBuilder::new_streaming(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.align,
|
||||
self.mode,
|
||||
))
|
||||
};
|
||||
}
|
||||
@@ -947,7 +1003,7 @@ mod tests {
|
||||
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
|
||||
let mut buf = BytesMut::with_capacity(reserved_bytes);
|
||||
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&file);
|
||||
let meta = BlobMeta {
|
||||
key: Key::MIN,
|
||||
@@ -959,19 +1015,13 @@ mod tests {
|
||||
if idx + 1 == offsets.len() {
|
||||
continue;
|
||||
}
|
||||
let read_builder =
|
||||
ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align);
|
||||
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
|
||||
let read = read_builder.build();
|
||||
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
|
||||
assert_eq!(result.blobs.len(), 1);
|
||||
let read_blob = &result.blobs[0];
|
||||
let view = BufView::new_slice(&result.buf);
|
||||
let read_buf = read_blob.read(&view).await?;
|
||||
assert_eq!(
|
||||
&blob[..],
|
||||
&read_buf[..],
|
||||
"mismatch for idx={idx} at offset={offset}"
|
||||
);
|
||||
let read_buf = &result.buf[read_blob.start..read_blob.end];
|
||||
assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
|
||||
buf = result.buf;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -466,7 +466,6 @@ impl VirtualFile {
|
||||
&[]
|
||||
};
|
||||
utils::crashsafe::overwrite(&final_path, &tmp_path, content)
|
||||
.maybe_fatal_err("crashsafe_overwrite")
|
||||
})
|
||||
.await
|
||||
.expect("blocking task is never aborted")
|
||||
@@ -476,7 +475,7 @@ impl VirtualFile {
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
|
||||
let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
|
||||
res.maybe_fatal_err("sync_all")
|
||||
res
|
||||
})
|
||||
}
|
||||
|
||||
@@ -484,7 +483,7 @@ impl VirtualFile {
|
||||
pub async fn sync_data(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
|
||||
let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
|
||||
res.maybe_fatal_err("sync_data")
|
||||
res
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1148,9 +1147,7 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize)
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
if set_io_buffer_alignment(io_buffer_alignment).is_err() {
|
||||
panic!(
|
||||
"IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}"
|
||||
);
|
||||
panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
|
||||
}
|
||||
io_engine::init(engine);
|
||||
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
|
||||
@@ -1177,16 +1174,14 @@ fn get_open_files() -> &'static OpenFiles {
|
||||
|
||||
static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
|
||||
|
||||
/// Returns true if the alignment is a power of two and is greater than or equal to 512.
|
||||
fn is_valid_io_buffer_alignment(align: usize) -> bool {
|
||||
align.is_power_of_two() && align >= 512
|
||||
/// Returns true if `x` is zero or a power of two.
|
||||
fn is_zero_or_power_of_two(x: usize) -> bool {
|
||||
(x == 0) || ((x & (x - 1)) == 0)
|
||||
}
|
||||
|
||||
/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is
|
||||
/// not a power of two or less than 512 bytes.
|
||||
#[allow(unused)]
|
||||
pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
|
||||
if is_valid_io_buffer_alignment(align) {
|
||||
if is_zero_or_power_of_two(align) {
|
||||
IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
|
||||
Ok(())
|
||||
} else {
|
||||
@@ -1194,19 +1189,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the io buffer alignment.
|
||||
/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
|
||||
///
|
||||
/// This function should be used for getting the actual alignment value to use.
|
||||
pub(crate) fn get_io_buffer_alignment() -> usize {
|
||||
/// This function should be used to check the raw config value.
|
||||
pub(crate) fn get_io_buffer_alignment_raw() -> usize {
|
||||
let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
if cfg!(test) {
|
||||
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
|
||||
if let Some(test_align) = utils::env::var(env_var_name) {
|
||||
if is_valid_io_buffer_alignment(test_align) {
|
||||
if is_zero_or_power_of_two(test_align) {
|
||||
test_align
|
||||
} else {
|
||||
panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}");
|
||||
panic!("IO buffer alignment ({test_align}) is not a power of two");
|
||||
}
|
||||
} else {
|
||||
align
|
||||
@@ -1216,6 +1211,14 @@ pub(crate) fn get_io_buffer_alignment() -> usize {
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
|
||||
///
|
||||
/// This function should be used for getting the actual alignment value to use.
|
||||
pub(crate) fn get_io_buffer_alignment() -> usize {
|
||||
let align = get_io_buffer_alignment_raw();
|
||||
align.max(1)
|
||||
}
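Taken together, the contract of these helpers is: a raw value of 0 means "no alignment requirement" (which selects adjacent-only coalescing), any non-zero value must be a power of two, and the effective alignment handed to buffer code is never below 1. A minimal sketch of that contract, under those assumptions:

```rust
// Hedged sketch of the alignment rules above: 0 = no requirement, anything else
// must be a power of two, and the effective alignment is clamped to at least 1.
fn is_zero_or_power_of_two(x: usize) -> bool {
    x == 0 || x.is_power_of_two()
}

fn effective_alignment(raw: usize) -> usize {
    assert!(is_zero_or_power_of_two(raw), "invalid IO buffer alignment {raw}");
    raw.max(1)
}

fn main() {
    assert_eq!(effective_alignment(0), 1);   // no requirement -> adjacent-only coalescing
    assert_eq!(effective_alignment(512), 512);
}
```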
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::context::DownloadBehavior;
|
||||
|
||||
@@ -9,8 +9,6 @@ OBJS = \
|
||||
hll.o \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_pgversioncompat.o \
|
||||
neon_perf_counters.o \
|
||||
neon_utils.o \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
@@ -25,18 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = \
|
||||
neon--1.0.sql \
|
||||
neon--1.0--1.1.sql \
|
||||
neon--1.1--1.2.sql \
|
||||
neon--1.2--1.3.sql \
|
||||
neon--1.3--1.4.sql \
|
||||
neon--1.4--1.5.sql \
|
||||
neon--1.5--1.4.sql \
|
||||
neon--1.4--1.3.sql \
|
||||
neon--1.3--1.2.sql \
|
||||
neon--1.2--1.1.sql \
|
||||
neon--1.1--1.0.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -109,7 +109,6 @@ typedef struct FileCacheControl
|
||||
* reenabling */
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
uint32 used_pages; /* number of used pages */
|
||||
uint32 limit; /* shared copy of lfc_size_limit */
|
||||
uint64 hits;
|
||||
uint64 misses;
|
||||
@@ -906,10 +905,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
|
||||
{
|
||||
lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
|
||||
}
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
@@ -964,7 +959,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
for (int i = 0; i < blocks_in_chunk; i++)
|
||||
{
|
||||
lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1);
|
||||
entry->bitmap[(chunk_offs + i) >> 5] |=
|
||||
(1 << ((chunk_offs + i) & 31));
|
||||
}
|
||||
@@ -1057,11 +1051,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
case 5:
|
||||
key = "file_cache_used_pages";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->used_pages;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_utils.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "walproposer.h"
|
||||
@@ -332,7 +331,6 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
|
||||
}
|
||||
if (shard->conn)
|
||||
{
|
||||
MyNeonCounters->pageserver_disconnects_total++;
|
||||
PQfinish(shard->conn);
|
||||
shard->conn = NULL;
|
||||
}
|
||||
@@ -739,8 +737,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn;
|
||||
|
||||
MyNeonCounters->pageserver_requests_sent_total++;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
@@ -893,7 +889,6 @@ pageserver_flush(shardno_t shard_no)
|
||||
}
|
||||
else
|
||||
{
|
||||
MyNeonCounters->pageserver_send_flushes_total++;
|
||||
if (PQflush(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
@@ -927,7 +922,7 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
||||
static Size
|
||||
PagestoreShmemSize(void)
|
||||
{
|
||||
return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
|
||||
return sizeof(PagestoreShmemState);
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -937,7 +932,7 @@ PagestoreShmemInit(void)
|
||||
|
||||
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
|
||||
pagestore_shared = ShmemInitStruct("libpagestore shared state",
|
||||
sizeof(PagestoreShmemState),
|
||||
PagestoreShmemSize(),
|
||||
&found);
|
||||
if (!found)
|
||||
{
|
||||
@@ -946,9 +941,6 @@ PagestoreShmemInit(void)
|
||||
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
}
|
||||
|
||||
NeonPerfCountersShmemInit();
|
||||
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
return found;
|
||||
}
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
|
||||
|
||||
|
||||
CREATE FUNCTION get_backend_perf_counters()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION get_perf_counters()
|
||||
RETURNS SETOF RECORD
|
||||
AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
-- Show various metrics, for each backend. Note that the values are not reset
|
||||
-- when a backend exits. When a new backend starts with the same backend ID, it will
-- continue accumulating the values from where the old backend left off. If you are
|
||||
-- only interested in the changes from your own session, store the values at the
|
||||
-- beginning of the session somewhere, and subtract them on subsequent calls.
|
||||
--
|
||||
-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
|
||||
CREATE VIEW neon_backend_perf_counters AS
|
||||
SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
|
||||
FROM get_backend_perf_counters() AS P (
|
||||
procno integer,
|
||||
pid integer,
|
||||
metric text,
|
||||
bucket_le float8,
|
||||
value float8
|
||||
);
|
||||
|
||||
-- Summary across all backends. (This could also be implemented with
|
||||
-- an aggregate query over neon_backend_perf_counters view.)
|
||||
CREATE VIEW neon_perf_counters AS
|
||||
SELECT P.metric, P.bucket_le, P.value
|
||||
FROM get_perf_counters() AS P (
|
||||
metric text,
|
||||
bucket_le float8,
|
||||
value float8
|
||||
);
|
||||
@@ -1,4 +0,0 @@
|
||||
DROP VIEW IF EXISTS neon_perf_counters;
|
||||
DROP VIEW IF EXISTS neon_backend_perf_counters;
|
||||
DROP FUNCTION IF EXISTS get_perf_counters();
|
||||
DROP FUNCTION IF EXISTS get_backend_perf_counters();
|
||||
@@ -1,6 +1,6 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
default_version = '1.5'
default_version = '1.4'
module_pathname = '$libdir/neon'
relocatable = true
trusted = true

@@ -1,262 +0,0 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neon_perf_counters.c
|
||||
* Collect statistics about Neon I/O
|
||||
*
|
||||
* Each backend has its own set of counters in shared memory.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "storage/proc.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "utils/builtins.h"
|
||||
|
||||
#include "neon_perf_counters.h"
|
||||
#include "neon_pgversioncompat.h"
|
||||
|
||||
neon_per_backend_counters *neon_per_backend_counters_shared;
|
||||
|
||||
Size
|
||||
NeonPerfCountersShmemSize(void)
|
||||
{
|
||||
Size size = 0;
|
||||
|
||||
size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
|
||||
sizeof(neon_per_backend_counters)));
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
void
|
||||
NeonPerfCountersShmemInit(void)
|
||||
{
|
||||
bool found;
|
||||
|
||||
neon_per_backend_counters_shared =
|
||||
ShmemInitStruct("Neon perf counters",
|
||||
mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
|
||||
sizeof(neon_per_backend_counters)),
|
||||
&found);
|
||||
Assert(found == IsUnderPostmaster);
|
||||
if (!found)
|
||||
{
|
||||
/* shared memory is initialized to zeros, so nothing to do here */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Count a GetPage wait operation.
|
||||
*/
|
||||
void
|
||||
inc_getpage_wait(uint64 latency_us)
|
||||
{
|
||||
int lo = 0;
|
||||
int hi = NUM_GETPAGE_WAIT_BUCKETS - 1;
|
||||
|
||||
/* Find the right bucket with binary search */
|
||||
while (lo < hi)
|
||||
{
|
||||
int mid = (lo + hi) / 2;
|
||||
|
||||
if (latency_us < getpage_wait_bucket_thresholds[mid])
|
||||
hi = mid;
|
||||
else
|
||||
lo = mid + 1;
|
||||
}
|
||||
MyNeonCounters->getpage_wait_us_bucket[lo]++;
|
||||
MyNeonCounters->getpage_wait_us_sum += latency_us;
|
||||
MyNeonCounters->getpage_wait_us_count++;
|
||||
}
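The loop above is a plain lower-bound binary search over the threshold array: it returns the first bucket whose upper bound is strictly greater than the observed latency, with the UINT64_MAX entry acting as the overflow bucket. A hedged Rust sketch of the same search, using a shortened threshold list for brevity:

```rust
// Sketch of the bucket selection in inc_getpage_wait, with a shortened
// threshold list (the real array covers 20us up to 100s plus an overflow bucket).
const THRESHOLDS_US: &[u64] = &[20, 30, 60, 100, 200, 300, 600, 1000, u64::MAX];

fn bucket_for(latency_us: u64) -> usize {
    let (mut lo, mut hi) = (0, THRESHOLDS_US.len() - 1);
    while lo < hi {
        let mid = (lo + hi) / 2;
        if latency_us < THRESHOLDS_US[mid] {
            hi = mid; // latency fits below this threshold: search the left half
        } else {
            lo = mid + 1; // latency is at or above this threshold: search the right half
        }
    }
    lo
}

fn main() {
    assert_eq!(bucket_for(10), 0);        // below the first threshold (20us)
    assert_eq!(bucket_for(25), 1);        // 20us..30us bucket
    assert_eq!(bucket_for(5_000_000), 8); // overflow bucket (u64::MAX threshold)
}
```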
|
||||
|
||||
/*
|
||||
* Support functions for the views, neon_backend_perf_counters and
|
||||
* neon_perf_counters.
|
||||
*/
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char *name;
|
||||
bool is_bucket;
|
||||
double bucket_le;
|
||||
double value;
|
||||
} metric_t;
|
||||
|
||||
static metric_t *
|
||||
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
|
||||
{
|
||||
#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8)
|
||||
metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
|
||||
uint64 bucket_accum;
|
||||
int i = 0;
|
||||
Datum getpage_wait_str;
|
||||
|
||||
metrics[i].name = "getpage_wait_seconds_count";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_wait_us_count;
|
||||
i++;
|
||||
metrics[i].name = "getpage_wait_seconds_sum";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0;
|
||||
i++;
|
||||
|
||||
bucket_accum = 0;
|
||||
for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
|
||||
{
|
||||
uint64 threshold = getpage_wait_bucket_thresholds[bucketno];
|
||||
|
||||
bucket_accum += counters->getpage_wait_us_bucket[bucketno];
|
||||
|
||||
metrics[i].name = "getpage_wait_seconds_bucket";
|
||||
metrics[i].is_bucket = true;
|
||||
metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
|
||||
metrics[i].value = (double) bucket_accum;
|
||||
i++;
|
||||
}
|
||||
metrics[i].name = "getpage_prefetch_requests_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_prefetch_requests_total;
|
||||
i++;
|
||||
metrics[i].name = "getpage_sync_requests_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_sync_requests_total;
|
||||
i++;
|
||||
metrics[i].name = "getpage_prefetch_misses_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_prefetch_misses_total;
|
||||
i++;
|
||||
metrics[i].name = "getpage_prefetch_discards_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->getpage_prefetch_discards_total;
|
||||
i++;
|
||||
metrics[i].name = "pageserver_requests_sent_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->pageserver_requests_sent_total;
|
||||
i++;
|
||||
metrics[i].name = "pageserver_disconnects_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->pageserver_disconnects_total;
|
||||
i++;
|
||||
metrics[i].name = "pageserver_send_flushes_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->pageserver_send_flushes_total;
|
||||
i++;
|
||||
metrics[i].name = "file_cache_hits_total";
|
||||
metrics[i].is_bucket = false;
|
||||
metrics[i].value = (double) counters->file_cache_hits_total;
|
||||
i++;
|
||||
|
||||
Assert(i == NUM_METRICS);
|
||||
|
||||
/* NULL entry marks end of array */
|
||||
metrics[i].name = NULL;
|
||||
metrics[i].value = 0;
|
||||
|
||||
return metrics;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write metric to three output Datums
|
||||
*/
|
||||
static void
|
||||
metric_to_datums(metric_t *m, Datum *values, bool *nulls)
|
||||
{
|
||||
values[0] = CStringGetTextDatum(m->name);
|
||||
nulls[0] = false;
|
||||
if (m->is_bucket)
|
||||
{
|
||||
values[1] = Float8GetDatum(m->bucket_le);
|
||||
nulls[1] = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
values[1] = (Datum) 0;
|
||||
nulls[1] = true;
|
||||
}
|
||||
values[2] = Float8GetDatum(m->value);
|
||||
nulls[2] = false;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_backend_perf_counters);
|
||||
Datum
|
||||
neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
Datum values[5];
|
||||
bool nulls[5];
|
||||
|
||||
/* We put all the tuples into a tuplestore in one go. */
|
||||
InitMaterializedSRF(fcinfo, 0);
|
||||
|
||||
for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
|
||||
{
|
||||
PGPROC *proc = GetPGProcByNumber(procno);
|
||||
int pid = proc->pid;
|
||||
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
|
||||
metric_t *metrics = neon_perf_counters_to_metrics(counters);
|
||||
|
||||
values[0] = Int32GetDatum(procno);
|
||||
nulls[0] = false;
|
||||
values[1] = Int32GetDatum(pid);
|
||||
nulls[1] = false;
|
||||
|
||||
for (int i = 0; metrics[i].name != NULL; i++)
|
||||
{
|
||||
metric_to_datums(&metrics[i], &values[2], &nulls[2]);
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
|
||||
pfree(metrics);
|
||||
}
|
||||
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_perf_counters);
|
||||
Datum
|
||||
neon_get_perf_counters(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
Datum values[3];
|
||||
bool nulls[3];
|
||||
Datum getpage_wait_str;
|
||||
neon_per_backend_counters totals = {0};
|
||||
metric_t *metrics;
|
||||
|
||||
/* We put all the tuples into a tuplestore in one go. */
|
||||
InitMaterializedSRF(fcinfo, 0);
|
||||
|
||||
/* Aggregate the counters across all backends */
|
||||
for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
|
||||
{
|
||||
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
|
||||
|
||||
totals.getpage_wait_us_count += counters->getpage_wait_us_count;
|
||||
totals.getpage_wait_us_sum += counters->getpage_wait_us_sum;
|
||||
for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
|
||||
totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno];
|
||||
totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
|
||||
totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
|
||||
totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
|
||||
totals.getpage_prefetch_discards_total += counters->getpage_prefetch_discards_total;
|
||||
totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total;
|
||||
totals.pageserver_disconnects_total += counters->pageserver_disconnects_total;
|
||||
totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total;
|
||||
totals.file_cache_hits_total += counters->file_cache_hits_total;
|
||||
}
|
||||
|
||||
metrics = neon_perf_counters_to_metrics(&totals);
|
||||
for (int i = 0; metrics[i].name != NULL; i++)
|
||||
{
|
||||
metric_to_datums(&metrics[i], &values[0], &nulls[0]);
|
||||
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
|
||||
}
|
||||
pfree(metrics);
|
||||
|
||||
return (Datum) 0;
|
||||
}
|
||||
@@ -1,119 +0,0 @@
/*-------------------------------------------------------------------------
 *
 * neon_perf_counters.h
 *	  Performance counters for neon storage requests
 *-------------------------------------------------------------------------
 */

#ifndef NEON_PERF_COUNTERS_H
#define NEON_PERF_COUNTERS_H

#if PG_VERSION_NUM >= 170000
#include "storage/procnumber.h"
#else
#include "storage/backendid.h"
#include "storage/proc.h"
#endif

static const uint64 getpage_wait_bucket_thresholds[] = {
	20, 30, 60, 100,							/* 0 - 100 us */
	200, 300, 600, 1000,						/* 100 us - 1 ms */
	2000, 3000, 6000, 10000,					/* 1 ms - 10 ms */
	20000, 30000, 60000, 100000,				/* 10 ms - 100 ms */
	200000, 300000, 600000, 1000000,			/* 100 ms - 1 s */
	2000000, 3000000, 6000000, 10000000,		/* 1 s - 10 s */
	20000000, 30000000, 60000000, 100000000,	/* 10 s - 100 s */
	UINT64_MAX,
};
#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds))

typedef struct
{
	/*
	 * Histogram for how long an smgrread() request needs to wait for response
	 * from pageserver. When prefetching is effective, these wait times can be
	 * lower than the network latency to the pageserver, even zero, if the
	 * page is already readily prefetched whenever we need to read a page.
	 *
	 * Note: we accumulate these in microseconds, because that's convenient in
	 * the backend, but the 'neon_backend_perf_counters' view will convert
	 * them to seconds, to make them more idiomatic as prometheus metrics.
	 */
	uint64		getpage_wait_us_count;
	uint64		getpage_wait_us_sum;
	uint64		getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS];

	/*
	 * Total number of speculative prefetch GetPage requests and synchronous
	 * GetPage requests sent.
	 */
	uint64		getpage_prefetch_requests_total;
	uint64		getpage_sync_requests_total;

	/* XXX: It's not clear to me when these misses happen. */
	uint64		getpage_prefetch_misses_total;

	/*
	 * Number of prefetched responses that were discarded because the
	 * prefetched page was not needed or because it was concurrently fetched /
	 * modified by another backend.
	 */
	uint64		getpage_prefetch_discards_total;

	/*
	 * Total number of requests sent to pageserver. (prefetch_requests_total
	 * and sync_request_total count only GetPage requests, this counts all
	 * request types.)
	 */
	uint64		pageserver_requests_sent_total;

	/*
	 * Number of times the connection to the pageserver was lost and the
	 * backend had to reconnect. Note that this doesn't count the first
	 * connection in each backend, only reconnects.
	 */
	uint64		pageserver_disconnects_total;

	/*
	 * Number of network flushes to the pageserver. Synchronous requests are
	 * flushed immediately, but when prefetch requests are sent in batches,
	 * this can be smaller than pageserver_requests_sent_total.
	 */
	uint64		pageserver_send_flushes_total;

	/*
	 * Number of requests satisfied from the LFC.
	 *
	 * This is redundant with the server-wide file_cache_hits, but this gives
	 * per-backend granularity, and it's handy to have this in the same place
	 * as counters for requests that went to the pageserver. Maybe move all
	 * the LFC stats to this struct in the future?
	 */
	uint64		file_cache_hits_total;

} neon_per_backend_counters;

/* Pointer to the shared memory array of neon_per_backend_counters structs */
extern neon_per_backend_counters *neon_per_backend_counters_shared;

/*
 * Size of the perf counters array in shared memory. One slot for each backend
 * and aux process. IOW one for each PGPROC slot, except for slots reserved
 * for prepared transactions, because they're not real processes and cannot do
 * I/O.
 */
#define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS)

#if PG_VERSION_NUM >= 170000
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
#else
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProc->pgprocno])
#endif

extern void inc_getpage_wait(uint64 latency);

extern Size NeonPerfCountersShmemSize(void);
extern void NeonPerfCountersShmemInit(void);


#endif							/* NEON_PERF_COUNTERS_H */
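The bucket thresholds above grow roughly exponentially (20-30-60-100 per decade), and the final `UINT64_MAX` entry guarantees that every observed wait lands in some bucket. The body of `inc_getpage_wait` is not part of this diff; the sketch below only illustrates, in Rust and with made-up names, how a measured wait in microseconds maps onto such a threshold table.

```rust
/// Hypothetical illustration of histogram bucketing against an ascending
/// threshold table; mirrors the layout of `getpage_wait_bucket_thresholds`.
const BUCKET_THRESHOLDS: [u64; 29] = [
    20, 30, 60, 100,                                  // 0 - 100 us
    200, 300, 600, 1_000,                             // 100 us - 1 ms
    2_000, 3_000, 6_000, 10_000,                      // 1 ms - 10 ms
    20_000, 30_000, 60_000, 100_000,                  // 10 ms - 100 ms
    200_000, 300_000, 600_000, 1_000_000,             // 100 ms - 1 s
    2_000_000, 3_000_000, 6_000_000, 10_000_000,      // 1 s - 10 s
    20_000_000, 30_000_000, 60_000_000, 100_000_000,  // 10 s - 100 s
    u64::MAX,                                         // catch-all
];

#[derive(Default)]
struct GetpageWaitHistogram {
    count: u64,
    sum_us: u64,
    buckets: [u64; 29],
}

impl GetpageWaitHistogram {
    /// Record one wait: bump the running count and sum, plus the first
    /// bucket whose upper bound is >= the observed latency.
    fn inc(&mut self, latency_us: u64) {
        self.count += 1;
        self.sum_us += latency_us;
        let idx = BUCKET_THRESHOLDS
            .iter()
            .position(|&t| latency_us <= t)
            .expect("u64::MAX sentinel guarantees a match");
        self.buckets[idx] += 1;
    }
}

fn main() {
    let mut h = GetpageWaitHistogram::default();
    h.inc(0);          // served from a completed prefetch: first bucket
    h.inc(450);        // 450 us -> the "600" bucket
    h.inc(2_500_000);  // 2.5 s -> the "3000000" bucket
    println!("count={} sum_us={}", h.count, h.sum_us);
}
```

Because each bucket counts everything up to its threshold only when the cumulative view is built, an exporter that turns this into Prometheus-style `_bucket` series would accumulate the per-bucket counts in order; that conversion happens in the `neon_backend_perf_counters` view mentioned in the comment above.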
@@ -1,44 +0,0 @@
/*
 * Support functions for the compatibility macros in neon_pgversioncompat.h
 */
#include "postgres.h"

#include "funcapi.h"
#include "miscadmin.h"
#include "utils/tuplestore.h"

#include "neon_pgversioncompat.h"

#if PG_MAJORVERSION_NUM < 15
void
InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
{
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	Tuplestorestate *tupstore;
	MemoryContext old_context,
				per_query_ctx;
	TupleDesc	stored_tupdesc;

	/* check to see if caller supports returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));

	/*
	 * Store the tuplestore and the tuple descriptor in ReturnSetInfo. This
	 * must be done in the per-query memory context.
	 */
	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	old_context = MemoryContextSwitchTo(per_query_ctx);

	if (get_call_result_type(fcinfo, NULL, &stored_tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	tupstore = tuplestore_begin_heap(false, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = tupstore;
	rsinfo->setDesc = stored_tupdesc;
	MemoryContextSwitchTo(old_context);
}
#endif
@@ -6,8 +6,6 @@
#ifndef NEON_PGVERSIONCOMPAT_H
#define NEON_PGVERSIONCOMPAT_H

#include "fmgr.h"

#if PG_MAJORVERSION_NUM < 17
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
#else
@@ -125,8 +123,4 @@
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
#endif

#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
#endif

#endif							/* NEON_PGVERSIONCOMPAT_H */

@@ -66,7 +66,6 @@
#include "storage/md.h"
#include "storage/smgr.h"

#include "neon_perf_counters.h"
#include "pagestore_client.h"
#include "bitmap.h"

@@ -290,6 +289,7 @@ static PrefetchState *MyPState;

static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
static bool prefetch_wait_for(uint64 ring_index);
@@ -780,27 +780,21 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
}

/*
 * prefetch_register_bufferv() - register and prefetch buffers
 * prefetch_register_buffer() - register and prefetch buffer
 *
 * Register that we may want the contents of BufferTag in the near future.
 * This is used when issuing a speculative prefetch request, but also when
 * performing a synchronous request and we need the buffer right now.
 *
 * If force_request_lsns is not NULL, those values are sent to the
 * pageserver. If NULL, we use the lastWrittenLsn infrastructure
 * to calculate the LSNs to send.
 *
 * When performing a prefetch rather than a synchronous request,
 * is_prefetch==true. Currently, it only affects how the request is accounted
 * in the perf counters.
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash, which
 * invalidates any active pointers into the hash table.
 */

static uint64
prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
						  BlockNumber nblocks, const bits8 *mask,
						  bool is_prefetch)
						  BlockNumber nblocks, const bits8 *mask)
{
	uint64		min_ring_index;
	PrefetchRequest req;
@@ -821,7 +815,6 @@ Retry:
		PrfHashEntry *entry = NULL;
		uint64		ring_index;
		neon_request_lsns *lsns;

		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
			continue;

@@ -865,7 +858,6 @@ Retry:
				prefetch_set_unused(ring_index);
				entry = NULL;
				slot = NULL;
				MyNeonCounters->getpage_prefetch_discards_total++;
			}
		}

@@ -980,11 +972,6 @@ Retry:

		min_ring_index = Min(min_ring_index, ring_index);

		if (is_prefetch)
			MyNeonCounters->getpage_prefetch_requests_total++;
		else
			MyNeonCounters->getpage_sync_requests_total++;

		prefetch_do_request(slot, lsns);
	}

@@ -1013,6 +1000,13 @@ Retry:
}


static uint64
prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
{
	return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
}


/*
 * Note: this function can get canceled and use a long jump to the next catch
 * context. Take care.
@@ -1773,20 +1767,6 @@ neon_init(void)
	if (MyPState != NULL)
		return;

	/*
	 * Sanity check that the perf counters array is sized correctly. We got
	 * this wrong once, and the formula for the max number of backends and aux
	 * processes might well change in the future, so better safe than sorry.
	 * This is a very cheap check so we do it even without assertions. On
	 * v14, this gets called before initializing MyProc, so we cannot perform
	 * the check here. That's OK, we don't expect the logic to change in old
	 * releases.
	 */
#if PG_VERSION_NUM >= 150000
	if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
		elog(ERROR, "MyNeonCounters points past end of array");
#endif

	prfs_size = offsetof(PrefetchState, prf_buffer) +
		sizeof(PrefetchRequest) * readahead_buffer_size;

@@ -2632,7 +2612,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
			lfc_present[i] = ~(lfc_present[i]);

		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
											   lfc_present, true);
											   lfc_present);
		nblocks -= iterblocks;
		blocknum += iterblocks;

@@ -2676,7 +2656,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));

	ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
	ring_index = prefetch_register_buffer(tag, NULL);

	Assert(ring_index < MyPState->ring_unused &&
		   MyPState->ring_last <= ring_index);
@@ -2767,20 +2747,17 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
	 * weren't for the behaviour of the LwLsn cache that uses the highest
	 * value of the LwLsn cache when the entry is not found.
	 */
	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false);
	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask);

	for (int i = 0; i < nblocks; i++)
	{
		void	   *buffer = buffers[i];
		BlockNumber blockno = base_blockno + i;
		neon_request_lsns *reqlsns = &request_lsns[i];
		TimestampTz start_ts, end_ts;

		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
			continue;

		start_ts = GetCurrentTimestamp();

		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
			XLogWaitForReplayOf(reqlsns[0].request_lsn);

@@ -2817,7 +2794,6 @@ Retry:
			/* drop caches */
			prefetch_set_unused(slot->my_ring_index);
			pgBufferUsage.prefetch.expired += 1;
			MyNeonCounters->getpage_prefetch_discards_total++;
			/* make it look like a prefetch cache miss */
			entry = NULL;
		}
@@ -2828,9 +2804,8 @@ Retry:
		if (entry == NULL)
		{
			pgBufferUsage.prefetch.misses += 1;
			MyNeonCounters->getpage_prefetch_misses_total++;

			ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false);
			ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL);
			Assert(ring_index != UINT64_MAX);
			slot = GetPrfSlot(ring_index);
		}
@@ -2885,9 +2860,6 @@ Retry:
		/* buffer was used, clean up for later reuse */
		prefetch_set_unused(ring_index);
		prefetch_cleanup_trailing_unused();

		end_ts = GetCurrentTimestamp();
		inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
	}
}

@@ -2941,7 +2913,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
	/* Try to read from local file cache */
	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
	{
		MyNeonCounters->file_cache_hits_total++;
		return;
	}

@@ -3126,7 +3097,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
			/* assume heap */
			RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
			RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);


			if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
			{
				neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",

@@ -422,9 +422,6 @@ backpressure_throttling_impl(void)
	TimestampTz start,
				stop;
	bool		retry = false;
	char	   *new_status = NULL;
	const char *old_status;
	int			len;

	if (PointerIsValid(PrevProcessInterruptsCallback))
		retry = PrevProcessInterruptsCallback();
@@ -445,24 +442,14 @@ backpressure_throttling_impl(void)
	if (lag == 0)
		return retry;


	old_status = get_ps_display(&len);
	new_status = (char *) palloc(len + 64 + 1);
	memcpy(new_status, old_status, len);
	snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
	set_ps_display(new_status);
	new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
	/* Suspend writers until replicas catch up */
	set_ps_display("backpressure throttling");

	elog(DEBUG2, "backpressure throttling: lag %lu", lag);
	start = GetCurrentTimestamp();
	pg_usleep(BACK_PRESSURE_DELAY);
	stop = GetCurrentTimestamp();
	pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start);

	/* Reset ps display */
	set_ps_display(new_status);
	pfree(new_status);

	return true;
}

@@ -1486,33 +1473,11 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
{
	NeonWALReadResult res;

#if PG_MAJORVERSION_NUM >= 17
	if (!sk->wp->config->syncSafekeepers)
	{
		Size		rbytes;
		rbytes = WALReadFromBuffers(buf, startptr, count,
									walprop_pg_get_timeline_id());

		startptr += rbytes;
		count -= rbytes;
	}
#endif

	if (count == 0)
	{
		res = NEON_WALREAD_SUCCESS;
	}
	else
	{
		Assert(count > 0);

		/* Now read the remaining WAL from the WAL file */
		res = NeonWALRead(sk->xlogreader,
						  buf,
						  startptr,
						  count,
						  walprop_pg_get_timeline_id());
	}
	res = NeonWALRead(sk->xlogreader,
					  buf,
					  startptr,
					  count,
					  walprop_pg_get_timeline_id());

	if (res == NEON_WALREAD_SUCCESS)
	{

@@ -24,12 +24,12 @@ bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
chrono.workspace = true
clap.workspace = true
compute_api.workspace = true
consumption_metrics.workspace = true
dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
hashlink.workspace = true
hex.workspace = true
@@ -82,6 +82,7 @@ tokio-postgres-rustls.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true

@@ -80,14 +80,6 @@ pub(crate) trait TestBackend: Send + Sync + 'static {
    fn get_allowed_ips_and_secret(
        &self,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
    fn dyn_clone(&self) -> Box<dyn TestBackend>;
}

#[cfg(test)]
impl Clone for Box<dyn TestBackend> {
    fn clone(&self) -> Self {
        TestBackend::dyn_clone(&**self)
    }
}

impl std::fmt::Display for Backend<'_, (), ()> {
@@ -452,7 +444,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
            Self::Web(url, ()) => {
                info!("performing web authentication");

                let info = web::authenticate(ctx, config, &url, client).await?;
                let info = web::authenticate(ctx, &url, client).await?;

                Backend::Web(url, info)
            }
@@ -565,7 +557,7 @@ mod tests {
        stream::{PqStream, Stream},
    };

    use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter};
    use super::{auth_quirks, AuthRateLimiter};

    struct Auth {
        ips: Vec<IpPattern>,
@@ -593,14 +585,6 @@ mod tests {
            ))
        }

        async fn get_endpoint_jwks(
            &self,
            _ctx: &RequestMonitoring,
            _endpoint: crate::EndpointId,
        ) -> anyhow::Result<Vec<super::jwt::AuthRule>> {
            unimplemented!()
        }

        async fn wake_compute(
            &self,
            _ctx: &RequestMonitoring,
@@ -611,15 +595,12 @@ mod tests {
    }

    static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
        jwks_cache: JwkCache::default(),
        thread_pool: ThreadPool::new(1),
        scram_protocol_timeout: std::time::Duration::from_secs(5),
        rate_limiter_enabled: true,
        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
        rate_limit_ip_subnet: 64,
        ip_allowlist_check_enabled: true,
        is_auth_broker: false,
        accept_jwts: false,
    });

    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {

@@ -8,14 +8,11 @@ use anyhow::{bail, ensure, Context};
use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use serde::{de::Visitor, Deserialize, Deserializer};
use serde::{Deserialize, Deserializer};
use signature::Verifier;
use tokio::time::Instant;

use crate::{
    context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId,
    RoleName,
};
use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};

// TODO(conrad): make these configurable.
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
@@ -30,6 +27,7 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
        &self,
        ctx: &RequestMonitoring,
        endpoint: EndpointId,
        role_name: RoleName,
    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
}

@@ -37,11 +35,10 @@ pub(crate) struct AuthRule {
    pub(crate) id: String,
    pub(crate) jwks_url: url::Url,
    pub(crate) audience: Option<String>,
    pub(crate) role_names: Vec<RoleNameInt>,
}

#[derive(Default)]
pub struct JwkCache {
pub(crate) struct JwkCache {
    client: reqwest::Client,

    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
@@ -57,28 +54,18 @@ pub(crate) struct JwkCacheEntry {
}

impl JwkCacheEntry {
    fn find_jwk_and_audience(
        &self,
        key_id: &str,
        role_name: &RoleName,
    ) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
        self.key_sets
            .values()
            // make sure our requested role has access to the key set
            .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name))
            // try and find the requested key-id in the key set
            .find_map(|key_set| {
                key_set
                    .find_key(key_id)
                    .map(|jwk| (jwk, key_set.audience.as_deref()))
            })
    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
        self.key_sets.values().find_map(|key_set| {
            key_set
                .find_key(key_id)
                .map(|jwk| (jwk, key_set.audience.as_deref()))
        })
    }
}

struct KeySet {
    jwks: jose_jwk::JwkSet,
    audience: Option<String>,
    role_names: Vec<RoleNameInt>,
}

impl KeySet {
@@ -119,6 +106,7 @@ impl JwkCacheEntryLock {
        ctx: &RequestMonitoring,
        client: &reqwest::Client,
        endpoint: EndpointId,
        role_name: RoleName,
        auth_rules: &F,
    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
        // double check that no one beat us to updating the cache.
@@ -131,10 +119,11 @@ impl JwkCacheEntryLock {
            }
        }

        let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?;
        let rules = auth_rules
            .fetch_auth_rules(ctx, endpoint, role_name)
            .await?;
        let mut key_sets =
            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());

        // TODO(conrad): run concurrently
        // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
        for rule in rules {
@@ -162,7 +151,6 @@ impl JwkCacheEntryLock {
                KeySet {
                    jwks,
                    audience: rule.audience,
                    role_names: rule.role_names,
                },
            );
        }
@@ -185,6 +173,7 @@ impl JwkCacheEntryLock {
        ctx: &RequestMonitoring,
        client: &reqwest::Client,
        endpoint: EndpointId,
        role_name: RoleName,
        fetch: &F,
    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
        let now = Instant::now();
@@ -194,7 +183,9 @@ impl JwkCacheEntryLock {
        let Some(cached) = guard else {
            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;
            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
            return self
                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
                .await;
        };

        let last_update = now.duration_since(cached.last_retrieved);
@@ -205,7 +196,9 @@ impl JwkCacheEntryLock {
            let permit = self.acquire_permit().await;

            // it's been too long since we checked the keys. wait for them to update.
            return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
            return self
                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
                .await;
        }

        // every 5 minutes we should spawn a job to eagerly update the token.
@@ -219,7 +212,7 @@ impl JwkCacheEntryLock {
            let ctx = ctx.clone();
            tokio::spawn(async move {
                if let Err(e) = entry
                    .renew_jwks(permit, &ctx, &client, endpoint, &fetch)
                    .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
                    .await
                {
                    tracing::warn!(error=?e, "could not fetch JWKs in background job");
@@ -239,7 +232,7 @@ impl JwkCacheEntryLock {
        jwt: &str,
        client: &reqwest::Client,
        endpoint: EndpointId,
        role_name: &RoleName,
        role_name: RoleName,
        fetch: &F,
    ) -> Result<(), anyhow::Error> {
        // JWT compact form is defined to be
@@ -261,22 +254,30 @@ impl JwkCacheEntryLock {
        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;

        ensure!(header.typ == "JWT");
        let kid = header.key_id.context("missing key id")?;

        let mut guard = self
            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch)
            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
            .await?;

        // get the key from the JWKs if possible. If not, wait for the keys to update.
        let (jwk, expected_audience) = loop {
            match guard.find_jwk_and_audience(kid, role_name) {
            match guard.find_jwk_and_audience(kid) {
                Some(jwk) => break jwk,
                None if guard.last_retrieved.elapsed() > MIN_RENEW => {
                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);

                    let permit = self.acquire_permit().await;
                    guard = self
                        .renew_jwks(permit, ctx, client, endpoint.clone(), fetch)
                        .renew_jwks(
                            permit,
                            ctx,
                            client,
                            endpoint.clone(),
                            role_name.clone(),
                            fetch,
                        )
                        .await?;
                }
                _ => {
@@ -295,7 +296,7 @@ impl JwkCacheEntryLock {
                verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
            }
            jose_jwk::Key::Rsa(key) => {
                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?;
                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
            }
            key => bail!("unsupported key type {key:?}"),
        };
@@ -307,24 +308,23 @@ impl JwkCacheEntryLock {

        tracing::debug!(?payload, "JWT signature valid with claims");

        if let Some(aud) = expected_audience {
            ensure!(
                payload.audience.0.iter().any(|s| s == aud),
                "invalid JWT token audience"
            );
        match (expected_audience, payload.audience) {
            // check the audience matches
            (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
            // the audience is expected but is missing
            (Some(_), None) => bail!("invalid JWT token audience"),
            // we don't care for the audience field
            (None, _) => {}
        }

        let now = SystemTime::now();

        if let Some(exp) = payload.expiration {
            ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired");
            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
        }

        if let Some(nbf) = payload.not_before {
            ensure!(
                nbf < now + CLOCK_SKEW_LEEWAY,
                "JWT token is not yet ready to use"
            );
            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
        }

        Ok(())
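Both the `exp` and `nbf` checks above allow for clock skew: with `CLOCK_SKEW_LEEWAY` at 30 seconds, a token whose expiration passed less than 30 seconds ago is still accepted, and a token whose not-before time lies less than 30 seconds in the future is already usable. A standalone sketch of the same arithmetic (the constant and helper names below are illustrative, not the proxy's API):

```rust
use std::time::{Duration, SystemTime};

/// Illustrative leeway, mirroring the 30 s CLOCK_SKEW_LEEWAY above.
const LEEWAY: Duration = Duration::from_secs(30);

/// Accept the token if `now` is before `exp + leeway` and after `nbf - leeway`.
fn timestamps_valid(now: SystemTime, exp: Option<SystemTime>, nbf: Option<SystemTime>) -> bool {
    let not_expired = exp.map_or(true, |exp| now < exp + LEEWAY);
    // `nbf < now + LEEWAY` is the same comparison as `nbf - LEEWAY < now`,
    // written so that it cannot underflow on very small timestamps.
    let already_valid = nbf.map_or(true, |nbf| nbf < now + LEEWAY);
    not_expired && already_valid
}

fn main() {
    let now = SystemTime::now();
    // Expired 10 s ago, but still inside the 30 s leeway window.
    assert!(timestamps_valid(now, Some(now - Duration::from_secs(10)), None));
    // Becomes valid 60 s from now: rejected even with leeway.
    assert!(!timestamps_valid(now, None, Some(now + Duration::from_secs(60))));
    println!("clock-skew checks behave as expected");
}
```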
@@ -336,7 +336,7 @@ impl JwkCache {
        &self,
        ctx: &RequestMonitoring,
        endpoint: EndpointId,
        role_name: &RoleName,
        role_name: RoleName,
        fetch: &F,
        jwt: &str,
    ) -> Result<(), anyhow::Error> {
@@ -377,7 +377,7 @@ fn verify_rsa_signature(
    data: &[u8],
    sig: &[u8],
    key: &jose_jwk::Rsa,
    alg: &jose_jwa::Algorithm,
    alg: &Option<jose_jwa::Algorithm>,
) -> anyhow::Result<()> {
    use jose_jwa::{Algorithm, Signing};
    use rsa::{
@@ -388,7 +388,7 @@ fn verify_rsa_signature(
    let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;

    match alg {
        Algorithm::Signing(Signing::Rs256) => {
        Some(Algorithm::Signing(Signing::Rs256)) => {
            let key = VerifyingKey::<sha2::Sha256>::new(key);
            let sig = Signature::try_from(sig)?;
            key.verify(data, &sig)?;
@@ -402,6 +402,9 @@ fn verify_rsa_signature(
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
#[derive(serde::Deserialize, serde::Serialize)]
struct JwtHeader<'a> {
    /// must be "JWT"
    #[serde(rename = "typ")]
    typ: &'a str,
    /// must be a supported alg
    #[serde(rename = "alg")]
    algorithm: jose_jwa::Algorithm,
@@ -411,12 +414,11 @@ struct JwtHeader<'a> {
}

/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
#[derive(serde::Deserialize, Debug)]
#[allow(dead_code)]
#[derive(serde::Deserialize, serde::Serialize, Debug)]
struct JwtPayload<'a> {
    /// Audience - Recipient for which the JWT is intended
    #[serde(rename = "aud", default)]
    audience: OneOrMany,
    #[serde(rename = "aud")]
    audience: Option<&'a str>,
    /// Expiration - Time after which the JWT expires
    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
    expiration: Option<SystemTime>,
@@ -439,59 +441,6 @@ struct JwtPayload<'a> {
    session_id: Option<&'a str>,
}

/// `OneOrMany` supports parsing either a single item or an array of items.
///
/// Needed for <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1.3>
///
/// > The "aud" (audience) claim identifies the recipients that the JWT is
/// > intended for. Each principal intended to process the JWT MUST
/// > identify itself with a value in the audience claim. If the principal
/// > processing the claim does not identify itself with a value in the
/// > "aud" claim when this claim is present, then the JWT MUST be
/// > rejected. In the general case, the "aud" value is **an array of case-
/// > sensitive strings**, each containing a StringOrURI value. In the
/// > special case when the JWT has one audience, the "aud" value MAY be a
/// > **single case-sensitive string** containing a StringOrURI value. The
/// > interpretation of audience values is generally application specific.
/// > Use of this claim is OPTIONAL.
#[derive(Default, Debug)]
struct OneOrMany(Vec<String>);

impl<'de> Deserialize<'de> for OneOrMany {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        struct OneOrManyVisitor;
        impl<'de> Visitor<'de> for OneOrManyVisitor {
            type Value = OneOrMany;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("a single string or an array of strings")
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Ok(OneOrMany(vec![v.to_owned()]))
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let mut v = vec![];
                while let Some(s) = seq.next_element()? {
                    v.push(s);
                }
                Ok(OneOrMany(v))
            }
        }
        deserializer.deserialize_any(OneOrManyVisitor)
    }
}

fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
    let d = <Option<u64>>::deserialize(d)?;
    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
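The `OneOrMany` wrapper exists because, per RFC 7519, `aud` may be either a single string or an array of strings, and `deserialize_any` lets the hand-written visitor accept both shapes. For illustration only, the same "one or many" shape can also be expressed with serde's untagged-enum feature; the sketch below is a standalone example under that assumption (the `Audience` and `Claims` types are made up for the demo, they are not the proxy's types), followed by the membership test the audience check boils down to.

```rust
use serde::Deserialize;

/// "One or many" expressed as an untagged enum instead of a hand-written
/// Visitor; serde tries each variant in order until one matches the JSON.
#[derive(Deserialize, Debug)]
#[serde(untagged)]
enum Audience {
    One(String),
    Many(Vec<String>),
}

impl Audience {
    /// Membership test used for the audience check, regardless of shape.
    fn contains(&self, expected: &str) -> bool {
        match self {
            Audience::One(s) => s == expected,
            Audience::Many(v) => v.iter().any(|s| s == expected),
        }
    }
}

/// Hypothetical claims struct, only here to exercise the deserializer.
#[derive(Deserialize, Debug)]
struct Claims {
    #[serde(rename = "aud")]
    audience: Option<Audience>,
}

fn main() {
    let single: Claims = serde_json::from_str(r#"{"aud": "neon"}"#).unwrap();
    let many: Claims = serde_json::from_str(r#"{"aud": ["neon", "postgres"]}"#).unwrap();
    assert!(single.audience.unwrap().contains("neon"));
    assert!(many.audience.unwrap().contains("postgres"));
}
```

The manual visitor in the diff has one practical advantage over the untagged enum: it normalizes both inputs into a single `Vec<String>`, so the rest of the code never has to branch on the shape again.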
@@ -585,6 +534,7 @@ mod tests {
            key: jose_jwk::Key::Ec(pk),
            prm: jose_jwk::Parameters {
                kid: Some(kid),
                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
                ..Default::default()
            },
        };
@@ -598,6 +548,7 @@ mod tests {
            key: jose_jwk::Key::Rsa(pk),
            prm: jose_jwk::Parameters {
                kid: Some(kid),
                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
                ..Default::default()
            },
        };
@@ -606,6 +557,7 @@ mod tests {

    fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
        let header = JwtHeader {
            typ: "JWT",
            algorithm: jose_jwa::Algorithm::Signing(sig),
            key_id: Some(&kid),
        };
@@ -620,7 +572,7 @@ mod tests {
        format!("{header}.{body}")
    }

    fn new_ec_jwt(kid: String, key: &p256::SecretKey) -> String {
    fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
        use p256::ecdsa::{Signature, SigningKey};

        let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
@@ -708,6 +660,11 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        let (ec1, jwk3) = new_ec_jwk("3".into());
        let (ec2, jwk4) = new_ec_jwk("4".into());

        let jwt1 = new_rsa_jwt("1".into(), rs1);
        let jwt2 = new_rsa_jwt("2".into(), rs2);
        let jwt3 = new_ec_jwt("3".into(), ec1);
        let jwt4 = new_ec_jwt("4".into(), ec2);

        let foo_jwks = jose_jwk::JwkSet {
            keys: vec![jwk1, jwk3],
        };
@@ -749,98 +706,47 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        let client = reqwest::Client::new();

        #[derive(Clone)]
        struct Fetch(SocketAddr, Vec<RoleNameInt>);
        struct Fetch(SocketAddr);

        impl FetchAuthRules for Fetch {
            async fn fetch_auth_rules(
                &self,
                _ctx: &RequestMonitoring,
                _endpoint: EndpointId,
                _role_name: RoleName,
            ) -> anyhow::Result<Vec<AuthRule>> {
                Ok(vec![
                    AuthRule {
                        id: "foo".to_owned(),
                        jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
                        audience: None,
                        role_names: self.1.clone(),
                    },
                    AuthRule {
                        id: "bar".to_owned(),
                        jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
                        audience: None,
                        role_names: self.1.clone(),
                    },
                ])
            }
        }

        let role_name1 = RoleName::from("anonymous");
        let role_name2 = RoleName::from("authenticated");

        let fetch = Fetch(
            addr,
            vec![
                RoleNameInt::from(&role_name1),
                RoleNameInt::from(&role_name2),
            ],
        );

        let role_name = RoleName::from("user");
        let endpoint = EndpointId::from("ep");

        let jwk_cache = Arc::new(JwkCacheEntryLock::default());

        let jwt1 = new_rsa_jwt("1".into(), rs1);
        let jwt2 = new_rsa_jwt("2".into(), rs2);
        let jwt3 = new_ec_jwt("3".into(), &ec1);
        let jwt4 = new_ec_jwt("4".into(), &ec2);

        // had the wrong kid, therefore will have the wrong ecdsa signature
        let bad_jwt = new_ec_jwt("3".into(), &ec2);
        // this role_name is not accepted
        let bad_role_name = RoleName::from("cloud_admin");

        let err = jwk_cache
            .check_jwt(
                &RequestMonitoring::test(),
                &bad_jwt,
                &client,
                endpoint.clone(),
                &role_name1,
                &fetch,
            )
            .await
            .unwrap_err();
        assert!(err.to_string().contains("signature error"));

        let err = jwk_cache
            .check_jwt(
                &RequestMonitoring::test(),
                &jwt1,
                &client,
                endpoint.clone(),
                &bad_role_name,
                &fetch,
            )
            .await
            .unwrap_err();
        assert!(err.to_string().contains("jwk not found"));

        let tokens = [jwt1, jwt2, jwt3, jwt4];
        let role_names = [role_name1, role_name2];
        for role in &role_names {
            for token in &tokens {
                jwk_cache
                    .check_jwt(
                        &RequestMonitoring::test(),
                        token,
                        &client,
                        endpoint.clone(),
                        role,
                        &fetch,
                    )
                    .await
                    .unwrap();
            }
        for token in [jwt1, jwt2, jwt3, jwt4] {
            jwk_cache
                .check_jwt(
                    &RequestMonitoring::test(),
                    &token,
                    &client,
                    endpoint.clone(),
                    role_name.clone(),
                    &Fetch(addr),
                )
                .await
                .unwrap();
        }
    }
}

@@ -1,4 +1,4 @@
use std::net::SocketAddr;
use std::{collections::HashMap, net::SocketAddr};

use anyhow::Context;
use arc_swap::ArcSwapOption;
@@ -10,19 +10,21 @@ use crate::{
        NodeInfo,
    },
    context::RequestMonitoring,
    intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag},
    EndpointId,
    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
    EndpointId, RoleName,
};

use super::jwt::{AuthRule, FetchAuthRules};
use super::jwt::{AuthRule, FetchAuthRules, JwkCache};

pub struct LocalBackend {
    pub(crate) jwks_cache: JwkCache,
    pub(crate) node_info: NodeInfo,
}

impl LocalBackend {
    pub fn new(postgres_addr: SocketAddr) -> Self {
        LocalBackend {
            jwks_cache: JwkCache::default(),
            node_info: NodeInfo {
                config: {
                    let mut cfg = ConnCfg::new();
@@ -46,17 +48,26 @@ impl LocalBackend {
#[derive(Clone, Copy)]
pub(crate) struct StaticAuthRules;

pub static JWKS_ROLE_MAP: ArcSwapOption<EndpointJwksResponse> = ArcSwapOption::const_empty();
pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();

#[derive(Debug, Clone)]
pub struct JwksRoleSettings {
    pub roles: HashMap<RoleName, EndpointJwksResponse>,
    pub project_id: ProjectIdInt,
    pub branch_id: BranchIdInt,
}

impl FetchAuthRules for StaticAuthRules {
    async fn fetch_auth_rules(
        &self,
        _ctx: &RequestMonitoring,
        _endpoint: EndpointId,
        role_name: RoleName,
    ) -> anyhow::Result<Vec<AuthRule>> {
        let mappings = JWKS_ROLE_MAP.load();
        let role_mappings = mappings
            .as_deref()
            .and_then(|m| m.roles.get(&role_name))
            .context("JWKs settings for this role were not configured")?;
        let mut rules = vec![];
        for setting in &role_mappings.jwks {
@@ -64,7 +75,6 @@ impl FetchAuthRules for StaticAuthRules {
                id: setting.id.clone(),
                jwks_url: setting.jwks_url.clone(),
                audience: setting.jwt_audience.clone(),
                role_names: setting.role_names.clone(),
            });
        }


@@ -1,6 +1,5 @@
use crate::{
    auth, compute,
    config::AuthenticationConfig,
    console::{self, provider::NodeInfo},
    context::RequestMonitoring,
    error::{ReportableError, UserFacingError},
@@ -59,7 +58,6 @@ pub(crate) fn new_psql_session_id() -> String {

pub(super) async fn authenticate(
    ctx: &RequestMonitoring,
    auth_config: &'static AuthenticationConfig,
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
@@ -91,14 +89,6 @@ pub(super) async fn authenticate(
    info!(parent: &span, "waiting for console's reply...");
    let db_info = waiter.await.map_err(WebAuthError::from)?;

    if auth_config.ip_allowlist_check_enabled {
        if let Some(allowed_ips) = &db_info.allowed_ips {
            if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) {
                return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
            }
        }
    }

    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;

    // This config should be self-contained, because we won't

@@ -1,38 +1,34 @@
use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration};
use std::{
    net::SocketAddr,
    path::{Path, PathBuf},
    pin::pin,
    sync::Arc,
    time::Duration,
};

use anyhow::{bail, ensure, Context};
use camino::{Utf8Path, Utf8PathBuf};
use compute_api::spec::LocalProxySpec;
use anyhow::{bail, ensure};
use dashmap::DashMap;
use futures::future::Either;
use futures::{future::Either, FutureExt};
use proxy::{
    auth::backend::{
        jwt::JwkCache,
        local::{LocalBackend, JWKS_ROLE_MAP},
    },
    auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
    cancellation::CancellationHandlerMain,
    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
    console::{
        locks::ApiLocks,
        messages::{EndpointJwksResponse, JwksSettings},
    },
    console::{locks::ApiLocks, messages::JwksRoleMapping},
    http::health_server::AppMetrics,
    intern::RoleNameInt,
    metrics::{Metrics, ThreadPoolMetrics},
    rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
    scram::threadpool::ThreadPool,
    serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
    RoleName,
};

project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

use clap::Parser;
use tokio::{net::TcpListener, sync::Notify, task::JoinSet};
use tokio::{net::TcpListener, task::JoinSet};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry};
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};

#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
@@ -76,12 +72,9 @@ struct LocalProxyCliArgs {
    /// Address of the postgres server
    #[clap(long, default_value = "127.0.0.1:5432")]
    compute: SocketAddr,
    /// Path of the local proxy config file
    /// File address of the local proxy config file
    #[clap(long, default_value = "./localproxy.json")]
    config_path: Utf8PathBuf,
    /// Path of the local proxy PID file
    #[clap(long, default_value = "./localproxy.pid")]
    pid_path: Utf8PathBuf,
    config_path: PathBuf,
}

#[derive(clap::Args, Clone, Copy, Debug)]
@@ -133,24 +126,6 @@ async fn main() -> anyhow::Result<()> {
    let args = LocalProxyCliArgs::parse();
    let config = build_config(&args)?;

    // before we bind to any ports, write the process ID to a file
    // so that compute-ctl can find our process later
    // in order to trigger the appropriate SIGHUP on config change.
    //
    // This also claims a "lock" that makes sure only one instance
    // of local-proxy runs at a time.
    let _process_guard = loop {
        match pid_file::claim_for_current_process(&args.pid_path) {
            Ok(guard) => break guard,
            Err(e) => {
                // compute-ctl might have tried to read the pid-file to let us
                // know about some config change. We should try again.
                error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
    };

    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
    let http_listener = TcpListener::bind(args.http).await?;
    let shutdown = CancellationToken::new();
@@ -164,30 +139,12 @@ async fn main() -> anyhow::Result<()> {
        16,
    ));

    // write the process ID to a file so that compute-ctl can find our process later
    // in order to trigger the appropriate SIGHUP on config change.
    let pid = std::process::id();
    info!("process running in PID {pid}");
    std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?;
    refresh_config(args.config_path.clone()).await;

    let mut maintenance_tasks = JoinSet::new();

    let refresh_config_notify = Arc::new(Notify::new());
    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), {
        let refresh_config_notify = Arc::clone(&refresh_config_notify);
        move || {
            refresh_config_notify.notify_one();
        }
    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
        refresh_config(args.config_path.clone()).map(Ok)
    }));

    // trigger the first config load **after** setting up the signal hook
    // to avoid the race condition where:
    // 1. no config file is registered when local-proxy starts up
    // 2. the config file is written but the signal hook is not yet received
    // 3. local-proxy completes startup but has no config loaded, despite there being a registered config.
    refresh_config_notify.notify_one();
    tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify));

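The comment above spells out the startup race the Notify-based design avoids: the signal hook only nudges a `tokio::sync::Notify`, a dedicated loop owns the actual reload, and the first `notify_one()` is issued only after the hook is installed, so a config file written early is never missed. A self-contained sketch of that pattern follows; the file path and the reload body are placeholders, not local-proxy's real logic.

```rust
use std::sync::Arc;
use tokio::sync::Notify;

/// Placeholder for the real reload work (parse file, swap config, etc.).
async fn reload_config(path: &str) {
    match tokio::fs::read(path).await {
        Ok(bytes) => println!("reloaded {} bytes from {path}", bytes.len()),
        Err(e) => eprintln!("could not read config file {path}: {e}"),
    }
}

#[tokio::main]
async fn main() {
    let notify = Arc::new(Notify::new());

    // The reload loop: wait for a nudge, then reload. Errors never kill it.
    let worker = {
        let notify = Arc::clone(&notify);
        tokio::spawn(async move {
            loop {
                notify.notified().await;
                reload_config("./localproxy.json").await;
            }
        })
    };

    // 1. A signal hook would be registered here and call `notify.notify_one()`.
    // 2. Only after that do we trigger the initial load, so a config written
    //    before startup completed is still picked up.
    notify.notify_one();

    // Give the worker a moment in this demo, then stop.
    tokio::time::sleep(std::time::Duration::from_millis(100)).await;
    worker.abort();
}
```

Because `Notify::notify_one` stores a permit when no task is currently waiting, the initial nudge is not lost even if the reload loop has not reached `notified().await` yet.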
    maintenance_tasks.spawn(proxy::http::health_server::task_main(
        metrics_listener,
        AppMetrics {

@@ -270,17 +227,14 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
        allow_self_signed_compute: false,
        http_config,
        authentication_config: AuthenticationConfig {
            jwks_cache: JwkCache::default(),
            thread_pool: ThreadPool::new(0),
            scram_protocol_timeout: Duration::from_secs(10),
            rate_limiter_enabled: false,
            rate_limiter: BucketRateLimiter::new(vec![]),
            rate_limit_ip_subnet: 64,
            ip_allowlist_check_enabled: true,
            is_auth_broker: false,
            accept_jwts: true,
        },
        proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
        require_client_ip: false,
        handshake_timeout: Duration::from_secs(10),
        region: "local".into(),
        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
@@ -291,84 +245,81 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
    })))
}

async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
    loop {
        rx.notified().await;

        match refresh_config_inner(&path).await {
            Ok(()) => {}
            Err(e) => {
                error!(error=?e, ?path, "could not read config file");
            }
async fn refresh_config(path: PathBuf) {
    match refresh_config_inner(&path).await {
        Ok(()) => {}
        Err(e) => {
            error!(error=?e, ?path, "could not read config file");
        }
    }
}

async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> {
async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
    let bytes = tokio::fs::read(&path).await?;
    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
    let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;

    let mut jwks_set = vec![];
    let mut settings = None;

    for jwks in data.jwks {
        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
    for mapping in data.roles.values_mut() {
        for jwks in &mut mapping.jwks {
            ensure!(
                jwks.jwks_url.has_authority()
                    && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
                "Invalid JWKS url. Must be HTTP",
            );

        ensure!(
            jwks_url.has_authority()
                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
            "Invalid JWKS url. Must be HTTP",
        );
            ensure!(
                jwks.jwks_url
                    .host()
                    .is_some_and(|h| h != url::Host::Domain("")),
                "Invalid JWKS url. No domain listed",
            );

        ensure!(
            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
            "Invalid JWKS url. No domain listed",
        );

        // clear username, password and ports
        jwks_url
            .set_username("")
            .expect("url can be a base and has a valid host and is not a file. should not error");
        jwks_url
            .set_password(None)
            .expect("url can be a base and has a valid host and is not a file. should not error");
        // local testing is hard if we need to have a specific restricted port
        if cfg!(not(feature = "testing")) {
            jwks_url.set_port(None).expect(
            // clear username, password and ports
            jwks.jwks_url.set_username("").expect(
                "url can be a base and has a valid host and is not a file. should not error",
            );
        }

        // clear query params
        jwks_url.set_fragment(None);
        jwks_url.query_pairs_mut().clear().finish();

        if jwks_url.scheme() != "https" {
            // local testing is hard if we need to set up https support.
            jwks.jwks_url.set_password(None).expect(
                "url can be a base and has a valid host and is not a file. should not error",
            );
            // local testing is hard if we need to have a specific restricted port
            if cfg!(not(feature = "testing")) {
                jwks_url
                    .set_scheme("https")
                    .expect("should not error to set the scheme to https if it was http");
            } else {
                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
                jwks.jwks_url.set_port(None).expect(
                    "url can be a base and has a valid host and is not a file. should not error",
                );
            }
        }

        jwks_set.push(JwksSettings {
            id: jwks.id,
            jwks_url,
            provider_name: jwks.provider_name,
            jwt_audience: jwks.jwt_audience,
            role_names: jwks
                .role_names
                .into_iter()
                .map(RoleName::from)
                .map(|s| RoleNameInt::from(&s))
                .collect(),
        })
            // clear query params
            jwks.jwks_url.set_fragment(None);
            jwks.jwks_url.query_pairs_mut().clear().finish();

            if jwks.jwks_url.scheme() != "https" {
                // local testing is hard if we need to set up https support.
                if cfg!(not(feature = "testing")) {
                    jwks.jwks_url
                        .set_scheme("https")
                        .expect("should not error to set the scheme to https if it was http");
                } else {
                    warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
                }
            }

            let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
            ensure!(
                *pr == jwks.project_id,
                "inconsistent project IDs configured"
            );
            ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
        }
    }

    info!("successfully loaded new config");
    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
    if let Some((project_id, branch_id)) = settings {
        JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
            roles: data.roles,
            project_id,
            branch_id,
        })));
    }

    Ok(())
}

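Both variants of `refresh_config_inner` above sanitize every configured JWKS URL the same way: require an http(s) URL with a real host, strip credentials, fragment and query string, and, outside of testing builds, drop the port and upgrade the scheme to https. A condensed, standalone sketch of that sanitization with the `url` crate (the function name and the `testing` flag are illustrative, not the actual config surface):

```rust
use anyhow::{ensure, Context};
use url::{Host, Url};

/// Illustrative sanitizer for a configured JWKS URL; mirrors the checks above.
fn sanitize_jwks_url(raw: &str, testing: bool) -> anyhow::Result<Url> {
    let mut url = Url::parse(raw).context("parsing JWKS url")?;

    ensure!(
        url.has_authority() && (url.scheme() == "http" || url.scheme() == "https"),
        "Invalid JWKS url. Must be HTTP(S)",
    );
    ensure!(
        url.host().is_some_and(|h| h != Host::Domain("")),
        "Invalid JWKS url. No domain listed",
    );

    // Strip credentials; these setters only fail for non-base / file-like URLs.
    let _ = url.set_username("");
    let _ = url.set_password(None);

    // Strip fragment and query string.
    url.set_fragment(None);
    url.query_pairs_mut().clear().finish();

    if !testing {
        // Outside of local testing: force the default port and https.
        let _ = url.set_port(None);
        if url.scheme() == "http" {
            url.set_scheme("https")
                .expect("http -> https is always a valid scheme change");
        }
    }
    Ok(url)
}

fn main() -> anyhow::Result<()> {
    let url = sanitize_jwks_url("http://user:pw@auth.example.com:8080/jwks?x=1#frag", false)?;
    assert_eq!(url.scheme(), "https");
    assert_eq!(url.port(), None);
    assert_eq!(url.username(), "");
    assert!(url.fragment().is_none());
    println!("sanitized: {url}");
    Ok(())
}
```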
@@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> {
        proxy_listener,
        cancellation_token.clone(),
    ));
    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {}));
    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
        Ok(())
    }));

    // the signal task can't ever succeed.
    // the main task can error, or can succeed on cancellation.

Some files were not shown because too many files have changed in this diff.