mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 03:50:37 +00:00
Compare commits
49 Commits
jcsp/layer
...
yuchen/dir
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e565c4fbe9 | ||
|
|
a46757b769 | ||
|
|
9c32bfee3b | ||
|
|
69ef8caf58 | ||
|
|
b7443dd643 | ||
|
|
cc433c76a3 | ||
|
|
2034ec906a | ||
|
|
f48ab8bcaa | ||
|
|
2607a57990 | ||
|
|
f04c1c230c | ||
|
|
13f1931a09 | ||
|
|
e98a4eb5e2 | ||
|
|
e01d145066 | ||
|
|
9e9d76d6f2 | ||
|
|
14ec379d2b | ||
|
|
ebfe88a463 | ||
|
|
eb16aa9e81 | ||
|
|
d2d9921761 | ||
|
|
ba498a630a | ||
|
|
e989a5e4a2 | ||
|
|
cde1654d7b | ||
|
|
cf6a776fcf | ||
|
|
5c5871111a | ||
|
|
d56c4e7a38 | ||
|
|
43b2445d0b | ||
|
|
42ef08db47 | ||
|
|
fc962c9605 | ||
|
|
357fa070a3 | ||
|
|
02cdd37b56 | ||
|
|
fa354a65ab | ||
|
|
40f7930a7d | ||
|
|
ec07a1ecc9 | ||
|
|
c4cdfe66ac | ||
|
|
42e19e952f | ||
|
|
3d255d601b | ||
|
|
80e974d05b | ||
|
|
7fdf1ab5b6 | ||
|
|
f6d0ed6454 | ||
|
|
a2be8a440b | ||
|
|
ff4a1db223 | ||
|
|
29d54ccd20 | ||
|
|
68a1fe20f2 | ||
|
|
e8408c797a | ||
|
|
027f28deb9 | ||
|
|
ea6f9798c6 | ||
|
|
253e4d5843 | ||
|
|
852099bc83 | ||
|
|
148e230d11 | ||
|
|
6d664788c1 |
46
.github/workflows/_benchmarking_preparation.yml
vendored
46
.github/workflows/_benchmarking_preparation.yml
vendored
@@ -3,19 +3,23 @@ name: Prepare benchmarking databases by restoring dumps
|
||||
on:
|
||||
workflow_call:
|
||||
# no inputs needed
|
||||
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
jobs:
|
||||
setup-databases:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
|
||||
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
|
||||
database: [ clickbench, tpch, userexample ]
|
||||
|
||||
|
||||
env:
|
||||
LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
@@ -23,7 +27,10 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -32,13 +39,13 @@ jobs:
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
aws-rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
aws-aurora-serverless-v2-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
||||
@@ -46,10 +53,17 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -57,23 +71,23 @@ jobs:
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
|
||||
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
|
||||
- name: Create benchmark_restore_status table if it does not exist
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
# to avoid a race condition of multiple jobs trying to create the table at the same time,
|
||||
# to avoid a race condition of multiple jobs trying to create the table at the same time,
|
||||
# we use an advisory lock
|
||||
run: |
|
||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
|
||||
SELECT pg_advisory_lock(4711);
|
||||
SELECT pg_advisory_lock(4711);
|
||||
CREATE TABLE IF NOT EXISTS benchmark_restore_status (
|
||||
databasename text primary key,
|
||||
restore_done boolean
|
||||
);
|
||||
SELECT pg_advisory_unlock(4711);
|
||||
"
|
||||
|
||||
|
||||
- name: Check if restore is already done
|
||||
id: check-restore-done
|
||||
env:
|
||||
@@ -107,7 +121,7 @@ jobs:
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
run: |
|
||||
mkdir -p /tmp/dumps
|
||||
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
|
||||
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
|
||||
|
||||
- name: Replace database name in connection string
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
@@ -126,17 +140,17 @@ jobs:
|
||||
else
|
||||
new_connstr="${base_connstr}/${DATABASE_NAME}"
|
||||
fi
|
||||
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
|
||||
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Restore dump
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
env:
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
|
||||
# the following works only with larger computes:
|
||||
# the following works only with larger computes:
|
||||
# PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
|
||||
# we add the || true because:
|
||||
# the dumps were created with Neon and contain neon extensions that are not
|
||||
# the dumps were created with Neon and contain neon extensions that are not
|
||||
# available in RDS, so we will always report an error, but we can ignore it
|
||||
run: |
|
||||
${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
|
||||
|
||||
@@ -236,9 +236,7 @@ jobs:
|
||||
|
||||
# run pageserver tests with different settings
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
for io_buffer_alignment in 0 1 512 ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
|
||||
119
.github/workflows/benchmarking.yml
vendored
119
.github/workflows/benchmarking.yml
vendored
@@ -12,7 +12,6 @@ on:
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
region_id:
|
||||
@@ -59,7 +58,7 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # Required for OIDC authentication in azure runners
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -68,12 +67,10 @@ jobs:
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "azure-staging"
|
||||
region_id: 'azure-eastus2'
|
||||
RUNNER: [ self-hosted, eastus2, x64 ]
|
||||
IMAGE: neondatabase/build-tools:pinned
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
@@ -86,7 +83,10 @@ jobs:
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
container:
|
||||
image: ${{ matrix.IMAGE }}
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -164,6 +164,10 @@ jobs:
|
||||
|
||||
replication-tests:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
@@ -174,12 +178,21 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -267,7 +280,7 @@ jobs:
|
||||
region_id_default=${{ env.DEFAULT_REGION_ID }}
|
||||
runner_default='["self-hosted", "us-east-2", "x64"]'
|
||||
runner_azure='["self-hosted", "eastus2", "x64"]'
|
||||
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
|
||||
image_default="neondatabase/build-tools:pinned"
|
||||
matrix='{
|
||||
"pg_version" : [
|
||||
16
|
||||
@@ -344,7 +357,7 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # Required for OIDC authentication in azure runners
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -371,7 +384,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials # necessary on Azure runners
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
@@ -492,17 +505,15 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # Required for OIDC authentication in azure runners
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neonvm-captest-pgvector"
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
RUNNER: [ self-hosted, eastus2, x64 ]
|
||||
IMAGE: neondatabase/build-tools:pinned
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
@@ -511,13 +522,16 @@ jobs:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
|
||||
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
container:
|
||||
image: ${{ matrix.IMAGE }}
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -527,17 +541,26 @@ jobs:
|
||||
# instead of using Neon artifacts containing pgbench
|
||||
- name: Install postgresql-16 where pytest expects it
|
||||
run: |
|
||||
# Just to make it easier to test things locally on macOS (with arm64)
|
||||
arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
|
||||
|
||||
cd /home/nonroot
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
|
||||
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb
|
||||
dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
|
||||
dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
|
||||
dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
|
||||
dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
|
||||
dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
|
||||
|
||||
mkdir -p /tmp/neon/pg_install/v16/bin
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
|
||||
LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH}"
|
||||
export LD_LIBRARY_PATH
|
||||
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
|
||||
|
||||
/tmp/neon/pg_install/v16/bin/pgbench --version
|
||||
/tmp/neon/pg_install/v16/bin/psql --version
|
||||
|
||||
@@ -559,7 +582,7 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
@@ -620,6 +643,10 @@ jobs:
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
|
||||
|
||||
strategy:
|
||||
@@ -638,12 +665,22 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -714,6 +751,10 @@ jobs:
|
||||
#
|
||||
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
|
||||
|
||||
strategy:
|
||||
@@ -731,12 +772,22 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
@@ -806,6 +857,10 @@ jobs:
|
||||
|
||||
user-examples-compare:
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
|
||||
|
||||
strategy:
|
||||
@@ -822,12 +877,22 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
|
||||
5
.github/workflows/build_and_test.yml
vendored
5
.github/workflows/build_and_test.yml
vendored
@@ -1190,10 +1190,9 @@ jobs:
|
||||
|
||||
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
||||
|
||||
# TODO Add v17
|
||||
for pg_version in v14 v15 v16; do
|
||||
for pg_version in v14 v15 v16 v17; do
|
||||
# We run less tests for debug builds, so we don't need to promote them
|
||||
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
|
||||
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -1321,6 +1321,7 @@ dependencies = [
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"compute_api",
|
||||
"futures",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
@@ -4296,6 +4297,7 @@ dependencies = [
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"dashmap",
|
||||
"ecdsa 0.16.9",
|
||||
|
||||
@@ -13,6 +13,9 @@ RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
#
|
||||
# 'gdb' is included so that we get backtraces of core dumps produced in
|
||||
# regression tests
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
@@ -24,6 +27,7 @@ RUN set -e \
|
||||
cmake \
|
||||
curl \
|
||||
flex \
|
||||
gdb \
|
||||
git \
|
||||
gnupg \
|
||||
gzip \
|
||||
|
||||
@@ -11,6 +11,10 @@ commands:
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/resize-swap'
|
||||
- name: chmod-set-disk-quota
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
@@ -30,11 +34,12 @@ commands:
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: compute_ctl-resize-swap
|
||||
- filename: compute_ctl-sudoers
|
||||
content: |
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
|
||||
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
|
||||
# regardless of hostname (ALL)
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
@@ -100,7 +105,7 @@ merge: |
|
||||
&& apt install --no-install-recommends -y \
|
||||
sudo \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
|
||||
COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers
|
||||
|
||||
COPY cgconfig.conf /etc/cgconfig.conf
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use compute_tools::disk_quota::set_disk_quota;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
@@ -151,6 +152,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
|
||||
let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
|
||||
|
||||
Ok(ProcessCliResult {
|
||||
connstr,
|
||||
@@ -161,6 +163,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
spec_json,
|
||||
spec_path,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -173,6 +176,7 @@ struct ProcessCliResult<'clap> {
|
||||
spec_json: Option<&'clap String>,
|
||||
spec_path: Option<&'clap String>,
|
||||
resize_swap_on_bind: bool,
|
||||
set_disk_quota_for_fs: Option<&'clap String>,
|
||||
}
|
||||
|
||||
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
@@ -293,6 +297,7 @@ fn wait_spec(
|
||||
pgbin,
|
||||
ext_remote_storage,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
http_port,
|
||||
..
|
||||
}: ProcessCliResult,
|
||||
@@ -373,6 +378,7 @@ fn wait_spec(
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -381,6 +387,7 @@ struct WaitSpecResult {
|
||||
// passed through from ProcessCliResult
|
||||
http_port: u16,
|
||||
resize_swap_on_bind: bool,
|
||||
set_disk_quota_for_fs: Option<String>,
|
||||
}
|
||||
|
||||
fn start_postgres(
|
||||
@@ -390,6 +397,7 @@ fn start_postgres(
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
}: WaitSpecResult,
|
||||
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||
// We got all we need, update the state.
|
||||
@@ -403,6 +411,7 @@ fn start_postgres(
|
||||
);
|
||||
// before we release the mutex, fetch the swap size (if any) for later.
|
||||
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
|
||||
let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
@@ -422,8 +431,8 @@ fn start_postgres(
|
||||
// OOM-killed during startup because swap wasn't available yet.
|
||||
match resize_swap(size_bytes) {
|
||||
Ok(()) => {
|
||||
let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%size_bytes, %size_gib, "resized swap");
|
||||
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%size_bytes, %size_mib, "resized swap");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to resize swap");
|
||||
@@ -432,10 +441,29 @@ fn start_postgres(
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{err:?}"));
|
||||
state.status = ComputeStatus::Failed;
|
||||
compute.state_changed.notify_all();
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set disk quota if the compute spec says so
|
||||
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
|
||||
(disk_quota_bytes, set_disk_quota_for_fs)
|
||||
{
|
||||
match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
|
||||
Ok(()) => {
|
||||
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%disk_quota_bytes, %size_mib, "set disk quota");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to set disk quota");
|
||||
error!("{err:#}");
|
||||
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
@@ -450,16 +478,7 @@ fn start_postgres(
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
// Notify others that Postgres failed to start. In case of configuring the
|
||||
// empty compute, it's likely that API handler is still waiting for compute
|
||||
// state change. With this we will notify it that compute is in Failed state,
|
||||
// so control plane will know about it earlier and record proper error instead
|
||||
// of timeout.
|
||||
compute.state_changed.notify_all();
|
||||
drop(state); // unlock
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
None
|
||||
}
|
||||
@@ -750,6 +769,11 @@ fn cli() -> clap::Command {
|
||||
.long("resize-swap-on-bind")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("set-disk-quota-for-fs")
|
||||
.long("set-disk-quota-for-fs")
|
||||
.value_name("SET_DISK_QUOTA_FOR_FS")
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -305,6 +306,13 @@ impl ComputeNode {
|
||||
self.state_changed.notify_all();
|
||||
}
|
||||
|
||||
pub fn set_failed_status(&self, err: anyhow::Error) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.error = Some(format!("{err:?}"));
|
||||
state.status = ComputeStatus::Failed;
|
||||
self.state_changed.notify_all();
|
||||
}
|
||||
|
||||
pub fn get_status(&self) -> ComputeStatus {
|
||||
self.state.lock().unwrap().status
|
||||
}
|
||||
@@ -710,7 +718,7 @@ impl ComputeNode {
|
||||
info!("running initdb");
|
||||
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
|
||||
Command::new(initdb_bin)
|
||||
.args(["-D", pgdata])
|
||||
.args(["--pgdata", pgdata])
|
||||
.output()
|
||||
.expect("cannot start initdb process");
|
||||
|
||||
@@ -1123,6 +1131,9 @@ impl ComputeNode {
|
||||
//
|
||||
// Use that as a default location and pattern, except macos where core dumps are written
|
||||
// to /cores/ directory by default.
|
||||
//
|
||||
// With default Linux settings, the core dump file is called just "core", so check for
|
||||
// that too.
|
||||
pub fn check_for_core_dumps(&self) -> Result<()> {
|
||||
let core_dump_dir = match std::env::consts::OS {
|
||||
"macos" => Path::new("/cores/"),
|
||||
@@ -1134,8 +1145,17 @@ impl ComputeNode {
|
||||
let files = fs::read_dir(core_dump_dir)?;
|
||||
let cores = files.filter_map(|entry| {
|
||||
let entry = entry.ok()?;
|
||||
let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
|
||||
Some(entry.path())
|
||||
|
||||
let is_core_dump = match entry.file_name().to_str()? {
|
||||
n if n.starts_with("core.") => true,
|
||||
"core" => true,
|
||||
_ => false,
|
||||
};
|
||||
if is_core_dump {
|
||||
Some(entry.path())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
// Print backtrace for each core dump
|
||||
@@ -1386,6 +1406,36 @@ LIMIT 100",
|
||||
}
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
|
||||
/// Waits until current thread receives a state changed notification and
|
||||
/// the pageserver connection strings has changed.
|
||||
///
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_connstr = state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr
|
||||
.clone();
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
.wait_timeout_while(state, duration, |s| {
|
||||
let pageserver_connstr = &s
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr;
|
||||
unchanged = pageserver_connstr == &old_pageserver_connstr;
|
||||
unchanged
|
||||
})
|
||||
.unwrap();
|
||||
if !unchanged {
|
||||
info!("Pageserver config changed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
|
||||
@@ -11,9 +11,17 @@ use crate::compute::ComputeNode;
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
info!("waiting for reconfiguration requests");
|
||||
loop {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let mut state = compute.state_changed.wait(state).unwrap();
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
|
||||
// We have to re-check the status after re-acquiring the lock because it could be that
|
||||
// the status has changed while we were waiting for the lock, and we might not need to
|
||||
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
|
||||
// we are waiting for a condition variable that will never be signaled.
|
||||
if state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
}
|
||||
|
||||
// Re-check the status after waking up
|
||||
if state.status == ComputeStatus::ConfigurationPending {
|
||||
info!("got configuration request");
|
||||
state.status = ComputeStatus::Configuration;
|
||||
|
||||
25
compute_tools/src/disk_quota.rs
Normal file
25
compute_tools/src/disk_quota.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
use anyhow::Context;
|
||||
|
||||
pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
|
||||
|
||||
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
|
||||
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
|
||||
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
|
||||
let size_kb = size_bytes / 1024;
|
||||
// run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
|
||||
let child_result = std::process::Command::new("/usr/bin/sudo")
|
||||
.arg(DISK_QUOTA_BIN)
|
||||
.arg(size_kb.to_string())
|
||||
.arg(fs_mountpoint)
|
||||
.spawn();
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => Err(anyhow::anyhow!("process exited with {status}")),
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
|
||||
}
|
||||
@@ -10,6 +10,7 @@ pub mod http;
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod lsn_lease;
|
||||
mod migration;
|
||||
|
||||
@@ -57,10 +57,10 @@ fn lsn_lease_bg_task(
|
||||
.max(valid_duration / 2);
|
||||
|
||||
info!(
|
||||
"Succeeded, sleeping for {} seconds",
|
||||
"Request succeeded, sleeping for {} seconds",
|
||||
sleep_duration.as_secs()
|
||||
);
|
||||
thread::sleep(sleep_duration);
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry(
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token.clone());
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
config
|
||||
})
|
||||
@@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry(
|
||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
|
||||
|
||||
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
|
||||
retry_period_ms as u64,
|
||||
));
|
||||
retry_period_ms *= 1.5;
|
||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
94
control_plane/src/branch_mappings.rs
Normal file
94
control_plane/src/branch_mappings.rs
Normal file
@@ -0,0 +1,94 @@
|
||||
//! Branch mappings for convenience
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user.
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default, deny_unknown_fields)]
|
||||
pub struct BranchMappings {
|
||||
/// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||
/// --tenant_id is not explicitly specified. This comes from the branches.
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
|
||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
pub mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
impl BranchMappings {
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let existing_values = self.mappings.entry(branch_name.clone()).or_default();
|
||||
|
||||
let existing_ids = existing_values
|
||||
.iter()
|
||||
.find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
|
||||
|
||||
if let Some((_, old_timeline_id)) = existing_ids {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_branch_timeline_id(
|
||||
&self,
|
||||
branch_name: &str,
|
||||
tenant_id: TenantId,
|
||||
) -> Option<TimelineId> {
|
||||
// If it looks like a timeline ID, return it as it is
|
||||
if let Ok(timeline_id) = branch_name.parse::<TimelineId>() {
|
||||
return Some(timeline_id);
|
||||
}
|
||||
|
||||
self.mappings
|
||||
.get(branch_name)?
|
||||
.iter()
|
||||
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
|
||||
.map(|&(_, timeline_id)| timeline_id)
|
||||
.map(TimelineId::from)
|
||||
}
|
||||
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
|
||||
self.mappings
|
||||
.iter()
|
||||
.flat_map(|(name, tenant_timelines)| {
|
||||
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
|
||||
(TenantTimelineId::new(tenant_id, timeline_id), name.clone())
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn persist(&self, path: &Path) -> anyhow::Result<()> {
|
||||
let content = &toml::to_string_pretty(self)?;
|
||||
fs::write(path, content).with_context(|| {
|
||||
format!(
|
||||
"Failed to write branch information into path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load(path: &Path) -> anyhow::Result<BranchMappings> {
|
||||
let branches_file_contents = fs::read_to_string(path)?;
|
||||
Ok(toml::from_str(branches_file_contents.as_str())?)
|
||||
}
|
||||
}
|
||||
@@ -561,6 +561,7 @@ impl Endpoint {
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
|
||||
@@ -113,7 +113,7 @@ impl SafekeeperNode {
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
extra_opts: Vec<String>,
|
||||
extra_opts: &[String],
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
print!(
|
||||
@@ -196,7 +196,7 @@ impl SafekeeperNode {
|
||||
]);
|
||||
}
|
||||
|
||||
args.extend(extra_opts);
|
||||
args.extend_from_slice(extra_opts);
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
|
||||
@@ -347,7 +347,7 @@ impl StorageController {
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
let initdb_args = [
|
||||
"-D",
|
||||
"--pgdata",
|
||||
pg_data_path.as_ref(),
|
||||
"--username",
|
||||
&username(),
|
||||
|
||||
@@ -50,6 +50,16 @@ pub struct ComputeSpec {
|
||||
#[serde(default)]
|
||||
pub swap_size_bytes: Option<u64>,
|
||||
|
||||
/// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs
|
||||
/// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the
|
||||
/// spec is first received.
|
||||
///
|
||||
/// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's
|
||||
/// spec generation doesn't need to be aware of the actual compute it's running on, while
|
||||
/// guaranteeing gradual rollout of disk quota.
|
||||
#[serde(default)]
|
||||
pub disk_quota_bytes: Option<u64>,
|
||||
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
@@ -268,6 +278,22 @@ pub struct GenericOption {
|
||||
/// declare a `trait` on it.
|
||||
pub type GenericOptions = Option<Vec<GenericOption>>;
|
||||
|
||||
/// Configured the local-proxy application with the relevant JWKS and roles it should
|
||||
/// use for authorizing connect requests using JWT.
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct LocalProxySpec {
|
||||
pub jwks: Vec<JwksSettings>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct JwksSettings {
|
||||
pub id: String,
|
||||
pub role_names: Vec<String>,
|
||||
pub jwks_url: String,
|
||||
pub provider_name: String,
|
||||
pub jwt_audience: Option<String>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -104,7 +104,7 @@ pub struct ConfigToml {
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
|
||||
@@ -381,7 +381,7 @@ impl Default for ConfigToml {
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: None,
|
||||
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||
virtual_file_io_mode: None,
|
||||
|
||||
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
|
||||
|
||||
@@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse {
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(
|
||||
Copy,
|
||||
Clone,
|
||||
@@ -994,50 +992,49 @@ pub mod virtual_file {
|
||||
}
|
||||
|
||||
/// Direct IO modes for a pageserver.
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum DirectIoMode {
|
||||
/// Direct IO disabled (uses usual buffered IO).
|
||||
#[default]
|
||||
Disabled,
|
||||
/// Direct IO disabled (performs checks and perf simulations).
|
||||
Evaluate {
|
||||
/// Alignment check level
|
||||
alignment_check: DirectIoAlignmentCheckLevel,
|
||||
/// Latency padded for performance simulation.
|
||||
latency_padding: DirectIoLatencyPadding,
|
||||
},
|
||||
/// Direct IO enabled.
|
||||
Enabled {
|
||||
/// Actions to perform on alignment error.
|
||||
on_alignment_error: DirectIoOnAlignmentErrorAction,
|
||||
},
|
||||
#[derive(
|
||||
Copy,
|
||||
Clone,
|
||||
PartialEq,
|
||||
Eq,
|
||||
Hash,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
Debug,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[repr(u8)]
|
||||
pub enum IoMode {
|
||||
/// Uses buffered IO.
|
||||
Buffered,
|
||||
/// Uses direct IO, error out if the operation fails.
|
||||
#[cfg(target_os = "linux")]
|
||||
Direct,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DirectIoAlignmentCheckLevel {
|
||||
#[default]
|
||||
Error,
|
||||
Log,
|
||||
None,
|
||||
impl IoMode {
|
||||
pub const fn preferred() -> Self {
|
||||
if cfg!(target_os = "linux") {
|
||||
Self::Direct
|
||||
} else {
|
||||
Self::Buffered
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum DirectIoOnAlignmentErrorAction {
|
||||
Error,
|
||||
#[default]
|
||||
FallbackToBuffered,
|
||||
}
|
||||
impl TryFrom<u8> for IoMode {
|
||||
type Error = u8;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
|
||||
#[serde(tag = "type", rename_all = "kebab-case")]
|
||||
pub enum DirectIoLatencyPadding {
|
||||
/// Pad virtual file operations with IO to a fake file.
|
||||
FakeFileRW { path: PathBuf },
|
||||
#[default]
|
||||
None,
|
||||
fn try_from(value: u8) -> Result<Self, Self::Error> {
|
||||
Ok(match value {
|
||||
v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
|
||||
#[cfg(target_os = "linux")]
|
||||
v if v == (IoMode::Direct as u8) => IoMode::Direct,
|
||||
x => return Err(x),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String {
|
||||
}
|
||||
|
||||
fn log_query_error(query: &str, e: &QueryError) {
|
||||
// If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`.
|
||||
match e {
|
||||
QueryError::Disconnected(ConnectionError::Io(io_error)) => {
|
||||
if is_expected_io_error(io_error) {
|
||||
|
||||
@@ -93,9 +93,9 @@ impl Conf {
|
||||
);
|
||||
let output = self
|
||||
.new_pg_command("initdb")?
|
||||
.arg("-D")
|
||||
.arg("--pgdata")
|
||||
.arg(&self.datadir)
|
||||
.args(["-U", "postgres", "--no-instructions", "--no-sync"])
|
||||
.args(["--username", "postgres", "--no-instructions", "--no-sync"])
|
||||
.output()?;
|
||||
debug!("initdb output: {:?}", output);
|
||||
ensure!(
|
||||
|
||||
@@ -164,12 +164,10 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||
));
|
||||
virtual_file::init(
|
||||
16384,
|
||||
virtual_file::io_engine_for_bench(),
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
virtual_file::init(16384, virtual_file::io_engine_for_bench(), align);
|
||||
page_cache::init(conf.page_cache_size, align);
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-small-values");
|
||||
|
||||
@@ -550,6 +550,19 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// Configs io mode at runtime.
|
||||
pub async fn put_io_mode(
|
||||
&self,
|
||||
mode: &pageserver_api::models::virtual_file::IoMode,
|
||||
) -> Result<()> {
|
||||
let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, uri, mode)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
@@ -736,4 +749,22 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_init_lsn_lease(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<LsnLease> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease",
|
||||
self.mgmt_api_endpoint,
|
||||
);
|
||||
|
||||
self.request(Method::POST, &uri, LsnLeaseRequest { lsn })
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,13 +151,10 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
|
||||
pageserver::page_cache::init(100, align);
|
||||
|
||||
let mut total_delta_layers = 0usize;
|
||||
let mut total_image_layers = 0usize;
|
||||
|
||||
@@ -59,8 +59,9 @@ pub(crate) enum LayerCmd {
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
|
||||
page_cache::init(100);
|
||||
let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
|
||||
page_cache::init(100, align);
|
||||
let file = VirtualFile::open(path, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
let block_reader = FileBlockReader::new(&file, file_id);
|
||||
@@ -190,12 +191,10 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
} => {
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
|
||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
|
||||
pageserver::page_cache::init(100, align);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
|
||||
@@ -205,12 +205,9 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
|
||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(100);
|
||||
let align = DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
|
||||
page_cache::init(100, align);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
}
|
||||
|
||||
@@ -63,6 +63,10 @@ pub(crate) struct Args {
|
||||
#[clap(long)]
|
||||
set_io_alignment: Option<usize>,
|
||||
|
||||
/// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct).
|
||||
#[clap(long)]
|
||||
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
@@ -133,6 +137,10 @@ async fn main_impl(
|
||||
mgmt_api_client.put_io_alignment(align).await?;
|
||||
}
|
||||
|
||||
if let Some(mode) = &args.set_io_mode {
|
||||
mgmt_api_client.put_io_mode(mode).await?;
|
||||
}
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
|
||||
@@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command};
|
||||
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::config::PageserverIdentity;
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::controller_upcall_client::ControllerUpcallClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
|
||||
@@ -125,7 +125,7 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.virtual_file_io_mode, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
@@ -173,7 +173,7 @@ fn main() -> anyhow::Result<()> {
|
||||
conf.virtual_file_io_engine,
|
||||
conf.io_buffer_alignment,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
page_cache::init(conf.page_cache_size, conf.io_buffer_alignment);
|
||||
|
||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||
|
||||
@@ -396,7 +396,7 @@ fn start_pageserver(
|
||||
// Set up deletion queue
|
||||
let (deletion_queue, deletion_workers) = DeletionQueue::new(
|
||||
remote_storage.clone(),
|
||||
ControlPlaneClient::new(conf, &shutdown_pageserver),
|
||||
ControllerUpcallClient::new(conf, &shutdown_pageserver),
|
||||
conf,
|
||||
);
|
||||
if let Some(deletion_workers) = deletion_workers {
|
||||
|
||||
@@ -174,7 +174,7 @@ pub struct PageServerConf {
|
||||
pub l0_flush: crate::l0_flush::L0FlushConfig,
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||
pub virtual_file_io_mode: virtual_file::IoMode,
|
||||
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
@@ -325,7 +325,7 @@ impl PageServerConf {
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
l0_flush,
|
||||
virtual_file_direct_io,
|
||||
virtual_file_io_mode,
|
||||
concurrent_tenant_warmup,
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
virtual_file_io_engine,
|
||||
@@ -368,7 +368,6 @@ impl PageServerConf {
|
||||
max_vectored_read_bytes,
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
virtual_file_direct_io,
|
||||
io_buffer_alignment,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
@@ -408,6 +407,7 @@ impl PageServerConf {
|
||||
l0_flush: l0_flush
|
||||
.map(crate::l0_flush::L0FlushConfig::from)
|
||||
.unwrap_or_default(),
|
||||
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
@@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||
use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
|
||||
use pageserver_api::config::NodeMetadata;
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
pub struct ControlPlaneClient {
|
||||
/// The Pageserver's client for using the storage controller upcall API: this is a small API
|
||||
/// for dealing with generations (see docs/rfcs/025-generation-numbers.md).
|
||||
///
|
||||
/// The server presenting this API may either be the storage controller or some other
|
||||
/// service (such as the Neon control plane) providing a store of generation numbers.
|
||||
pub struct ControllerUpcallClient {
|
||||
http_client: reqwest::Client,
|
||||
base_url: Url,
|
||||
node_id: NodeId,
|
||||
@@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi {
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
|
||||
}
|
||||
|
||||
impl ControlPlaneClient {
|
||||
impl ControllerUpcallClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
|
||||
@@ -114,7 +117,7 @@ impl ControlPlaneClient {
|
||||
}
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(
|
||||
&self,
|
||||
@@ -216,29 +219,38 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
.join("validate")
|
||||
.expect("Failed to build validate path");
|
||||
|
||||
let request = ValidateRequest {
|
||||
tenants: tenants
|
||||
.into_iter()
|
||||
.map(|(id, gen)| ValidateRequestTenant {
|
||||
id,
|
||||
gen: gen
|
||||
.into()
|
||||
.expect("Generation should always be valid for a Tenant doing deletions"),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
// When sending validate requests, break them up into chunks so that we
|
||||
// avoid possible edge cases of generating any HTTP requests that
|
||||
// require database I/O across many thousands of tenants.
|
||||
let mut result: HashMap<TenantShardId, bool> = HashMap::with_capacity(tenants.len());
|
||||
for tenant_chunk in (tenants).chunks(128) {
|
||||
let request = ValidateRequest {
|
||||
tenants: tenant_chunk
|
||||
.iter()
|
||||
.map(|(id, generation)| ValidateRequestTenant {
|
||||
id: *id,
|
||||
gen: (*generation).into().expect(
|
||||
"Generation should always be valid for a Tenant doing deletions",
|
||||
),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"control-plane-client-validate-sleep",
|
||||
&self.cancel
|
||||
);
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
}
|
||||
|
||||
let response: ValidateResponse =
|
||||
self.retry_http_forever(&re_attach_path, request).await?;
|
||||
for rt in response.tenants {
|
||||
result.insert(rt.id, rt.valid);
|
||||
}
|
||||
}
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|rt| (rt.id, rt.valid))
|
||||
.collect())
|
||||
Ok(result.into_iter().collect())
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
@@ -622,7 +622,7 @@ impl DeletionQueue {
|
||||
/// If remote_storage is None, then the returned workers will also be None.
|
||||
pub fn new<C>(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
control_plane_client: Option<C>,
|
||||
controller_upcall_client: Option<C>,
|
||||
conf: &'static PageServerConf,
|
||||
) -> (Self, Option<DeletionQueueWorkers<C>>)
|
||||
where
|
||||
@@ -662,7 +662,7 @@ impl DeletionQueue {
|
||||
conf,
|
||||
backend_rx,
|
||||
executor_tx,
|
||||
control_plane_client,
|
||||
controller_upcall_client,
|
||||
lsn_table.clone(),
|
||||
cancel.clone(),
|
||||
),
|
||||
@@ -704,7 +704,7 @@ mod test {
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
use crate::{
|
||||
control_plane_client::RetryForeverError,
|
||||
controller_upcall_client::RetryForeverError,
|
||||
repository::Key,
|
||||
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
|
||||
};
|
||||
|
||||
@@ -25,8 +25,8 @@ use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::RetryForeverError;
|
||||
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
|
||||
use crate::controller_upcall_client::RetryForeverError;
|
||||
use crate::metrics;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
@@ -61,7 +61,7 @@ where
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
|
||||
// Client for calling into control plane API for validation of deletes
|
||||
control_plane_client: Option<C>,
|
||||
controller_upcall_client: Option<C>,
|
||||
|
||||
// DeletionLists which are waiting generation validation. Not safe to
|
||||
// execute until [`validate`] has processed them.
|
||||
@@ -94,7 +94,7 @@ where
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
control_plane_client: Option<C>,
|
||||
controller_upcall_client: Option<C>,
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
@@ -102,7 +102,7 @@ where
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
control_plane_client,
|
||||
controller_upcall_client,
|
||||
lsn_table,
|
||||
pending_lists: Vec::new(),
|
||||
validated_lists: Vec::new(),
|
||||
@@ -145,8 +145,8 @@ where
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
|
||||
match control_plane_client
|
||||
let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
|
||||
match controller_upcall_client
|
||||
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
|
||||
.await
|
||||
{
|
||||
|
||||
@@ -17,6 +17,7 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::virtual_file::IoMode;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
@@ -824,7 +825,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
|
||||
let lease = if with_lease {
|
||||
timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
|
||||
.init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
|
||||
.inspect_err(|_| {
|
||||
warn!("fail to grant a lease to {}", lsn);
|
||||
})
|
||||
@@ -1692,9 +1693,18 @@ async fn lsn_lease_handler(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let result = timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
|
||||
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
|
||||
|
||||
let result = async {
|
||||
timeline
|
||||
.init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(
|
||||
e.context(format!("invalid lsn lease request at {lsn}")),
|
||||
)
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
@@ -2372,6 +2382,16 @@ async fn put_io_alignment_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_mode_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let mode: IoMode = json_request(&mut r).await?;
|
||||
crate::virtual_file::set_io_mode(mode);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Polled by control plane.
|
||||
///
|
||||
/// See [`crate::utilization`].
|
||||
@@ -3062,6 +3082,7 @@ pub fn make_router(
|
||||
.put("/v1/io_alignment", |r| {
|
||||
api_handler(r, put_io_alignment_handler)
|
||||
})
|
||||
.put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
|
||||
@@ -6,7 +6,7 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
pub mod control_plane_client;
|
||||
pub mod controller_upcall_client;
|
||||
pub mod deletion_queue;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
|
||||
@@ -8,6 +8,8 @@ use metrics::{
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{is_expected_io_error, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use strum::{EnumCount, VariantNames};
|
||||
use strum_macros::{IntoStaticStr, VariantNames};
|
||||
use tracing::warn;
|
||||
@@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
pub(crate) struct BasebackupQueryTime {
|
||||
ok: Histogram,
|
||||
error: Histogram,
|
||||
client_error: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
@@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
BasebackupQueryTime {
|
||||
ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
|
||||
error: vec.get_metric_with_label_values(&["error"]).unwrap(),
|
||||
client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1557,7 +1561,7 @@ impl BasebackupQueryTime {
|
||||
}
|
||||
|
||||
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||
pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
@@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
let metric = if res.is_ok() {
|
||||
&self.parent.ok
|
||||
} else {
|
||||
&self.parent.error
|
||||
// If you want to change categorize of a specific error, also change it in `log_query_error`.
|
||||
let metric = match res {
|
||||
Ok(_) => &self.parent.ok,
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
|
||||
if is_expected_io_error(io_error) =>
|
||||
{
|
||||
&self.parent.client_error
|
||||
}
|
||||
Err(_) => &self.parent.error,
|
||||
};
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
|
||||
@@ -82,6 +82,7 @@ use once_cell::sync::OnceCell;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
|
||||
virtual_file::{self, dio::IoBufferMut},
|
||||
};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
@@ -90,8 +91,8 @@ const TEST_PAGE_CACHE_SIZE: usize = 50;
|
||||
///
|
||||
/// Initialize the page cache. This must be called once at page server startup.
|
||||
///
|
||||
pub fn init(size: usize) {
|
||||
if PAGE_CACHE.set(PageCache::new(size)).is_err() {
|
||||
pub fn init(size: usize, align: usize) {
|
||||
if PAGE_CACHE.set(PageCache::new(size, align)).is_err() {
|
||||
panic!("page cache already initialized");
|
||||
}
|
||||
}
|
||||
@@ -106,7 +107,12 @@ pub fn get() -> &'static PageCache {
|
||||
// page cache is usable in unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
|
||||
PAGE_CACHE.get_or_init(|| {
|
||||
PageCache::new(
|
||||
TEST_PAGE_CACHE_SIZE,
|
||||
virtual_file::get_io_buffer_alignment(),
|
||||
)
|
||||
})
|
||||
} else {
|
||||
PAGE_CACHE.get().expect("page cache not initialized")
|
||||
}
|
||||
@@ -637,13 +643,11 @@ impl PageCache {
|
||||
/// Initialize a new page cache
|
||||
///
|
||||
/// This should be called only once at page server startup.
|
||||
fn new(num_pages: usize) -> Self {
|
||||
fn new(num_pages: usize, align: usize) -> Self {
|
||||
assert!(num_pages > 0, "page cache size must be > 0");
|
||||
|
||||
// We could use Vec::leak here, but that potentially also leaks
|
||||
// uninitialized reserved capacity. With into_boxed_slice and Box::leak
|
||||
// this is avoided.
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
let page_buffer =
|
||||
IoBufferMut::with_capacity_aligned_zeroed(num_pages * PAGE_SZ, align).leak();
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
size_metrics.max_bytes.set_page_sz(num_pages);
|
||||
|
||||
@@ -273,10 +273,20 @@ async fn page_service_conn_main(
|
||||
info!("Postgres client disconnected ({io_error})");
|
||||
Ok(())
|
||||
} else {
|
||||
Err(io_error).context("Postgres connection error")
|
||||
let tenant_id = conn_handler.timeline_handles.tenant_id();
|
||||
Err(io_error).context(format!(
|
||||
"Postgres connection error for tenant_id={:?} client at peer_addr={}",
|
||||
tenant_id, peer_addr
|
||||
))
|
||||
}
|
||||
}
|
||||
other => other.context("Postgres query error"),
|
||||
other => {
|
||||
let tenant_id = conn_handler.timeline_handles.tenant_id();
|
||||
other.context(format!(
|
||||
"Postgres query error for tenant_id={:?} client peer_addr={}",
|
||||
tenant_id, peer_addr
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -340,6 +350,10 @@ impl TimelineHandles {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn tenant_id(&self) -> Option<TenantId> {
|
||||
self.wrapper.tenant_id.get().copied()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct TenantManagerWrapper {
|
||||
@@ -819,7 +833,7 @@ impl PageServerHandler {
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
|
||||
let lease = timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||
.renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||
.inspect_err(|e| {
|
||||
warn!("{e}");
|
||||
})
|
||||
@@ -997,7 +1011,6 @@ impl PageServerHandler {
|
||||
)
|
||||
.await?;
|
||||
|
||||
tracing::info!("get_rel_page_at_lsn: {lsn}");
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
|
||||
@@ -21,6 +21,7 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
@@ -182,27 +183,54 @@ pub struct TenantSharedResources {
|
||||
pub(super) struct AttachedTenantConf {
|
||||
tenant_conf: TenantConfOpt,
|
||||
location: AttachedLocationConfig,
|
||||
/// The deadline before which we are blocked from GC so that
|
||||
/// leases have a chance to be renewed.
|
||||
lsn_lease_deadline: Option<tokio::time::Instant>,
|
||||
}
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
|
||||
// Sets a deadline before which we cannot proceed to GC due to lsn lease.
|
||||
//
|
||||
// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
|
||||
// length, we guarantee that all the leases we granted before will have a chance to renew
|
||||
// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
|
||||
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
|
||||
Some(
|
||||
tokio::time::Instant::now()
|
||||
+ tenant_conf
|
||||
.lsn_lease_length
|
||||
.unwrap_or(LsnLease::DEFAULT_LENGTH),
|
||||
)
|
||||
} else {
|
||||
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
|
||||
// because we don't do GC in these modes.
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
tenant_conf,
|
||||
location,
|
||||
lsn_lease_deadline,
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => Ok(Self {
|
||||
tenant_conf: location_conf.tenant_conf,
|
||||
location: *attach_conf,
|
||||
}),
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
|
||||
self.lsn_lease_deadline
|
||||
.map(|d| tokio::time::Instant::now() < d)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
struct TimelinePreload {
|
||||
timeline_id: TimelineId,
|
||||
@@ -1822,6 +1850,11 @@ impl Tenant {
|
||||
info!("Skipping GC in location state {:?}", conf.location);
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
|
||||
if conf.is_gc_blocked_by_lsn_lease_deadline() {
|
||||
info!("Skipping GC because lsn lease deadline is not reached");
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
}
|
||||
|
||||
let _guard = match self.gc_block.start().await {
|
||||
@@ -2630,6 +2663,8 @@ impl Tenant {
|
||||
Arc::new(AttachedTenantConf {
|
||||
tenant_conf: new_tenant_conf.clone(),
|
||||
location: inner.location,
|
||||
// Attached location is not changed, no need to update lsn lease deadline.
|
||||
lsn_lease_deadline: inner.lsn_lease_deadline,
|
||||
})
|
||||
});
|
||||
|
||||
@@ -3887,9 +3922,9 @@ async fn run_initdb(
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
|
||||
.args(["-D", initdb_target_dir.as_ref()])
|
||||
.args(["-U", &conf.superuser])
|
||||
.args(["-E", "utf8"])
|
||||
.args(["--pgdata", initdb_target_dir.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
@@ -4461,13 +4496,17 @@ mod tests {
|
||||
tline.freeze_and_flush().await.map_err(|e| e.into())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
|
||||
.await?
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
@@ -7244,9 +7283,17 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_lsn_lease() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
|
||||
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
|
||||
.await
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
// Advance to the lsn lease deadline so that GC is not blocked by
|
||||
// initial transition into AttachedSingle.
|
||||
tokio::time::advance(tenant.get_lsn_lease_length()).await;
|
||||
tokio::time::resume();
|
||||
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
|
||||
let end_lsn = Lsn(0x100);
|
||||
@@ -7274,24 +7321,33 @@ mod tests {
|
||||
|
||||
let leased_lsns = [0x30, 0x50, 0x70];
|
||||
let mut leases = Vec::new();
|
||||
let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
|
||||
leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
|
||||
Ok(())
|
||||
leased_lsns.iter().for_each(|n| {
|
||||
leases.push(
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect("lease request should succeed"),
|
||||
);
|
||||
});
|
||||
|
||||
// Renewing with shorter lease should not change the lease.
|
||||
let updated_lease_0 =
|
||||
timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
|
||||
assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
|
||||
let updated_lease_0 = timeline
|
||||
.renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)
|
||||
.expect("lease renewal should succeed");
|
||||
assert_eq!(
|
||||
updated_lease_0.valid_until, leases[0].valid_until,
|
||||
" Renewing with shorter lease should not change the lease."
|
||||
);
|
||||
|
||||
// Renewing with a long lease should renew lease with later expiration time.
|
||||
let updated_lease_1 = timeline.make_lsn_lease(
|
||||
Lsn(leased_lsns[1]),
|
||||
timeline.get_lsn_lease_length() * 2,
|
||||
&ctx,
|
||||
)?;
|
||||
|
||||
assert!(updated_lease_1.valid_until > leases[1].valid_until);
|
||||
let updated_lease_1 = timeline
|
||||
.renew_lsn_lease(
|
||||
Lsn(leased_lsns[1]),
|
||||
timeline.get_lsn_lease_length() * 2,
|
||||
&ctx,
|
||||
)
|
||||
.expect("lease renewal should succeed");
|
||||
assert!(
|
||||
updated_lease_1.valid_until > leases[1].valid_until,
|
||||
"Renewing with a long lease should renew lease with later expiration time."
|
||||
);
|
||||
|
||||
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
|
||||
info!(
|
||||
@@ -7308,7 +7364,8 @@ mod tests {
|
||||
&CancellationToken::new(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Keeping everything <= Lsn(0x80) b/c leases:
|
||||
// 0/10: initdb layer
|
||||
@@ -7322,13 +7379,16 @@ mod tests {
|
||||
// Make lease on a already GC-ed LSN.
|
||||
// 0/80 does not have a valid lease + is below latest_gc_cutoff
|
||||
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
|
||||
let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
|
||||
assert!(res.is_err());
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect_err("lease request on GC-ed LSN should fail");
|
||||
|
||||
// Should still be able to renew a currently valid lease
|
||||
// Assumption: original lease to is still valid for 0/50.
|
||||
let _ =
|
||||
timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;
|
||||
// (use `Timeline::init_lsn_lease` for testing so it always does validation)
|
||||
timeline
|
||||
.init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)
|
||||
.expect("lease renewal with validation should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -84,7 +84,7 @@ impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let path = &self.buffered_writer.as_inner().as_inner().path;
|
||||
let path = self.buffered_writer.as_inner().as_inner().path();
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
@@ -356,7 +356,7 @@ mod tests {
|
||||
}
|
||||
|
||||
let file_contents =
|
||||
std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
|
||||
std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
|
||||
assert_eq!(file_contents, &content[0..cap]);
|
||||
|
||||
let buffer_contents = file.buffered_writer.inspect_buffer();
|
||||
@@ -392,7 +392,7 @@ mod tests {
|
||||
.buffered_writer
|
||||
.as_inner()
|
||||
.as_inner()
|
||||
.path
|
||||
.path()
|
||||
.metadata()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,29 +1,12 @@
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
use tokio::time::Instant;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
|
||||
#[derive(Default)]
|
||||
struct Storage {
|
||||
timelines_blocked: TimelinesBlocked,
|
||||
/// The deadline before which we are blocked from GC so that
|
||||
/// leases have a chance to be renewed.
|
||||
lsn_lease_deadline: Option<Instant>,
|
||||
}
|
||||
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
|
||||
impl Storage {
|
||||
fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
|
||||
self.lsn_lease_deadline
|
||||
.map(|d| Instant::now() < d)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
|
||||
/// blocking.
|
||||
/// GcBlock provides persistent (per-timeline) gc blocking.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcBlock {
|
||||
/// The timelines which have current reasons to block gc.
|
||||
@@ -66,17 +49,6 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets a deadline before which we cannot proceed to GC due to lsn lease.
|
||||
///
|
||||
/// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
|
||||
/// length, we guarantee that all the leases we granted before will have a chance to renew
|
||||
/// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
|
||||
pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
|
||||
let deadline = Instant::now() + lsn_lease_length;
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
g.lsn_lease_deadline = Some(deadline);
|
||||
}
|
||||
|
||||
/// Describe the current gc blocking reasons.
|
||||
///
|
||||
/// TODO: make this json serializable.
|
||||
@@ -102,7 +74,7 @@ impl GcBlock {
|
||||
) -> anyhow::Result<bool> {
|
||||
let (added, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
|
||||
let set = g.entry(timeline.timeline_id).or_default();
|
||||
let added = set.insert(reason);
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock, see self.reasons.
|
||||
@@ -133,7 +105,7 @@ impl GcBlock {
|
||||
|
||||
let (remaining_blocks, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
match g.timelines_blocked.entry(timeline.timeline_id) {
|
||||
match g.entry(timeline.timeline_id) {
|
||||
Entry::Occupied(mut oe) => {
|
||||
let set = oe.get_mut();
|
||||
set.remove(reason);
|
||||
@@ -147,7 +119,7 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
let remaining_blocks = g.timelines_blocked.len();
|
||||
let remaining_blocks = g.len();
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
|
||||
let uploaded = timeline
|
||||
@@ -172,11 +144,11 @@ impl GcBlock {
|
||||
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
|
||||
let unblocked = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
if g.timelines_blocked.is_empty() {
|
||||
if g.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
g.timelines_blocked.remove(&timeline.timeline_id);
|
||||
g.remove(&timeline.timeline_id);
|
||||
|
||||
BlockingReasons::clean_and_summarize(g).is_none()
|
||||
};
|
||||
@@ -187,11 +159,10 @@ impl GcBlock {
|
||||
}
|
||||
|
||||
/// Initialize with the non-deleted timelines of this tenant.
|
||||
pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
|
||||
pub(crate) fn set_scanned(&self, scanned: Storage) {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
assert!(g.timelines_blocked.is_empty());
|
||||
g.timelines_blocked
|
||||
.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
assert!(g.is_empty());
|
||||
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
|
||||
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
|
||||
tracing::info!(summary=?reasons, "initialized with gc blocked");
|
||||
@@ -205,7 +176,6 @@ pub(super) struct Guard<'a> {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct BlockingReasons {
|
||||
tenant_blocked_by_lsn_lease_deadline: bool,
|
||||
timelines: usize,
|
||||
reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
@@ -214,8 +184,8 @@ impl std::fmt::Display for BlockingReasons {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
|
||||
self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
|
||||
"{} timelines block for {:?}",
|
||||
self.timelines, self.reasons
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -223,15 +193,13 @@ impl std::fmt::Display for BlockingReasons {
|
||||
impl BlockingReasons {
|
||||
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let mut reasons = enumset::EnumSet::empty();
|
||||
g.timelines_blocked.retain(|_key, value| {
|
||||
g.retain(|_key, value| {
|
||||
reasons = reasons.union(*value);
|
||||
!value.is_empty()
|
||||
});
|
||||
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
|
||||
if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
|
||||
if !g.is_empty() {
|
||||
Some(BlockingReasons {
|
||||
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
|
||||
timelines: g.timelines_blocked.len(),
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
} else {
|
||||
@@ -240,17 +208,14 @@ impl BlockingReasons {
|
||||
}
|
||||
|
||||
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
|
||||
if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
|
||||
if g.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let reasons = g
|
||||
.timelines_blocked
|
||||
.values()
|
||||
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
|
||||
Some(BlockingReasons {
|
||||
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
|
||||
timelines: g.timelines_blocked.len(),
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1470,52 +1470,4 @@ mod tests {
|
||||
LayerVisibilityHint::Visible
|
||||
));
|
||||
}
|
||||
|
||||
/// Exercise edge case of querying at exactly the LSN of an image layer
|
||||
#[test]
|
||||
fn layer_search_at_image_lsn() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let last_record_lsn = Lsn::from_hex("00000000DEADBEEF").unwrap();
|
||||
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
|
||||
let image_layer = PersistentLayerDesc {
|
||||
key_range: Key::from_i128(0)..Key::from_i128(i128::MAX),
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(last_record_lsn),
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
is_delta: false,
|
||||
file_size: 123,
|
||||
};
|
||||
|
||||
let delta_layer = PersistentLayerDesc {
|
||||
key_range: Key::from_i128(0)..Key::from_i128(i128::MAX),
|
||||
lsn_range: Lsn(0)..Lsn(0xdead0000),
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
is_delta: true,
|
||||
file_size: 123,
|
||||
};
|
||||
|
||||
updates.insert_historic(image_layer.clone());
|
||||
updates.insert_historic(delta_layer);
|
||||
|
||||
updates.flush();
|
||||
|
||||
// FIXME: according to the search() docstring, it searches for layers with start LSNs _less then_
|
||||
// `end_lsn` -- i.e. it's correct that if you ask for exactly the LSN of an image layer, it shouldn't hit
|
||||
// it. However, the way that page_service calls it is to take the last_record_lsn of a Timeline
|
||||
// and pass that directly into LayerMap::search().
|
||||
|
||||
let searched = layer_map
|
||||
.search(Key::from_i128(12345), last_record_lsn)
|
||||
.unwrap();
|
||||
|
||||
// We searched at the LSN of the image layer: we should hit it
|
||||
assert_eq!(searched.layer.as_ref(), &image_layer);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::control_plane_client::{
|
||||
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
|
||||
use crate::controller_upcall_client::{
|
||||
ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
@@ -122,7 +122,7 @@ pub(crate) enum ShardSelector {
|
||||
Known(ShardIndex),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
/// A convenience for use with the re_attach ControllerUpcallClient function: rather
|
||||
/// than the serializable struct, we build this enum that encapsulates
|
||||
/// the invariant that attached tenants always have generations.
|
||||
///
|
||||
@@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
+ TEMP_FILE_SUFFIX;
|
||||
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
|
||||
fs::rename(path.as_ref(), &tmp_path).await?;
|
||||
fs::File::open(parent).await?.sync_all().await?;
|
||||
fs::File::open(parent)
|
||||
.await?
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("safe_rename_tenant_dir")?;
|
||||
Ok(tmp_path)
|
||||
}
|
||||
|
||||
@@ -341,7 +345,7 @@ async fn init_load_generations(
|
||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||
);
|
||||
emergency_generations(tenant_confs)
|
||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
||||
} else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
@@ -949,12 +953,6 @@ impl TenantManager {
|
||||
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
|
||||
match attach_conf.generation.cmp(&tenant.generation) {
|
||||
Ordering::Equal => {
|
||||
if attach_conf.attach_mode == AttachmentMode::Single {
|
||||
tenant
|
||||
.gc_block
|
||||
.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
|
||||
}
|
||||
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
|
||||
@@ -178,6 +178,7 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("flush source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -185,6 +186,7 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -232,6 +234,7 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
|
||||
@@ -433,7 +433,6 @@ impl ReadableLayer {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
tracing::info!("get_values_reconstruct_data: {:?}", self.id());
|
||||
match self {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
layer
|
||||
|
||||
@@ -40,15 +40,15 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::dio::IoBufferMut;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
@@ -572,7 +572,7 @@ impl DeltaLayerWriterInner {
|
||||
ensure!(
|
||||
metadata.len() <= S3_UPLOAD_LIMIT,
|
||||
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
|
||||
file.path,
|
||||
file.path(),
|
||||
metadata.len()
|
||||
);
|
||||
|
||||
@@ -589,7 +589,9 @@ impl DeltaLayerWriterInner {
|
||||
);
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("delta_layer sync_all")?;
|
||||
|
||||
trace!("created delta layer {}", self.path);
|
||||
|
||||
@@ -788,7 +790,7 @@ impl DeltaLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
let file = VirtualFile::open_v2(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
|
||||
@@ -989,7 +991,8 @@ impl DeltaLayerInner {
|
||||
.0
|
||||
.into();
|
||||
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
@@ -1008,7 +1011,7 @@ impl DeltaLayerInner {
|
||||
blob_meta.key,
|
||||
PageReconstructError::Other(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
self.file.path(),
|
||||
kind
|
||||
)),
|
||||
);
|
||||
@@ -1016,7 +1019,7 @@ impl DeltaLayerInner {
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(buf_size));
|
||||
buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -1034,7 +1037,7 @@ impl DeltaLayerInner {
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path,
|
||||
self.file.path(),
|
||||
))),
|
||||
);
|
||||
|
||||
@@ -1052,7 +1055,7 @@ impl DeltaLayerInner {
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to deserialize blob from virtual file {}",
|
||||
self.file.path,
|
||||
self.file.path(),
|
||||
))),
|
||||
);
|
||||
|
||||
@@ -1133,7 +1136,7 @@ impl DeltaLayerInner {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
|
||||
BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended,
|
||||
};
|
||||
use futures::stream::TryStreamExt;
|
||||
|
||||
@@ -1183,15 +1186,15 @@ impl DeltaLayerInner {
|
||||
|
||||
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
|
||||
|
||||
let mut read_builder: Option<VectoredReadBuilder> = None;
|
||||
let read_mode = VectoredReadCoalesceMode::get();
|
||||
let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
|
||||
|
||||
let max_read_size = self
|
||||
.max_vectored_read_bytes
|
||||
.map(|x| x.0.get())
|
||||
.unwrap_or(8192);
|
||||
|
||||
let mut buffer = Some(BytesMut::with_capacity(max_read_size));
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mut buffer = Some(IoBufferMut::with_capacity_aligned(max_read_size, align));
|
||||
|
||||
// FIXME: buffering of DeltaLayerWriter
|
||||
let mut per_blob_copy = Vec::new();
|
||||
@@ -1228,12 +1231,12 @@ impl DeltaLayerInner {
|
||||
{
|
||||
None
|
||||
} else {
|
||||
read_builder.replace(VectoredReadBuilder::new(
|
||||
read_builder.replace(ChunkedVectoredReadBuilder::new(
|
||||
offsets.start.pos(),
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
max_read_size,
|
||||
read_mode,
|
||||
align,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -1550,12 +1553,12 @@ impl<'a> DeltaLayerIterator<'a> {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
|
||||
let mut next_batch = std::collections::VecDeque::new();
|
||||
let buf_size = plan.size();
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let blob_read = meta.read(&view).await?;
|
||||
let value = Value::des(&blob_read)?;
|
||||
@@ -1930,7 +1933,9 @@ pub(crate) mod test {
|
||||
&vectored_reads,
|
||||
constants::MAX_VECTORED_READ_BYTES,
|
||||
);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));
|
||||
|
||||
for read in vectored_reads {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
|
||||
@@ -40,11 +40,12 @@ use crate::tenant::vectored_blob_io::{
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::dio::IoBufferMut;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use itertools::Itertools;
|
||||
@@ -388,7 +389,7 @@ impl ImageLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path, ctx)
|
||||
let file = VirtualFile::open_v2(path, ctx)
|
||||
.await
|
||||
.context("open layer file")?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
@@ -542,14 +543,15 @@ impl ImageLayerInner {
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mut key_count = 0;
|
||||
for read in plan.into_iter() {
|
||||
let buf_size = read.size();
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
|
||||
let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await?;
|
||||
@@ -597,13 +599,13 @@ impl ImageLayerInner {
|
||||
);
|
||||
}
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
|
||||
|
||||
match res {
|
||||
Ok(blobs_buf) => {
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await;
|
||||
|
||||
@@ -614,7 +616,7 @@ impl ImageLayerInner {
|
||||
meta.meta.key,
|
||||
PageReconstructError::Other(anyhow!(e).context(format!(
|
||||
"Failed to decompress blob from virtual file {}",
|
||||
self.file.path,
|
||||
self.file.path(),
|
||||
))),
|
||||
);
|
||||
|
||||
@@ -635,7 +637,7 @@ impl ImageLayerInner {
|
||||
blob_meta.key,
|
||||
PageReconstructError::from(anyhow!(
|
||||
"Failed to read blobs from virtual file {}: {}",
|
||||
self.file.path,
|
||||
self.file.path(),
|
||||
kind
|
||||
)),
|
||||
);
|
||||
@@ -889,7 +891,9 @@ impl ImageLayerWriterInner {
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("image_layer sync_all")?;
|
||||
|
||||
trace!("created image layer {}", self.path);
|
||||
|
||||
@@ -1037,12 +1041,12 @@ impl<'a> ImageLayerIterator<'a> {
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
|
||||
let mut next_batch = std::collections::VecDeque::new();
|
||||
let buf_size = plan.size();
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&plan, buf, self.ctx)
|
||||
.await?;
|
||||
let frozen_buf = blobs_buf.buf.freeze();
|
||||
let view = BufView::new_bytes(frozen_buf);
|
||||
let view = BufView::new_slice(&blobs_buf.buf);
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let img_buf = meta.read(&view).await?;
|
||||
next_batch.push_back((
|
||||
|
||||
@@ -330,7 +330,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
|
||||
let mut first = true;
|
||||
tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
|
||||
@@ -66,6 +66,7 @@ use std::{
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
|
||||
@@ -1324,16 +1325,38 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Obtains a temporary lease blocking garbage collection for the given LSN.
|
||||
///
|
||||
/// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also
|
||||
/// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if
|
||||
/// the request extends the lease. The returned lease is therefore the maximum between the existing lease and
|
||||
/// the requesting lease.
|
||||
pub(crate) fn make_lsn_lease(
|
||||
/// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`.
|
||||
pub(crate) fn init_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
self.make_lsn_lease(lsn, length, true, ctx)
|
||||
}
|
||||
|
||||
/// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period.
|
||||
pub(crate) fn renew_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
self.make_lsn_lease(lsn, length, false, ctx)
|
||||
}
|
||||
|
||||
/// Obtains a temporary lease blocking garbage collection for the given LSN.
|
||||
///
|
||||
/// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error
|
||||
/// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present.
|
||||
///
|
||||
/// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease.
|
||||
/// The returned lease is therefore the maximum between the existing lease and the requesting lease.
|
||||
fn make_lsn_lease(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
length: Duration,
|
||||
init: bool,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<LsnLease> {
|
||||
let lease = {
|
||||
@@ -1347,8 +1370,8 @@ impl Timeline {
|
||||
|
||||
let entry = gc_info.leases.entry(lsn);
|
||||
|
||||
let lease = {
|
||||
if let Entry::Occupied(mut occupied) = entry {
|
||||
match entry {
|
||||
Entry::Occupied(mut occupied) => {
|
||||
let existing_lease = occupied.get_mut();
|
||||
if valid_until > existing_lease.valid_until {
|
||||
existing_lease.valid_until = valid_until;
|
||||
@@ -1360,20 +1383,28 @@ impl Timeline {
|
||||
}
|
||||
|
||||
existing_lease.clone()
|
||||
} else {
|
||||
// Reject already GC-ed LSN (lsn < latest_gc_cutoff)
|
||||
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
}
|
||||
Entry::Vacant(vacant) => {
|
||||
// Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
|
||||
// not blocked by the lsn lease deadline.
|
||||
let validate = {
|
||||
let conf = self.tenant_conf.load();
|
||||
conf.location.attach_mode == AttachmentMode::Single
|
||||
&& !conf.is_gc_blocked_by_lsn_lease_deadline()
|
||||
};
|
||||
|
||||
if init || validate {
|
||||
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
|
||||
if lsn < *latest_gc_cutoff_lsn {
|
||||
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
let dt: DateTime<Utc> = valid_until.into();
|
||||
info!("lease created, valid until {}", dt);
|
||||
entry.or_insert(LsnLease { valid_until }).clone()
|
||||
vacant.insert(LsnLease { valid_until }).clone()
|
||||
}
|
||||
};
|
||||
|
||||
lease
|
||||
}
|
||||
};
|
||||
|
||||
Ok(lease)
|
||||
@@ -1950,8 +1981,6 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
// TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
|
||||
#[allow(unused)]
|
||||
pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -3856,21 +3885,21 @@ impl Timeline {
|
||||
)));
|
||||
}
|
||||
|
||||
// let distance = lsn.0 - partition_lsn.0;
|
||||
// if *partition_lsn != Lsn(0)
|
||||
// && distance <= self.repartition_threshold
|
||||
// && !flags.contains(CompactFlags::ForceRepartition)
|
||||
// {
|
||||
// debug!(
|
||||
// distance,
|
||||
// threshold = self.repartition_threshold,
|
||||
// "no repartitioning needed"
|
||||
// );
|
||||
// return Ok((
|
||||
// (dense_partition.clone(), sparse_partition.clone()),
|
||||
// *partition_lsn,
|
||||
// ));
|
||||
// }
|
||||
let distance = lsn.0 - partition_lsn.0;
|
||||
if *partition_lsn != Lsn(0)
|
||||
&& distance <= self.repartition_threshold
|
||||
&& !flags.contains(CompactFlags::ForceRepartition)
|
||||
{
|
||||
debug!(
|
||||
distance,
|
||||
threshold = self.repartition_threshold,
|
||||
"no repartitioning needed"
|
||||
);
|
||||
return Ok((
|
||||
(dense_partition.clone(), sparse_partition.clone()),
|
||||
*partition_lsn,
|
||||
));
|
||||
}
|
||||
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
||||
@@ -5779,7 +5808,6 @@ impl<'a> TimelineWriter<'a> {
|
||||
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
|
||||
/// the latest and can be read.
|
||||
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
|
||||
tracing::info!("finish_write @ {new_lsn}");
|
||||
self.tl.finish_write(new_lsn);
|
||||
}
|
||||
|
||||
|
||||
@@ -364,10 +364,6 @@ impl Timeline {
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// "enough". Skip image layer creation if L0 compaction cannot keep up.
|
||||
if fully_compacted {
|
||||
tracing::info!(
|
||||
"create_image_layers @ {lsn} (latest {})",
|
||||
self.get_last_record_lsn()
|
||||
);
|
||||
let image_layers = self
|
||||
.create_image_layers(
|
||||
&partitioning,
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
@@ -27,6 +27,7 @@ use utils::vec_map::VecMap;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
|
||||
use crate::virtual_file::dio::IoBufferMut;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
|
||||
/// Metadata bundled with the start and end offset of a blob.
|
||||
@@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob {
|
||||
/// Return type of [`VectoredBlobReader::read_blobs`]
|
||||
pub struct VectoredBlobsBuf {
|
||||
/// Buffer for all blobs in this read
|
||||
pub buf: BytesMut,
|
||||
pub buf: IoBufferMut,
|
||||
/// Offsets into the buffer and metadata for all blobs in this read
|
||||
pub blobs: Vec<VectoredBlob>,
|
||||
}
|
||||
@@ -185,171 +186,7 @@ pub(crate) enum VectoredReadExtended {
|
||||
No,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum VectoredReadCoalesceMode {
|
||||
/// Only coalesce exactly adjacent reads.
|
||||
AdjacentOnly,
|
||||
/// In addition to adjacent reads, also consider reads whose corresponding
|
||||
/// `end` and `start` offsets reside at the same chunk.
|
||||
Chunked(usize),
|
||||
}
|
||||
|
||||
impl VectoredReadCoalesceMode {
|
||||
/// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
|
||||
/// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
|
||||
pub(crate) fn get() -> Self {
|
||||
let align = virtual_file::get_io_buffer_alignment_raw();
|
||||
if align == 0 {
|
||||
VectoredReadCoalesceMode::AdjacentOnly
|
||||
} else {
|
||||
VectoredReadCoalesceMode::Chunked(align)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum VectoredReadBuilder {
|
||||
Adjacent(AdjacentVectoredReadBuilder),
|
||||
Chunked(ChunkedVectoredReadBuilder),
|
||||
}
|
||||
|
||||
impl VectoredReadBuilder {
|
||||
fn new_impl(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
match mode {
|
||||
VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
|
||||
AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
|
||||
),
|
||||
VectoredReadCoalesceMode::Chunked(chunk_size) => {
|
||||
Self::Chunked(ChunkedVectoredReadBuilder::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
meta,
|
||||
max_read_size,
|
||||
chunk_size,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
|
||||
}
|
||||
|
||||
pub(crate) fn new_streaming(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, None, mode)
|
||||
}
|
||||
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.build(),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.build(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.size(),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.size(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct AdjacentVectoredReadBuilder {
|
||||
/// Start offset of the read.
|
||||
start: u64,
|
||||
// End offset of the read.
|
||||
end: u64,
|
||||
/// Start offset and metadata for each blob in this read
|
||||
blobs_at: VecMap<u64, BlobMeta>,
|
||||
max_read_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl AdjacentVectoredReadBuilder {
|
||||
/// Start building a new vectored read.
|
||||
///
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
/// support reading larger blobs than the configuration value. The builder will be single use
|
||||
/// however after that.
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, meta)
|
||||
.expect("First insertion always succeeds");
|
||||
|
||||
Self {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size,
|
||||
}
|
||||
}
|
||||
/// Attempt to extend the current read with a new blob if the start
|
||||
/// offset matches with the current end of the vectored read
|
||||
/// and the resuting size is below the max read size
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
tracing::trace!(start, end, "trying to extend");
|
||||
let size = (end - start) as usize;
|
||||
let not_limited_by_max_read_size = {
|
||||
if let Some(max_read_size) = self.max_read_size {
|
||||
self.size() + size <= max_read_size
|
||||
} else {
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
if self.end == start && not_limited_by_max_read_size {
|
||||
self.end = end;
|
||||
self.blobs_at
|
||||
.append(start, meta)
|
||||
.expect("LSNs are ordered within vectored reads");
|
||||
|
||||
return VectoredReadExtended::Yes;
|
||||
}
|
||||
|
||||
VectoredReadExtended::No
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
VectoredRead {
|
||||
start: self.start,
|
||||
end: self.end,
|
||||
blobs_at: self.blobs_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A vectored read builder that tries to coalesce all reads that fits in a chunk.
|
||||
pub(crate) struct ChunkedVectoredReadBuilder {
|
||||
/// Start block number
|
||||
start_blk_no: usize,
|
||||
@@ -373,7 +210,7 @@ impl ChunkedVectoredReadBuilder {
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
/// support reading larger blobs than the configuration value. The builder will be single use
|
||||
/// however after that.
|
||||
pub(crate) fn new(
|
||||
fn new_impl(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
@@ -396,6 +233,25 @@ impl ChunkedVectoredReadBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
align: usize,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align)
|
||||
}
|
||||
|
||||
pub(crate) fn new_streaming(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
align: usize,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, None, align)
|
||||
}
|
||||
|
||||
/// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
|
||||
///
|
||||
/// The resulting size also must be below the max read size.
|
||||
@@ -474,17 +330,17 @@ pub struct VectoredReadPlanner {
|
||||
|
||||
max_read_size: usize,
|
||||
|
||||
mode: VectoredReadCoalesceMode,
|
||||
align: usize,
|
||||
}
|
||||
|
||||
impl VectoredReadPlanner {
|
||||
pub fn new(max_read_size: usize) -> Self {
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size,
|
||||
mode,
|
||||
align,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -545,7 +401,7 @@ impl VectoredReadPlanner {
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Vec<VectoredRead> {
|
||||
let mut current_read_builder: Option<VectoredReadBuilder> = None;
|
||||
let mut current_read_builder: Option<ChunkedVectoredReadBuilder> = None;
|
||||
let mut reads = Vec::new();
|
||||
|
||||
for (key, blobs_for_key) in self.blobs {
|
||||
@@ -558,12 +414,12 @@ impl VectoredReadPlanner {
|
||||
};
|
||||
|
||||
if extended == VectoredReadExtended::No {
|
||||
let next_read_builder = VectoredReadBuilder::new(
|
||||
let next_read_builder = ChunkedVectoredReadBuilder::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.max_read_size,
|
||||
self.mode,
|
||||
self.align,
|
||||
);
|
||||
|
||||
let prev_read_builder = current_read_builder.replace(next_read_builder);
|
||||
@@ -605,7 +461,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
pub async fn read_blobs(
|
||||
&self,
|
||||
read: &VectoredRead,
|
||||
buf: BytesMut,
|
||||
buf: IoBufferMut,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<VectoredBlobsBuf, std::io::Error> {
|
||||
assert!(read.size() > 0);
|
||||
@@ -688,7 +544,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
/// `handle` gets called and when the current key would just exceed the read_size and
|
||||
/// max_cnt constraints.
|
||||
pub struct StreamingVectoredReadPlanner {
|
||||
read_builder: Option<VectoredReadBuilder>,
|
||||
read_builder: Option<ChunkedVectoredReadBuilder>,
|
||||
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
|
||||
prev: Option<(Key, Lsn, u64)>,
|
||||
/// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
|
||||
@@ -699,21 +555,21 @@ pub struct StreamingVectoredReadPlanner {
|
||||
/// Size of the current batch
|
||||
cnt: usize,
|
||||
|
||||
mode: VectoredReadCoalesceMode,
|
||||
align: usize,
|
||||
}
|
||||
|
||||
impl StreamingVectoredReadPlanner {
|
||||
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
|
||||
assert!(max_cnt > 0);
|
||||
assert!(max_read_size > 0);
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
Self {
|
||||
read_builder: None,
|
||||
prev: None,
|
||||
max_cnt,
|
||||
max_read_size,
|
||||
cnt: 0,
|
||||
mode,
|
||||
align,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -762,11 +618,11 @@ impl StreamingVectoredReadPlanner {
|
||||
}
|
||||
None => {
|
||||
self.read_builder = {
|
||||
Some(VectoredReadBuilder::new_streaming(
|
||||
Some(ChunkedVectoredReadBuilder::new_streaming(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.mode,
|
||||
self.align,
|
||||
))
|
||||
};
|
||||
}
|
||||
@@ -1090,9 +946,10 @@ mod tests {
|
||||
|
||||
// Multiply by two (compressed data might need more space), and add a few bytes for the header
|
||||
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
|
||||
let mut buf = BytesMut::with_capacity(reserved_bytes);
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let mut buf = IoBufferMut::with_capacity_aligned(reserved_bytes, align);
|
||||
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
let align = virtual_file::get_io_buffer_alignment();
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&file);
|
||||
let meta = BlobMeta {
|
||||
key: Key::MIN,
|
||||
@@ -1104,7 +961,8 @@ mod tests {
|
||||
if idx + 1 == offsets.len() {
|
||||
continue;
|
||||
}
|
||||
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
|
||||
let read_builder =
|
||||
ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align);
|
||||
let read = read_builder.build();
|
||||
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
|
||||
assert_eq!(result.blobs.len(), 1);
|
||||
|
||||
@@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::File;
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
#[cfg(target_os = "linux")]
|
||||
use std::os::unix::fs::OpenOptionsExt;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
|
||||
|
||||
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::time::Instant;
|
||||
|
||||
@@ -38,10 +40,11 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
|
||||
mod metadata;
|
||||
mod open_options;
|
||||
use self::owned_buffers_io::write::OwnedAsyncWriter;
|
||||
pub(crate) use api::DirectIoMode;
|
||||
pub(crate) use api::IoMode;
|
||||
pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use metadata::Metadata;
|
||||
pub(crate) use open_options::*;
|
||||
pub(crate) mod dio;
|
||||
|
||||
pub(crate) mod owned_buffers_io {
|
||||
//! Abstractions for IO with owned buffers.
|
||||
@@ -53,6 +56,7 @@ pub(crate) mod owned_buffers_io {
|
||||
//! but for the time being we're proving out the primitives in the neon.git repo
|
||||
//! for faster iteration.
|
||||
|
||||
pub(crate) mod io_buf_aligned;
|
||||
pub(crate) mod io_buf_ext;
|
||||
pub(crate) mod slice;
|
||||
pub(crate) mod write;
|
||||
@@ -61,6 +65,176 @@ pub(crate) mod owned_buffers_io {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum VirtualFile {
|
||||
Buffered(VirtualFileInner),
|
||||
Direct(VirtualFileInner),
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
fn inner(&self) -> &VirtualFileInner {
|
||||
match self {
|
||||
Self::Buffered(file) => file,
|
||||
Self::Direct(file) => file,
|
||||
}
|
||||
}
|
||||
|
||||
fn inner_mut(&mut self) -> &mut VirtualFileInner {
|
||||
match self {
|
||||
Self::Buffered(file) => file,
|
||||
Self::Direct(file) => file,
|
||||
}
|
||||
}
|
||||
|
||||
fn into_inner(self) -> VirtualFileInner {
|
||||
match self {
|
||||
Self::Buffered(file) => file,
|
||||
Self::Direct(file) => file,
|
||||
}
|
||||
}
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub async fn open<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let file = VirtualFileInner::open(path, ctx).await?;
|
||||
Ok(Self::Buffered(file))
|
||||
}
|
||||
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
///
|
||||
/// `O_DIRECT` will be enabled base on `virtual_file_io_mode`.
|
||||
pub async fn open_v2<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
|
||||
}
|
||||
|
||||
pub async fn create<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let file = VirtualFileInner::create(path, ctx).await?;
|
||||
Ok(Self::Buffered(file))
|
||||
}
|
||||
|
||||
pub async fn create_v2<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
VirtualFile::open_with_options_v2(
|
||||
path.as_ref(),
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn open_with_options<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
open_options: &OpenOptions,
|
||||
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
|
||||
Ok(Self::Buffered(file))
|
||||
}
|
||||
|
||||
pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
open_options: &mut OpenOptions, // Uses `&mut` here to add `O_DIRECT`.
|
||||
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let file = match get_io_mode() {
|
||||
IoMode::Buffered => {
|
||||
let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
|
||||
Self::Buffered(file)
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoMode::Direct => {
|
||||
let file = VirtualFileInner::open_with_options(
|
||||
path,
|
||||
open_options.custom_flags(nix::libc::O_DIRECT),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
Self::Direct(file)
|
||||
}
|
||||
};
|
||||
Ok(file)
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Utf8Path {
|
||||
self.inner().path.as_path()
|
||||
}
|
||||
|
||||
pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
|
||||
final_path: Utf8PathBuf,
|
||||
tmp_path: Utf8PathBuf,
|
||||
content: B,
|
||||
) -> std::io::Result<()> {
|
||||
VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await
|
||||
}
|
||||
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
self.inner().sync_all().await
|
||||
}
|
||||
|
||||
pub async fn sync_data(&self) -> Result<(), Error> {
|
||||
self.inner().sync_data().await
|
||||
}
|
||||
|
||||
pub async fn metadata(&self) -> Result<Metadata, Error> {
|
||||
self.inner().metadata().await
|
||||
}
|
||||
|
||||
pub fn remove(self) {
|
||||
self.into_inner().remove();
|
||||
}
|
||||
|
||||
pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
self.inner_mut().seek(pos).await
|
||||
}
|
||||
|
||||
pub async fn read_exact_at<Buf>(
|
||||
&self,
|
||||
slice: Slice<Buf>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Slice<Buf>, Error>
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
{
|
||||
self.inner().read_exact_at(slice, offset, ctx).await
|
||||
}
|
||||
|
||||
pub async fn read_exact_at_page(
|
||||
&self,
|
||||
page: PageWriteGuard<'static>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PageWriteGuard<'static>, Error> {
|
||||
self.inner().read_exact_at_page(page, offset, ctx).await
|
||||
}
|
||||
|
||||
pub async fn write_all_at<Buf: IoBuf + Send>(
|
||||
&self,
|
||||
buf: FullSlice<Buf>,
|
||||
offset: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, Result<(), Error>) {
|
||||
self.inner().write_all_at(buf, offset, ctx).await
|
||||
}
|
||||
|
||||
pub async fn write_all<Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, Result<usize, Error>) {
|
||||
self.inner_mut().write_all(buf, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
@@ -77,7 +251,7 @@ pub(crate) mod owned_buffers_io {
|
||||
/// 'tag' field is used to detect whether the handle still is valid or not.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct VirtualFile {
|
||||
pub struct VirtualFileInner {
|
||||
/// Lazy handle to the global file descriptor cache. The slot that this points to
|
||||
/// might contain our File, or it may be empty, or it may contain a File that
|
||||
/// belongs to a different VirtualFile.
|
||||
@@ -350,12 +524,12 @@ macro_rules! with_file {
|
||||
}};
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
impl VirtualFileInner {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub async fn open<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
) -> Result<VirtualFileInner, std::io::Error> {
|
||||
Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
|
||||
}
|
||||
|
||||
@@ -364,7 +538,7 @@ impl VirtualFile {
|
||||
pub async fn create<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
) -> Result<VirtualFileInner, std::io::Error> {
|
||||
Self::open_with_options(
|
||||
path.as_ref(),
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
@@ -382,7 +556,7 @@ impl VirtualFile {
|
||||
path: P,
|
||||
open_options: &OpenOptions,
|
||||
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
) -> Result<VirtualFileInner, std::io::Error> {
|
||||
let path_ref = path.as_ref();
|
||||
let path_str = path_ref.to_string();
|
||||
let parts = path_str.split('/').collect::<Vec<&str>>();
|
||||
@@ -413,7 +587,7 @@ impl VirtualFile {
|
||||
open_options.open(path_ref.as_std_path()).await?
|
||||
});
|
||||
|
||||
// Strip all options other than read and write.
|
||||
// Strip all options other than read and write (O_DIRECT).
|
||||
//
|
||||
// It would perhaps be nicer to check just for the read and write flags
|
||||
// explicitly, but OpenOptions doesn't contain any functions to read flags,
|
||||
@@ -423,7 +597,7 @@ impl VirtualFile {
|
||||
reopen_options.create_new(false);
|
||||
reopen_options.truncate(false);
|
||||
|
||||
let vfile = VirtualFile {
|
||||
let vfile = VirtualFileInner {
|
||||
handle: RwLock::new(handle),
|
||||
pos: 0,
|
||||
path: path_ref.to_path_buf(),
|
||||
@@ -466,6 +640,7 @@ impl VirtualFile {
|
||||
&[]
|
||||
};
|
||||
utils::crashsafe::overwrite(&final_path, &tmp_path, content)
|
||||
.maybe_fatal_err("crashsafe_overwrite")
|
||||
})
|
||||
.await
|
||||
.expect("blocking task is never aborted")
|
||||
@@ -475,7 +650,7 @@ impl VirtualFile {
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
|
||||
let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
|
||||
res
|
||||
res.maybe_fatal_err("sync_all")
|
||||
})
|
||||
}
|
||||
|
||||
@@ -483,7 +658,7 @@ impl VirtualFile {
|
||||
pub async fn sync_data(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
|
||||
let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
|
||||
res
|
||||
res.maybe_fatal_err("sync_data")
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1033,6 +1208,21 @@ impl tokio_epoll_uring::IoFd for FileGuard {
|
||||
|
||||
#[cfg(test)]
|
||||
impl VirtualFile {
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
|
||||
self.inner().read_blk(blknum, ctx).await
|
||||
}
|
||||
|
||||
async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
|
||||
self.inner_mut().read_to_end(buf, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl VirtualFileInner {
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
@@ -1066,7 +1256,7 @@ impl VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
impl Drop for VirtualFileInner {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut();
|
||||
@@ -1147,7 +1337,9 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize)
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
if set_io_buffer_alignment(io_buffer_alignment).is_err() {
|
||||
panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
|
||||
panic!(
|
||||
"IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}"
|
||||
);
|
||||
}
|
||||
io_engine::init(engine);
|
||||
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
|
||||
@@ -1174,14 +1366,16 @@ fn get_open_files() -> &'static OpenFiles {
|
||||
|
||||
static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
|
||||
|
||||
/// Returns true if `x` is zero or a power of two.
|
||||
fn is_zero_or_power_of_two(x: usize) -> bool {
|
||||
(x == 0) || ((x & (x - 1)) == 0)
|
||||
/// Returns true if the alignment is a power of two and is greater or equal to 512.
|
||||
fn is_valid_io_buffer_alignment(align: usize) -> bool {
|
||||
align.is_power_of_two() && align >= 512
|
||||
}
|
||||
|
||||
/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is
|
||||
/// not a power of two or less than 512 bytes.
|
||||
#[allow(unused)]
|
||||
pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
|
||||
if is_zero_or_power_of_two(align) {
|
||||
if is_valid_io_buffer_alignment(align) {
|
||||
IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
|
||||
Ok(())
|
||||
} else {
|
||||
@@ -1189,19 +1383,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
|
||||
/// Gets the io buffer alignment.
|
||||
///
|
||||
/// This function should be used to check the raw config value.
|
||||
pub(crate) fn get_io_buffer_alignment_raw() -> usize {
|
||||
/// This function should be used for getting the actual alignment value to use.
|
||||
pub(crate) fn get_io_buffer_alignment() -> usize {
|
||||
let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
if cfg!(test) {
|
||||
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
|
||||
if let Some(test_align) = utils::env::var(env_var_name) {
|
||||
if is_zero_or_power_of_two(test_align) {
|
||||
if is_valid_io_buffer_alignment(test_align) {
|
||||
test_align
|
||||
} else {
|
||||
panic!("IO buffer alignment ({test_align}) is not a power of two");
|
||||
panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}");
|
||||
}
|
||||
} else {
|
||||
align
|
||||
@@ -1211,14 +1405,15 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize {
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
|
||||
///
|
||||
/// This function should be used for getting the actual alignment value to use.
|
||||
pub(crate) fn get_io_buffer_alignment() -> usize {
|
||||
let align = get_io_buffer_alignment_raw();
|
||||
align.max(1)
|
||||
static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
|
||||
|
||||
pub(crate) fn set_io_mode(mode: IoMode) {
|
||||
IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub(crate) fn get_io_mode() -> IoMode {
|
||||
IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::context::DownloadBehavior;
|
||||
@@ -1527,7 +1722,7 @@ mod tests {
|
||||
// Open the file many times.
|
||||
let mut files = Vec::new();
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFile::open_with_options(
|
||||
let f = VirtualFileInner::open_with_options(
|
||||
&test_file_path,
|
||||
OpenOptions::new().read(true),
|
||||
&ctx,
|
||||
@@ -1579,7 +1774,7 @@ mod tests {
|
||||
let path = testdir.join("myfile");
|
||||
let tmp_path = testdir.join("myfile.tmp");
|
||||
|
||||
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
|
||||
VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
|
||||
@@ -1588,7 +1783,7 @@ mod tests {
|
||||
assert!(!tmp_path.exists());
|
||||
drop(file);
|
||||
|
||||
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
|
||||
VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
|
||||
@@ -1611,7 +1806,7 @@ mod tests {
|
||||
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
|
||||
assert!(tmp_path.exists());
|
||||
|
||||
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
|
||||
VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
410
pageserver/src/virtual_file/dio.rs
Normal file
410
pageserver/src/virtual_file/dio.rs
Normal file
@@ -0,0 +1,410 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use core::slice;
|
||||
use std::{
|
||||
alloc::{self, Layout},
|
||||
cmp,
|
||||
mem::{ManuallyDrop, MaybeUninit},
|
||||
ops::{Deref, DerefMut},
|
||||
ptr::{addr_of_mut, NonNull},
|
||||
};
|
||||
|
||||
use bytes::buf::UninitSlice;
|
||||
|
||||
struct IoBufferPtr(*mut u8);
|
||||
|
||||
// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer.
|
||||
unsafe impl Send for IoBufferPtr {}
|
||||
|
||||
/// An aligned buffer type used for I/O.
|
||||
pub struct IoBufferMut {
|
||||
ptr: IoBufferPtr,
|
||||
capacity: usize,
|
||||
len: usize,
|
||||
align: usize,
|
||||
}
|
||||
|
||||
impl IoBufferMut {
|
||||
/// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
|
||||
///
|
||||
/// The buffer will be able to hold at most `capacity` elements and will never resize.
|
||||
///
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
|
||||
/// * `align` must not be zero,
|
||||
///
|
||||
/// * `align` must be a power of two,
|
||||
///
|
||||
/// * `capacity`, when rounded up to the nearest multiple of `align`,
|
||||
/// must not overflow isize (i.e., the rounded value must be
|
||||
/// less than or equal to `isize::MAX`).
|
||||
pub fn with_capacity_aligned(capacity: usize, align: usize) -> Self {
|
||||
let layout = Layout::from_size_align(capacity, align).expect("Invalid layout");
|
||||
|
||||
// SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout.
|
||||
let ptr = unsafe {
|
||||
let ptr = alloc::alloc(layout);
|
||||
if ptr.is_null() {
|
||||
alloc::handle_alloc_error(layout);
|
||||
}
|
||||
IoBufferPtr(ptr)
|
||||
};
|
||||
|
||||
IoBufferMut {
|
||||
ptr,
|
||||
capacity,
|
||||
len: 0,
|
||||
align,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros.
|
||||
pub fn with_capacity_aligned_zeroed(capacity: usize, align: usize) -> Self {
|
||||
use bytes::BufMut;
|
||||
let mut buf = Self::with_capacity_aligned(capacity, align);
|
||||
buf.put_bytes(0, capacity);
|
||||
buf.len = capacity;
|
||||
buf
|
||||
}
|
||||
|
||||
/// Returns the total number of bytes the buffer can hold.
|
||||
#[inline]
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.capacity
|
||||
}
|
||||
|
||||
/// Returns the alignment of the buffer.
|
||||
#[inline]
|
||||
pub fn align(&self) -> usize {
|
||||
self.align
|
||||
}
|
||||
|
||||
/// Returns the number of bytes in the buffer, also referred to as its 'length'.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Force the length of the buffer to `new_len`.
|
||||
#[inline]
|
||||
unsafe fn set_len(&mut self, new_len: usize) {
|
||||
debug_assert!(new_len <= self.capacity());
|
||||
self.len = new_len;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn as_ptr(&self) -> *const u8 {
|
||||
self.ptr.0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn as_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.ptr.0
|
||||
}
|
||||
|
||||
/// Extracts a slice containing the entire buffer.
|
||||
///
|
||||
/// Equivalent to `&s[..]`.
|
||||
#[inline]
|
||||
fn as_slice(&self) -> &[u8] {
|
||||
// SAFETY: The pointer is valid and `len` bytes are initialized.
|
||||
unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
|
||||
}
|
||||
|
||||
/// Extracts a mutable slice of the entire buffer.
|
||||
///
|
||||
/// Equivalent to `&mut s[..]`.
|
||||
fn as_mut_slice(&mut self) -> &mut [u8] {
|
||||
// SAFETY: The pointer is valid and `len` bytes are initialized.
|
||||
unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
|
||||
}
|
||||
|
||||
/// Drops the all the contents of the buffer, setting its length to `0`.
|
||||
#[inline]
|
||||
pub fn clear(&mut self) {
|
||||
self.len = 0;
|
||||
}
|
||||
|
||||
/// Reserves capacity for at least `additional` more bytes to be inserted
|
||||
/// in the given `IoBufferMut`. The collection may reserve more space to
|
||||
/// speculatively avoid frequent reallocations. After calling `reserve`,
|
||||
/// capacity will be greater than or equal to `self.len() + additional`.
|
||||
/// Does nothing if capacity is already sufficient.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the new capacity exceeds `isize::MAX` _bytes_.
|
||||
pub fn reserve(&mut self, additional: usize) {
|
||||
if additional > self.capacity() - self.len() {
|
||||
self.reserve_inner(additional);
|
||||
}
|
||||
}
|
||||
|
||||
fn reserve_inner(&mut self, additional: usize) {
|
||||
let Some(required_cap) = self.len().checked_add(additional) else {
|
||||
capacity_overflow()
|
||||
};
|
||||
|
||||
let old_capacity = self.capacity();
|
||||
let align = self.align();
|
||||
// This guarantees exponential growth. The doubling cannot overflow
|
||||
// because `cap <= isize::MAX` and the type of `cap` is `usize`.
|
||||
let cap = cmp::max(old_capacity * 2, required_cap);
|
||||
|
||||
if !is_valid_alloc(cap) {
|
||||
capacity_overflow()
|
||||
}
|
||||
let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");
|
||||
|
||||
let old_ptr = self.as_mut_ptr();
|
||||
|
||||
// SAFETY: old allocation was allocated with std::alloc::alloc with the same layout,
|
||||
// and we panics on null pointer.
|
||||
let (ptr, cap) = unsafe {
|
||||
let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
|
||||
let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
|
||||
if ptr.is_null() {
|
||||
alloc::handle_alloc_error(new_layout);
|
||||
}
|
||||
(IoBufferPtr(ptr), cap)
|
||||
};
|
||||
|
||||
self.ptr = ptr;
|
||||
self.capacity = cap;
|
||||
}
|
||||
|
||||
|
||||
/// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
|
||||
pub fn leak<'a>(self) -> &'a mut [u8] {
|
||||
let mut buf = ManuallyDrop::new(self);
|
||||
// SAFETY: leaking the buffer as intended.
|
||||
unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
|
||||
}
|
||||
}
|
||||
|
||||
fn capacity_overflow() -> ! {
|
||||
panic!("capacity overflow")
|
||||
}
|
||||
|
||||
// We need to guarantee the following:
|
||||
// * We don't ever allocate `> isize::MAX` byte-size objects.
|
||||
// * We don't overflow `usize::MAX` and actually allocate too little.
|
||||
//
|
||||
// On 64-bit we just need to check for overflow since trying to allocate
|
||||
// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
|
||||
// an extra guard for this in case we're running on a platform which can use
|
||||
// all 4GB in user-space, e.g., PAE or x32.
|
||||
#[inline]
|
||||
fn is_valid_alloc(alloc_size: usize) -> bool {
|
||||
!(usize::BITS < 64 && alloc_size > isize::MAX as usize)
|
||||
}
|
||||
|
||||
impl Drop for IoBufferMut {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: memory was allocated with std::alloc::alloc with the same layout.
|
||||
unsafe {
|
||||
alloc::dealloc(
|
||||
self.as_mut_ptr(),
|
||||
Layout::from_size_align_unchecked(self.capacity, self.align),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for IoBufferMut {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for IoBufferMut {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.as_mut_slice()
|
||||
}
|
||||
}
|
||||
|
||||
/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized.
|
||||
unsafe impl bytes::BufMut for IoBufferMut {
|
||||
#[inline]
|
||||
fn remaining_mut(&self) -> usize {
|
||||
// Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`.
|
||||
// Thus, it can have at most `self.capacity` bytes.
|
||||
self.capacity() - self.len()
|
||||
}
|
||||
|
||||
// SAFETY: Caller needs to make sure the bytes being advanced past have been initialized.
|
||||
#[inline]
|
||||
unsafe fn advance_mut(&mut self, cnt: usize) {
|
||||
let len: usize = self.len();
|
||||
let remaining = self.remaining_mut();
|
||||
|
||||
if remaining < cnt {
|
||||
panic_advance(cnt, remaining);
|
||||
}
|
||||
|
||||
// Addition will not overflow since the sum is at most the capacity.
|
||||
self.set_len(len + cnt);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
|
||||
let cap = self.capacity();
|
||||
let len = self.len();
|
||||
|
||||
// SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be
|
||||
// valid for `cap - len` bytes. The subtraction will not underflow since
|
||||
// `len <= cap`.
|
||||
unsafe { UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) }
|
||||
}
|
||||
}
|
||||
|
||||
/// Panic with a nice error message.
|
||||
#[cold]
|
||||
fn panic_advance(idx: usize, len: usize) -> ! {
|
||||
panic!(
|
||||
"advance out of bounds: the len is {} but advancing by {}",
|
||||
len, idx
|
||||
);
|
||||
}
|
||||
|
||||
/// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer,
|
||||
/// and the location remains stable even if [`Self`] is moved.
|
||||
unsafe impl tokio_epoll_uring::IoBuf for IoBufferMut {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
self.as_ptr()
|
||||
}
|
||||
|
||||
fn bytes_init(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn bytes_total(&self) -> usize {
|
||||
self.capacity()
|
||||
}
|
||||
}
|
||||
|
||||
// SAFETY: See above.
|
||||
unsafe impl tokio_epoll_uring::IoBufMut for IoBufferMut {
|
||||
fn stable_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.as_mut_ptr()
|
||||
}
|
||||
|
||||
unsafe fn set_init(&mut self, init_len: usize) {
|
||||
if self.len() < init_len {
|
||||
self.set_len(init_len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_with_capacity_aligned() {
|
||||
const ALIGN: usize = 4 * 1024;
|
||||
let v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
|
||||
assert_eq!(v.len(), 0);
|
||||
assert_eq!(v.capacity(), ALIGN * 4);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
|
||||
let v = IoBufferMut::with_capacity_aligned(ALIGN / 2, ALIGN);
|
||||
assert_eq!(v.len(), 0);
|
||||
assert_eq!(v.capacity(), ALIGN / 2);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_capacity_aligned_zeroed() {
|
||||
const ALIGN: usize = 4 * 1024;
|
||||
let v = IoBufferMut::with_capacity_aligned_zeroed(ALIGN, ALIGN);
|
||||
assert_eq!(v.len(), ALIGN);
|
||||
assert_eq!(v.capacity(), ALIGN);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
assert_eq!(&v[..], &[0; ALIGN])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reserve() {
|
||||
use bytes::BufMut;
|
||||
const ALIGN: usize = 4 * 1024;
|
||||
let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
|
||||
let capacity = v.capacity();
|
||||
v.reserve(capacity);
|
||||
assert_eq!(v.capacity(), capacity);
|
||||
let data = [b'a'; ALIGN];
|
||||
v.put(&data[..]);
|
||||
v.reserve(capacity);
|
||||
assert!(v.capacity() >= capacity * 2);
|
||||
assert_eq!(&v[..], &data[..]);
|
||||
let capacity = v.capacity();
|
||||
v.clear();
|
||||
v.reserve(capacity);
|
||||
assert_eq!(capacity, v.capacity());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bytes_put() {
|
||||
use bytes::BufMut;
|
||||
const ALIGN: usize = 4 * 1024;
|
||||
let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
|
||||
let x = [b'a'; ALIGN];
|
||||
|
||||
for _ in 0..2 {
|
||||
for _ in 0..4 {
|
||||
v.put(&x[..]);
|
||||
}
|
||||
assert_eq!(v.len(), ALIGN * 4);
|
||||
assert_eq!(v.capacity(), ALIGN * 4);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
v.clear()
|
||||
}
|
||||
assert_eq!(v.len(), 0);
|
||||
assert_eq!(v.capacity(), ALIGN * 4);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_bytes_put_panic() {
|
||||
use bytes::BufMut;
|
||||
const ALIGN: usize = 4 * 1024;
|
||||
let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
|
||||
let x = [b'a'; ALIGN];
|
||||
for _ in 0..5 {
|
||||
v.put_slice(&x[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_io_buf_put_slice() {
|
||||
use tokio_epoll_uring::BoundedBufMut;
|
||||
const ALIGN: usize = 4 * 1024;
|
||||
let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
|
||||
let x = [b'a'; ALIGN];
|
||||
|
||||
for _ in 0..2 {
|
||||
v.put_slice(&x[..]);
|
||||
assert_eq!(v.len(), ALIGN);
|
||||
assert_eq!(v.capacity(), ALIGN);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
v.clear()
|
||||
}
|
||||
assert_eq!(v.len(), 0);
|
||||
assert_eq!(v.capacity(), ALIGN);
|
||||
assert_eq!(v.align(), ALIGN);
|
||||
assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
|
||||
use crate::virtual_file::dio::IoBufferMut;
|
||||
|
||||
pub(crate) trait IoBufAlignedMut: IoBufMut {}
|
||||
|
||||
impl IoBufAlignedMut for IoBufferMut {}
|
||||
@@ -1,5 +1,6 @@
|
||||
//! See [`FullSlice`].
|
||||
|
||||
use crate::virtual_file::dio::IoBufferMut;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::ops::{Deref, Range};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
@@ -76,3 +77,4 @@ macro_rules! impl_io_buf_ext {
|
||||
impl_io_buf_ext!(Bytes);
|
||||
impl_io_buf_ext!(BytesMut);
|
||||
impl_io_buf_ext!(Vec<u8>);
|
||||
impl_io_buf_ext!(IoBufferMut);
|
||||
|
||||
@@ -1473,11 +1473,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
|
||||
{
|
||||
NeonWALReadResult res;
|
||||
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
if (!sk->wp->config->syncSafekeepers)
|
||||
{
|
||||
Size rbytes;
|
||||
rbytes = WALReadFromBuffers(buf, startptr, count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
startptr += rbytes;
|
||||
count -= rbytes;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (count == 0)
|
||||
{
|
||||
res = NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert(count > 0);
|
||||
|
||||
/* Now read the remaining WAL from the WAL file */
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
}
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
{
|
||||
|
||||
@@ -24,6 +24,7 @@ bytes = { workspace = true, features = ["serde"] }
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
compute_api.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
dashmap.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
@@ -80,6 +80,14 @@ pub(crate) trait TestBackend: Send + Sync + 'static {
|
||||
fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
|
||||
fn dyn_clone(&self) -> Box<dyn TestBackend>;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl Clone for Box<dyn TestBackend> {
|
||||
fn clone(&self) -> Self {
|
||||
TestBackend::dyn_clone(&**self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Backend<'_, (), ()> {
|
||||
@@ -585,6 +593,14 @@ mod tests {
|
||||
))
|
||||
}
|
||||
|
||||
async fn get_endpoint_jwks(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: crate::EndpointId,
|
||||
) -> anyhow::Result<Vec<super::jwt::AuthRule>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
|
||||
@@ -12,7 +12,10 @@ use serde::{Deserialize, Deserializer};
|
||||
use signature::Verifier;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
|
||||
use crate::{
|
||||
context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId,
|
||||
RoleName,
|
||||
};
|
||||
|
||||
// TODO(conrad): make these configurable.
|
||||
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
|
||||
@@ -27,7 +30,6 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
|
||||
}
|
||||
|
||||
@@ -35,10 +37,11 @@ pub(crate) struct AuthRule {
|
||||
pub(crate) id: String,
|
||||
pub(crate) jwks_url: url::Url,
|
||||
pub(crate) audience: Option<String>,
|
||||
pub(crate) role_names: Vec<RoleNameInt>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct JwkCache {
|
||||
pub struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
|
||||
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
|
||||
@@ -54,18 +57,28 @@ pub(crate) struct JwkCacheEntry {
|
||||
}
|
||||
|
||||
impl JwkCacheEntry {
|
||||
fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
|
||||
self.key_sets.values().find_map(|key_set| {
|
||||
key_set
|
||||
.find_key(key_id)
|
||||
.map(|jwk| (jwk, key_set.audience.as_deref()))
|
||||
})
|
||||
fn find_jwk_and_audience(
|
||||
&self,
|
||||
key_id: &str,
|
||||
role_name: &RoleName,
|
||||
) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
|
||||
self.key_sets
|
||||
.values()
|
||||
// make sure our requested role has access to the key set
|
||||
.filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name))
|
||||
// try and find the requested key-id in the key set
|
||||
.find_map(|key_set| {
|
||||
key_set
|
||||
.find_key(key_id)
|
||||
.map(|jwk| (jwk, key_set.audience.as_deref()))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct KeySet {
|
||||
jwks: jose_jwk::JwkSet,
|
||||
audience: Option<String>,
|
||||
role_names: Vec<RoleNameInt>,
|
||||
}
|
||||
|
||||
impl KeySet {
|
||||
@@ -106,7 +119,6 @@ impl JwkCacheEntryLock {
|
||||
ctx: &RequestMonitoring,
|
||||
client: &reqwest::Client,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
auth_rules: &F,
|
||||
) -> anyhow::Result<Arc<JwkCacheEntry>> {
|
||||
// double check that no one beat us to updating the cache.
|
||||
@@ -119,11 +131,10 @@ impl JwkCacheEntryLock {
|
||||
}
|
||||
}
|
||||
|
||||
let rules = auth_rules
|
||||
.fetch_auth_rules(ctx, endpoint, role_name)
|
||||
.await?;
|
||||
let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?;
|
||||
let mut key_sets =
|
||||
ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
|
||||
|
||||
// TODO(conrad): run concurrently
|
||||
// TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
|
||||
for rule in rules {
|
||||
@@ -151,6 +162,7 @@ impl JwkCacheEntryLock {
|
||||
KeySet {
|
||||
jwks,
|
||||
audience: rule.audience,
|
||||
role_names: rule.role_names,
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -173,7 +185,6 @@ impl JwkCacheEntryLock {
|
||||
ctx: &RequestMonitoring,
|
||||
client: &reqwest::Client,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
|
||||
let now = Instant::now();
|
||||
@@ -183,9 +194,7 @@ impl JwkCacheEntryLock {
|
||||
let Some(cached) = guard else {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let permit = self.acquire_permit().await;
|
||||
return self
|
||||
.renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
|
||||
.await;
|
||||
return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
|
||||
};
|
||||
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
@@ -196,9 +205,7 @@ impl JwkCacheEntryLock {
|
||||
let permit = self.acquire_permit().await;
|
||||
|
||||
// it's been too long since we checked the keys. wait for them to update.
|
||||
return self
|
||||
.renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
|
||||
.await;
|
||||
return self.renew_jwks(permit, ctx, client, endpoint, fetch).await;
|
||||
}
|
||||
|
||||
// every 5 minutes we should spawn a job to eagerly update the token.
|
||||
@@ -212,7 +219,7 @@ impl JwkCacheEntryLock {
|
||||
let ctx = ctx.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = entry
|
||||
.renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
|
||||
.renew_jwks(permit, &ctx, &client, endpoint, &fetch)
|
||||
.await
|
||||
{
|
||||
tracing::warn!(error=?e, "could not fetch JWKs in background job");
|
||||
@@ -232,7 +239,7 @@ impl JwkCacheEntryLock {
|
||||
jwt: &str,
|
||||
client: &reqwest::Client,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
role_name: &RoleName,
|
||||
fetch: &F,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// JWT compact form is defined to be
|
||||
@@ -254,30 +261,26 @@ impl JwkCacheEntryLock {
|
||||
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
ensure!(header.typ == "JWT");
|
||||
ensure!(
|
||||
header.typ == "JWT",
|
||||
"Provided authentication token is not a valid JWT encoding"
|
||||
);
|
||||
let kid = header.key_id.context("missing key id")?;
|
||||
|
||||
let mut guard = self
|
||||
.get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
|
||||
.get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch)
|
||||
.await?;
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let (jwk, expected_audience) = loop {
|
||||
match guard.find_jwk_and_audience(kid) {
|
||||
match guard.find_jwk_and_audience(kid, role_name) {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
|
||||
let permit = self.acquire_permit().await;
|
||||
guard = self
|
||||
.renew_jwks(
|
||||
permit,
|
||||
ctx,
|
||||
client,
|
||||
endpoint.clone(),
|
||||
role_name.clone(),
|
||||
fetch,
|
||||
)
|
||||
.renew_jwks(permit, ctx, client, endpoint.clone(), fetch)
|
||||
.await?;
|
||||
}
|
||||
_ => {
|
||||
@@ -320,11 +323,14 @@ impl JwkCacheEntryLock {
|
||||
let now = SystemTime::now();
|
||||
|
||||
if let Some(exp) = payload.expiration {
|
||||
ensure!(now < exp + CLOCK_SKEW_LEEWAY);
|
||||
ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired");
|
||||
}
|
||||
|
||||
if let Some(nbf) = payload.not_before {
|
||||
ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
|
||||
ensure!(
|
||||
nbf < now + CLOCK_SKEW_LEEWAY,
|
||||
"JWT token is not yet ready to use"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -336,7 +342,7 @@ impl JwkCache {
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
role_name: &RoleName,
|
||||
fetch: &F,
|
||||
jwt: &str,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
@@ -572,7 +578,7 @@ mod tests {
|
||||
format!("{header}.{body}")
|
||||
}
|
||||
|
||||
fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
|
||||
fn new_ec_jwt(kid: String, key: &p256::SecretKey) -> String {
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
|
||||
let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
|
||||
@@ -660,11 +666,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
let (ec1, jwk3) = new_ec_jwk("3".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("4".into());
|
||||
|
||||
let jwt1 = new_rsa_jwt("1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("3".into(), ec1);
|
||||
let jwt4 = new_ec_jwt("4".into(), ec2);
|
||||
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
@@ -706,47 +707,98 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Fetch(SocketAddr);
|
||||
struct Fetch(SocketAddr, Vec<RoleNameInt>);
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: EndpointId,
|
||||
_role_name: RoleName,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
Ok(vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: self.1.clone(),
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: self.1.clone(),
|
||||
},
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
let role_name = RoleName::from("user");
|
||||
let role_name1 = RoleName::from("anonymous");
|
||||
let role_name2 = RoleName::from("authenticated");
|
||||
|
||||
let fetch = Fetch(
|
||||
addr,
|
||||
vec![
|
||||
RoleNameInt::from(&role_name1),
|
||||
RoleNameInt::from(&role_name2),
|
||||
],
|
||||
);
|
||||
|
||||
let endpoint = EndpointId::from("ep");
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
for token in [jwt1, jwt2, jwt3, jwt4] {
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&token,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
role_name.clone(),
|
||||
&Fetch(addr),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let jwt1 = new_rsa_jwt("1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("3".into(), &ec1);
|
||||
let jwt4 = new_ec_jwt("4".into(), &ec2);
|
||||
|
||||
// had the wrong kid, therefore will have the wrong ecdsa signature
|
||||
let bad_jwt = new_ec_jwt("3".into(), &ec2);
|
||||
// this role_name is not accepted
|
||||
let bad_role_name = RoleName::from("cloud_admin");
|
||||
|
||||
let err = jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&bad_jwt,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
&role_name1,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(err.to_string().contains("signature error"));
|
||||
|
||||
let err = jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&jwt1,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
&bad_role_name,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(err.to_string().contains("jwk not found"));
|
||||
|
||||
let tokens = [jwt1, jwt2, jwt3, jwt4];
|
||||
let role_names = [role_name1, role_name2];
|
||||
for role in &role_names {
|
||||
for token in &tokens {
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
token,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
role,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{collections::HashMap, net::SocketAddr};
|
||||
use std::net::SocketAddr;
|
||||
|
||||
use anyhow::Context;
|
||||
use arc_swap::ArcSwapOption;
|
||||
@@ -10,8 +10,8 @@ use crate::{
|
||||
NodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
|
||||
EndpointId, RoleName,
|
||||
intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag},
|
||||
EndpointId,
|
||||
};
|
||||
|
||||
use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
|
||||
@@ -48,26 +48,17 @@ impl LocalBackend {
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) struct StaticAuthRules;
|
||||
|
||||
pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct JwksRoleSettings {
|
||||
pub roles: HashMap<RoleName, EndpointJwksResponse>,
|
||||
pub project_id: ProjectIdInt,
|
||||
pub branch_id: BranchIdInt,
|
||||
}
|
||||
pub static JWKS_ROLE_MAP: ArcSwapOption<EndpointJwksResponse> = ArcSwapOption::const_empty();
|
||||
|
||||
impl FetchAuthRules for StaticAuthRules {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
let mappings = JWKS_ROLE_MAP.load();
|
||||
let role_mappings = mappings
|
||||
.as_deref()
|
||||
.and_then(|m| m.roles.get(&role_name))
|
||||
.context("JWKs settings for this role were not configured")?;
|
||||
let mut rules = vec![];
|
||||
for setting in &role_mappings.jwks {
|
||||
@@ -75,6 +66,7 @@ impl FetchAuthRules for StaticAuthRules {
|
||||
id: setting.id.clone(),
|
||||
jwks_url: setting.jwks_url.clone(),
|
||||
audience: setting.jwt_audience.clone(),
|
||||
role_names: setting.role_names.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -1,34 +1,35 @@
|
||||
use std::{
|
||||
net::SocketAddr,
|
||||
path::{Path, PathBuf},
|
||||
pin::pin,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration};
|
||||
|
||||
use anyhow::{bail, ensure};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use compute_api::spec::LocalProxySpec;
|
||||
use dashmap::DashMap;
|
||||
use futures::{future::Either, FutureExt};
|
||||
use futures::future::Either;
|
||||
use proxy::{
|
||||
auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
|
||||
auth::backend::local::{LocalBackend, JWKS_ROLE_MAP},
|
||||
cancellation::CancellationHandlerMain,
|
||||
config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
|
||||
console::{locks::ApiLocks, messages::JwksRoleMapping},
|
||||
console::{
|
||||
locks::ApiLocks,
|
||||
messages::{EndpointJwksResponse, JwksSettings},
|
||||
},
|
||||
http::health_server::AppMetrics,
|
||||
intern::RoleNameInt,
|
||||
metrics::{Metrics, ThreadPoolMetrics},
|
||||
rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
|
||||
scram::threadpool::ThreadPool,
|
||||
serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
RoleName,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
use clap::Parser;
|
||||
use tokio::{net::TcpListener, task::JoinSet};
|
||||
use tokio::{net::TcpListener, sync::Notify, task::JoinSet};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
|
||||
use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry};
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
@@ -72,9 +73,12 @@ struct LocalProxyCliArgs {
|
||||
/// Address of the postgres server
|
||||
#[clap(long, default_value = "127.0.0.1:5432")]
|
||||
compute: SocketAddr,
|
||||
/// File address of the local proxy config file
|
||||
/// Path of the local proxy config file
|
||||
#[clap(long, default_value = "./localproxy.json")]
|
||||
config_path: PathBuf,
|
||||
config_path: Utf8PathBuf,
|
||||
/// Path of the local proxy PID file
|
||||
#[clap(long, default_value = "./localproxy.pid")]
|
||||
pid_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||
@@ -126,6 +130,24 @@ async fn main() -> anyhow::Result<()> {
|
||||
let args = LocalProxyCliArgs::parse();
|
||||
let config = build_config(&args)?;
|
||||
|
||||
// before we bind to any ports, write the process ID to a file
|
||||
// so that compute-ctl can find our process later
|
||||
// in order to trigger the appropriate SIGHUP on config change.
|
||||
//
|
||||
// This also claims a "lock" that makes sure only one instance
|
||||
// of local-proxy runs at a time.
|
||||
let _process_guard = loop {
|
||||
match pid_file::claim_for_current_process(&args.pid_path) {
|
||||
Ok(guard) => break guard,
|
||||
Err(e) => {
|
||||
// compute-ctl might have tried to read the pid-file to let us
|
||||
// know about some config change. We should try again.
|
||||
error!(path=?args.pid_path, "could not claim PID file guard: {e:?}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
|
||||
let http_listener = TcpListener::bind(args.http).await?;
|
||||
let shutdown = CancellationToken::new();
|
||||
@@ -139,12 +161,30 @@ async fn main() -> anyhow::Result<()> {
|
||||
16,
|
||||
));
|
||||
|
||||
refresh_config(args.config_path.clone()).await;
|
||||
// write the process ID to a file so that compute-ctl can find our process later
|
||||
// in order to trigger the appropriate SIGHUP on config change.
|
||||
let pid = std::process::id();
|
||||
info!("process running in PID {pid}");
|
||||
std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?;
|
||||
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
|
||||
refresh_config(args.config_path.clone()).map(Ok)
|
||||
|
||||
let refresh_config_notify = Arc::new(Notify::new());
|
||||
maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), {
|
||||
let refresh_config_notify = Arc::clone(&refresh_config_notify);
|
||||
move || {
|
||||
refresh_config_notify.notify_one();
|
||||
}
|
||||
}));
|
||||
|
||||
// trigger the first config load **after** setting up the signal hook
|
||||
// to avoid the race condition where:
|
||||
// 1. No config file registered when local-proxy starts up
|
||||
// 2. The config file is written but the signal hook is not yet received
|
||||
// 3. local-proxy completes startup but has no config loaded, despite there being a registerd config.
|
||||
refresh_config_notify.notify_one();
|
||||
tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify));
|
||||
|
||||
maintenance_tasks.spawn(proxy::http::health_server::task_main(
|
||||
metrics_listener,
|
||||
AppMetrics {
|
||||
@@ -245,81 +285,84 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
|
||||
})))
|
||||
}
|
||||
|
||||
async fn refresh_config(path: PathBuf) {
|
||||
match refresh_config_inner(&path).await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
error!(error=?e, ?path, "could not read config file");
|
||||
async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
|
||||
loop {
|
||||
rx.notified().await;
|
||||
|
||||
match refresh_config_inner(&path).await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
error!(error=?e, ?path, "could not read config file");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
|
||||
async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let bytes = tokio::fs::read(&path).await?;
|
||||
let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
|
||||
let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
|
||||
|
||||
let mut settings = None;
|
||||
let mut jwks_set = vec![];
|
||||
|
||||
for mapping in data.roles.values_mut() {
|
||||
for jwks in &mut mapping.jwks {
|
||||
ensure!(
|
||||
jwks.jwks_url.has_authority()
|
||||
&& (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
|
||||
"Invalid JWKS url. Must be HTTP",
|
||||
);
|
||||
for jwks in data.jwks {
|
||||
let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
|
||||
|
||||
ensure!(
|
||||
jwks.jwks_url
|
||||
.host()
|
||||
.is_some_and(|h| h != url::Host::Domain("")),
|
||||
"Invalid JWKS url. No domain listed",
|
||||
);
|
||||
ensure!(
|
||||
jwks_url.has_authority()
|
||||
&& (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
|
||||
"Invalid JWKS url. Must be HTTP",
|
||||
);
|
||||
|
||||
// clear username, password and ports
|
||||
jwks.jwks_url.set_username("").expect(
|
||||
ensure!(
|
||||
jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
|
||||
"Invalid JWKS url. No domain listed",
|
||||
);
|
||||
|
||||
// clear username, password and ports
|
||||
jwks_url
|
||||
.set_username("")
|
||||
.expect("url can be a base and has a valid host and is not a file. should not error");
|
||||
jwks_url
|
||||
.set_password(None)
|
||||
.expect("url can be a base and has a valid host and is not a file. should not error");
|
||||
// local testing is hard if we need to have a specific restricted port
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks_url.set_port(None).expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
jwks.jwks_url.set_password(None).expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
// local testing is hard if we need to have a specific restricted port
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks.jwks_url.set_port(None).expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
}
|
||||
|
||||
// clear query params
|
||||
jwks.jwks_url.set_fragment(None);
|
||||
jwks.jwks_url.query_pairs_mut().clear().finish();
|
||||
|
||||
if jwks.jwks_url.scheme() != "https" {
|
||||
// local testing is hard if we need to set up https support.
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks.jwks_url
|
||||
.set_scheme("https")
|
||||
.expect("should not error to set the scheme to https if it was http");
|
||||
} else {
|
||||
warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
|
||||
}
|
||||
}
|
||||
|
||||
let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
|
||||
ensure!(
|
||||
*pr == jwks.project_id,
|
||||
"inconsistent project IDs configured"
|
||||
);
|
||||
ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
|
||||
}
|
||||
|
||||
// clear query params
|
||||
jwks_url.set_fragment(None);
|
||||
jwks_url.query_pairs_mut().clear().finish();
|
||||
|
||||
if jwks_url.scheme() != "https" {
|
||||
// local testing is hard if we need to set up https support.
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks_url
|
||||
.set_scheme("https")
|
||||
.expect("should not error to set the scheme to https if it was http");
|
||||
} else {
|
||||
warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
|
||||
}
|
||||
}
|
||||
|
||||
jwks_set.push(JwksSettings {
|
||||
id: jwks.id,
|
||||
jwks_url,
|
||||
provider_name: jwks.provider_name,
|
||||
jwt_audience: jwks.jwt_audience,
|
||||
role_names: jwks
|
||||
.role_names
|
||||
.into_iter()
|
||||
.map(RoleName::from)
|
||||
.map(|s| RoleNameInt::from(&s))
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
|
||||
if let Some((project_id, branch_id)) = settings {
|
||||
JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
|
||||
roles: data.roles,
|
||||
project_id,
|
||||
branch_id,
|
||||
})));
|
||||
}
|
||||
info!("successfully loaded new config");
|
||||
JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -133,9 +133,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
|
||||
Ok(())
|
||||
}));
|
||||
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {}));
|
||||
|
||||
// the signal task cant ever succeed.
|
||||
// the main task can error, or can succeed on cancellation.
|
||||
|
||||
@@ -461,10 +461,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(
|
||||
cancellation_token.clone(),
|
||||
|| async { Ok(()) },
|
||||
));
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {}));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(
|
||||
http_listener,
|
||||
AppMetrics {
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
use measured::FixedCardinalityLabel;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::{self, Display};
|
||||
|
||||
use crate::auth::IpPattern;
|
||||
|
||||
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
|
||||
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
|
||||
use crate::proxy::retry::CouldRetry;
|
||||
use crate::RoleName;
|
||||
|
||||
/// Generic error response with human-readable description.
|
||||
/// Note that we can't always present it to user as is.
|
||||
@@ -348,11 +346,6 @@ impl ColdStartInfo {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct JwksRoleMapping {
|
||||
pub roles: HashMap<RoleName, EndpointJwksResponse>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct EndpointJwksResponse {
|
||||
pub jwks: Vec<JwksSettings>,
|
||||
@@ -361,11 +354,10 @@ pub struct EndpointJwksResponse {
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct JwksSettings {
|
||||
pub id: String,
|
||||
pub project_id: ProjectIdInt,
|
||||
pub branch_id: BranchIdInt,
|
||||
pub jwks_url: url::Url,
|
||||
pub provider_name: String,
|
||||
pub jwt_audience: Option<String>,
|
||||
pub role_names: Vec<RoleNameInt>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -5,7 +5,10 @@ pub mod neon;
|
||||
use super::messages::{ConsoleError, MetricsAuxInfo};
|
||||
use crate::{
|
||||
auth::{
|
||||
backend::{ComputeCredentialKeys, ComputeUserInfo},
|
||||
backend::{
|
||||
jwt::{AuthRule, FetchAuthRules},
|
||||
ComputeCredentialKeys, ComputeUserInfo,
|
||||
},
|
||||
IpPattern,
|
||||
},
|
||||
cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru},
|
||||
@@ -16,7 +19,7 @@ use crate::{
|
||||
intern::ProjectIdInt,
|
||||
metrics::ApiLockMetrics,
|
||||
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
|
||||
scram, EndpointCacheKey,
|
||||
scram, EndpointCacheKey, EndpointId,
|
||||
};
|
||||
use dashmap::DashMap;
|
||||
use std::{hash::Hash, sync::Arc, time::Duration};
|
||||
@@ -334,6 +337,12 @@ pub(crate) trait Api {
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
|
||||
|
||||
async fn get_endpoint_jwks(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
) -> anyhow::Result<Vec<AuthRule>>;
|
||||
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
@@ -343,6 +352,7 @@ pub(crate) trait Api {
|
||||
}
|
||||
|
||||
#[non_exhaustive]
|
||||
#[derive(Clone)]
|
||||
pub enum ConsoleBackend {
|
||||
/// Current Cloud API (V2).
|
||||
Console(neon::Api),
|
||||
@@ -386,6 +396,20 @@ impl Api for ConsoleBackend {
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_endpoint_jwks(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
match self {
|
||||
Self::Console(api) => api.get_endpoint_jwks(ctx, endpoint).await,
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
Self::Postgres(api) => api.get_endpoint_jwks(ctx, endpoint).await,
|
||||
#[cfg(test)]
|
||||
Self::Test(_api) => Ok(vec![]),
|
||||
}
|
||||
}
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
@@ -552,3 +576,13 @@ impl WakeComputePermit {
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
impl FetchAuthRules for ConsoleBackend {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
self.get_endpoint_jwks(ctx, endpoint).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,9 @@ use super::{
|
||||
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
||||
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
|
||||
};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::{
|
||||
auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName,
|
||||
};
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||
use crate::{auth::IpPattern, cache::Cached};
|
||||
use crate::{
|
||||
@@ -118,6 +120,39 @@ impl Api {
|
||||
})
|
||||
}
|
||||
|
||||
async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result<Vec<AuthRule>> {
|
||||
let (client, connection) =
|
||||
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
|
||||
|
||||
let connection = tokio::spawn(connection);
|
||||
|
||||
let res = client.query(
|
||||
"select id, jwks_url, audience, role_names from neon_control_plane.endpoint_jwks where endpoint_id = $1",
|
||||
&[&endpoint.as_str()],
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut rows = vec![];
|
||||
for row in res {
|
||||
rows.push(AuthRule {
|
||||
id: row.get("id"),
|
||||
jwks_url: url::Url::parse(row.get("jwks_url"))?,
|
||||
audience: row.get("audience"),
|
||||
role_names: row
|
||||
.get::<_, Vec<String>>("role_names")
|
||||
.into_iter()
|
||||
.map(RoleName::from)
|
||||
.map(|s| RoleNameInt::from(&s))
|
||||
.collect(),
|
||||
});
|
||||
}
|
||||
|
||||
drop(client);
|
||||
connection.await??;
|
||||
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config
|
||||
@@ -185,6 +220,14 @@ impl super::Api for Api {
|
||||
))
|
||||
}
|
||||
|
||||
async fn get_endpoint_jwks(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
self.do_get_endpoint_jwks(endpoint).await
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
|
||||
@@ -7,27 +7,33 @@ use super::{
|
||||
NodeInfo,
|
||||
};
|
||||
use crate::{
|
||||
auth::backend::ComputeUserInfo,
|
||||
auth::backend::{jwt::AuthRule, ComputeUserInfo},
|
||||
compute,
|
||||
console::messages::{ColdStartInfo, Reason},
|
||||
console::messages::{ColdStartInfo, EndpointJwksResponse, Reason},
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
rate_limiter::WakeComputeRateLimiter,
|
||||
scram, EndpointCacheKey,
|
||||
scram, EndpointCacheKey, EndpointId,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
use ::http::{header::AUTHORIZATION, HeaderName};
|
||||
use anyhow::bail;
|
||||
use futures::TryFutureExt;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{debug, error, info, info_span, warn, Instrument};
|
||||
|
||||
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Api {
|
||||
endpoint: http::Endpoint,
|
||||
pub caches: &'static ApiCaches,
|
||||
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
jwt: String,
|
||||
// put in a shared ref so we don't copy secrets all over in memory
|
||||
jwt: Arc<str>,
|
||||
}
|
||||
|
||||
impl Api {
|
||||
@@ -38,7 +44,9 @@ impl Api {
|
||||
locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
) -> Self {
|
||||
let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
|
||||
let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN")
|
||||
.unwrap_or_default()
|
||||
.into();
|
||||
Self {
|
||||
endpoint,
|
||||
caches,
|
||||
@@ -71,9 +79,9 @@ impl Api {
|
||||
async {
|
||||
let request = self
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.get_path("proxy_get_role_secret")
|
||||
.header(X_REQUEST_ID, &request_id)
|
||||
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.query(&[
|
||||
("application_name", application_name.as_str()),
|
||||
@@ -125,6 +133,61 @@ impl Api {
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_get_endpoint_jwks(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
if !self
|
||||
.caches
|
||||
.endpoints_cache
|
||||
.is_valid(ctx, &endpoint.normalize())
|
||||
.await
|
||||
{
|
||||
bail!("endpoint not found");
|
||||
}
|
||||
let request_id = ctx.session_id().to_string();
|
||||
async {
|
||||
let request = self
|
||||
.endpoint
|
||||
.get_with_url(|url| {
|
||||
url.path_segments_mut()
|
||||
.push("endpoints")
|
||||
.push(endpoint.as_str())
|
||||
.push("jwks");
|
||||
})
|
||||
.header(X_REQUEST_ID, &request_id)
|
||||
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
.build()?;
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
|
||||
let body = parse_body::<EndpointJwksResponse>(response).await?;
|
||||
|
||||
let rules = body
|
||||
.jwks
|
||||
.into_iter()
|
||||
.map(|jwks| AuthRule {
|
||||
id: jwks.id,
|
||||
jwks_url: jwks.jwks_url,
|
||||
audience: jwks.jwt_audience,
|
||||
role_names: jwks.role_names,
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(rules)
|
||||
}
|
||||
.map_err(crate::error::log_error)
|
||||
.instrument(info_span!("http", id = request_id))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
@@ -135,7 +198,7 @@ impl Api {
|
||||
async {
|
||||
let mut request_builder = self
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.get_path("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", ctx.session_id())])
|
||||
@@ -262,6 +325,15 @@ impl super::Api for Api {
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_endpoint_jwks(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
self.do_get_endpoint_jwks(ctx, endpoint).await
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
|
||||
@@ -86,9 +86,17 @@ impl Endpoint {
|
||||
|
||||
/// Return a [builder](RequestBuilder) for a `GET` request,
|
||||
/// appending a single `path` segment to the base endpoint URL.
|
||||
pub(crate) fn get(&self, path: &str) -> RequestBuilder {
|
||||
pub(crate) fn get_path(&self, path: &str) -> RequestBuilder {
|
||||
self.get_with_url(|u| {
|
||||
u.path_segments_mut().push(path);
|
||||
})
|
||||
}
|
||||
|
||||
/// Return a [builder](RequestBuilder) for a `GET` request,
|
||||
/// accepting a closure to modify the url path segments for more complex paths queries.
|
||||
pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder {
|
||||
let mut url = self.endpoint.clone();
|
||||
url.path_segments_mut().push(path);
|
||||
f(&mut url);
|
||||
self.client.get(url.into_inner())
|
||||
}
|
||||
|
||||
@@ -144,7 +152,7 @@ mod tests {
|
||||
|
||||
// Validate that this pattern makes sense.
|
||||
let req = endpoint
|
||||
.get("frobnicate")
|
||||
.get_path("frobnicate")
|
||||
.query(&[
|
||||
("foo", Some("10")), // should be just `foo=10`
|
||||
("bar", None), // shouldn't be passed at all
|
||||
@@ -162,7 +170,7 @@ mod tests {
|
||||
let endpoint = Endpoint::new(url, Client::new());
|
||||
|
||||
let req = endpoint
|
||||
.get("frobnicate")
|
||||
.get_path("frobnicate")
|
||||
.query(&[("session_id", uuid::Uuid::nil())])
|
||||
.build()?;
|
||||
|
||||
|
||||
@@ -130,14 +130,14 @@ impl<Id: InternId> Default for StringInterner<Id> {
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub(crate) struct RoleNameTag;
|
||||
pub struct RoleNameTag;
|
||||
impl InternId for RoleNameTag {
|
||||
fn get_interner() -> &'static StringInterner<Self> {
|
||||
static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
|
||||
ROLE_NAMES.get_or_init(Default::default)
|
||||
}
|
||||
}
|
||||
pub(crate) type RoleNameInt = InternedString<RoleNameTag>;
|
||||
pub type RoleNameInt = InternedString<RoleNameTag>;
|
||||
impl From<&RoleName> for RoleNameInt {
|
||||
fn from(value: &RoleName) -> Self {
|
||||
RoleNameTag::get_interner().get_or_intern(value)
|
||||
|
||||
@@ -82,7 +82,7 @@
|
||||
impl_trait_overcaptures,
|
||||
)]
|
||||
|
||||
use std::{convert::Infallible, future::Future};
|
||||
use std::convert::Infallible;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use intern::{EndpointIdInt, EndpointIdTag, InternId};
|
||||
@@ -117,13 +117,12 @@ pub mod usage_metrics;
|
||||
pub mod waiters;
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
pub async fn handle_signals<F, Fut>(
|
||||
pub async fn handle_signals<F>(
|
||||
token: CancellationToken,
|
||||
mut refresh_config: F,
|
||||
) -> anyhow::Result<Infallible>
|
||||
where
|
||||
F: FnMut() -> Fut,
|
||||
Fut: Future<Output = anyhow::Result<()>>,
|
||||
F: FnMut(),
|
||||
{
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
|
||||
@@ -136,7 +135,7 @@ where
|
||||
// Hangup is commonly used for config reload.
|
||||
_ = hangup.recv() => {
|
||||
warn!("received SIGHUP");
|
||||
refresh_config().await?;
|
||||
refresh_config();
|
||||
}
|
||||
// Shut down the whole application.
|
||||
_ = interrupt.recv() => {
|
||||
|
||||
@@ -525,6 +525,10 @@ impl TestBackend for TestConnectMechanism {
|
||||
{
|
||||
unimplemented!("not used in tests")
|
||||
}
|
||||
|
||||
fn dyn_clone(&self) -> Box<dyn TestBackend> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||
|
||||
@@ -43,6 +43,13 @@ impl ThreadPool {
|
||||
pub fn new(n_workers: u8) -> Arc<Self> {
|
||||
// rayon would be nice here, but yielding in rayon does not work well afaict.
|
||||
|
||||
if n_workers == 0 {
|
||||
return Arc::new(Self {
|
||||
runtime: None,
|
||||
metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
|
||||
});
|
||||
}
|
||||
|
||||
Arc::new_cyclic(|pool| {
|
||||
let pool = pool.clone();
|
||||
let worker_id = AtomicUsize::new(0);
|
||||
|
||||
@@ -119,7 +119,7 @@ impl PoolingBackend {
|
||||
.check_jwt(
|
||||
ctx,
|
||||
user_info.endpoint.clone(),
|
||||
user_info.user.clone(),
|
||||
&user_info.user,
|
||||
&StaticAuthRules,
|
||||
jwt,
|
||||
)
|
||||
|
||||
@@ -374,14 +374,16 @@ type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
|
||||
|
||||
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// fsync the datadir to make sure we have a consistent state on disk.
|
||||
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
|
||||
let started = Instant::now();
|
||||
utils::crashsafe::syncfs(dfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"syncfs data directory done"
|
||||
);
|
||||
if !conf.no_sync {
|
||||
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
|
||||
let started = Instant::now();
|
||||
utils::crashsafe::syncfs(dfd)?;
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"syncfs data directory done"
|
||||
);
|
||||
}
|
||||
|
||||
info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
|
||||
@@ -9,7 +9,7 @@ use crate::walproposer_sim::{
|
||||
|
||||
pub mod walproposer_sim;
|
||||
|
||||
// Generates 2000 random seeds and runs a schedule for each of them.
|
||||
// Generates 500 random seeds and runs a schedule for each of them.
|
||||
// If you see this test fail, please report the last seed to the
|
||||
// @safekeeper team.
|
||||
#[test]
|
||||
@@ -17,7 +17,7 @@ fn test_random_schedules() -> anyhow::Result<()> {
|
||||
let clock = init_logger();
|
||||
let mut config = TestConfig::new(Some(clock));
|
||||
|
||||
for _ in 0..2000 {
|
||||
for _ in 0..500 {
|
||||
let seed: u64 = rand::thread_rng().gen();
|
||||
config.network = generate_network_opts(seed);
|
||||
|
||||
|
||||
@@ -401,6 +401,7 @@ class NeonEnvBuilder:
|
||||
safekeeper_extra_opts: Optional[list[str]] = None,
|
||||
storage_controller_port_override: Optional[int] = None,
|
||||
pageserver_io_buffer_alignment: Optional[int] = None,
|
||||
pageserver_virtual_file_io_mode: Optional[str] = None,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -455,6 +456,7 @@ class NeonEnvBuilder:
|
||||
self.storage_controller_port_override = storage_controller_port_override
|
||||
|
||||
self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment
|
||||
self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
@@ -1028,6 +1030,7 @@ class NeonEnv:
|
||||
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
|
||||
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
|
||||
self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment
|
||||
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
|
||||
|
||||
# Create the neon_local's `NeonLocalInitConf`
|
||||
cfg: Dict[str, Any] = {
|
||||
@@ -1091,7 +1094,10 @@ class NeonEnv:
|
||||
for key, value in override.items():
|
||||
ps_cfg[key] = value
|
||||
|
||||
ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
|
||||
if self.pageserver_io_buffer_alignment is not None:
|
||||
ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
|
||||
if self.pageserver_virtual_file_io_mode is not None:
|
||||
ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode
|
||||
|
||||
# Create a corresponding NeonPageserver object
|
||||
self.pageservers.append(
|
||||
@@ -1330,6 +1336,7 @@ def neon_simple_env(
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
|
||||
pageserver_io_buffer_alignment: Optional[int],
|
||||
pageserver_virtual_file_io_mode: Optional[str],
|
||||
) -> Iterator[NeonEnv]:
|
||||
"""
|
||||
Simple Neon environment, with no authentication and no safekeepers.
|
||||
@@ -1356,6 +1363,7 @@ def neon_simple_env(
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
|
||||
pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
|
||||
pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1380,6 +1388,7 @@ def neon_env_builder(
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
record_property: Callable[[str, object], None],
|
||||
pageserver_io_buffer_alignment: Optional[int],
|
||||
pageserver_virtual_file_io_mode: Optional[str],
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1415,6 +1424,7 @@ def neon_env_builder(
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
|
||||
pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
|
||||
pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
|
||||
) as builder:
|
||||
yield builder
|
||||
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
|
||||
@@ -3311,7 +3321,7 @@ class VanillaPostgres(PgProtocol):
|
||||
self.pg_bin = pg_bin
|
||||
self.running = False
|
||||
if init:
|
||||
self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
|
||||
self.pg_bin.run_capture(["initdb", "--pgdata", str(pgdatadir)])
|
||||
self.configure([f"port = {port}\n"])
|
||||
|
||||
def enable_tls(self):
|
||||
|
||||
@@ -39,6 +39,11 @@ def pageserver_io_buffer_alignment() -> Optional[int]:
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def pageserver_virtual_file_io_mode() -> Optional[str]:
|
||||
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
|
||||
return None
|
||||
|
||||
@@ -8,6 +8,7 @@ import requests
|
||||
from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@@ -161,6 +162,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
walreceivers=walreceivers,
|
||||
)
|
||||
|
||||
# Get timeline_start_lsn, waiting until it's nonzero. It is a way to ensure
|
||||
# that the timeline is fully initialized at the safekeeper.
|
||||
def get_non_zero_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
def timeline_start_lsn_non_zero() -> Lsn:
|
||||
s = self.timeline_status(tenant_id, timeline_id).timeline_start_lsn
|
||||
assert s > Lsn(0)
|
||||
return s
|
||||
|
||||
return wait_until(30, 1, timeline_start_lsn_non_zero)
|
||||
|
||||
def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
|
||||
return self.timeline_status(tenant_id, timeline_id).commit_lsn
|
||||
|
||||
|
||||
@@ -56,32 +56,20 @@ class Workload:
|
||||
with ENDPOINT_LOCK:
|
||||
self._endpoint.reconfigure()
|
||||
|
||||
def go_readonly(self):
|
||||
self.stop()
|
||||
self._endpoint = self.make_endpoint(readonly=True, pageserver_id=None)
|
||||
self._endpoint.start(pageserver_id=None)
|
||||
|
||||
def make_endpoint(self, readonly: bool, pageserver_id: Optional[int] = None) -> Endpoint:
|
||||
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
|
||||
# We may be running alongside other Workloads for different tenants. Full TTID is
|
||||
# obnoxiously long for use here, but a cut-down version is still unique enough for tests.
|
||||
endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"
|
||||
|
||||
if readonly:
|
||||
self._endpoint_opts["hot_standby"] = True
|
||||
|
||||
return self.env.endpoints.create(
|
||||
self.branch_name,
|
||||
tenant_id=self.tenant_id,
|
||||
pageserver_id=pageserver_id,
|
||||
endpoint_id=endpoint_id,
|
||||
**self._endpoint_opts,
|
||||
)
|
||||
|
||||
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
|
||||
with ENDPOINT_LOCK:
|
||||
if self._endpoint is None:
|
||||
self._endpoint = self.make_endpoint(pageserver_id=pageserver_id, readonly=False)
|
||||
|
||||
self._endpoint = self.env.endpoints.create(
|
||||
self.branch_name,
|
||||
tenant_id=self.tenant_id,
|
||||
pageserver_id=pageserver_id,
|
||||
endpoint_id=endpoint_id,
|
||||
**self._endpoint_opts,
|
||||
)
|
||||
self._endpoint.start(pageserver_id=pageserver_id)
|
||||
else:
|
||||
self._endpoint.reconfigure(pageserver_id=pageserver_id)
|
||||
|
||||
@@ -11,7 +11,6 @@ from fixtures.neon_fixtures import (
|
||||
generate_uploads_and_deletions,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.workload import Workload
|
||||
|
||||
@@ -413,32 +412,3 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool
|
||||
f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
|
||||
)
|
||||
assert res[0][0] == 1
|
||||
|
||||
|
||||
def test_image_layer_reads(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(256)
|
||||
workload.validate()
|
||||
|
||||
# wait_for_wal_insert_lsn(env, workload._endpoint, tenant_id, timeline_id)
|
||||
|
||||
workload.go_readonly()
|
||||
|
||||
commit_lsn = env.safekeepers[0].http_client().get_commit_lsn(tenant_id, timeline_id)
|
||||
wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, commit_lsn)
|
||||
log.info(f"Ingested up to commit_lsn {commit_lsn}")
|
||||
|
||||
env.pageserver.http_client().timeline_compact(
|
||||
tenant_id, timeline_id, force_image_layer_creation=True
|
||||
)
|
||||
|
||||
# This should send getpage requests at the same LSN where we just created image layers
|
||||
workload.validate()
|
||||
|
||||
# Nothing should have written in the meantime
|
||||
assert commit_lsn == env.safekeepers[0].http_client().get_commit_lsn(tenant_id, timeline_id)
|
||||
|
||||
@@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
timeline_delete_wait_completed,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion, skip_on_postgres
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
|
||||
from fixtures.workload import Workload
|
||||
|
||||
@@ -156,9 +156,6 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
|
||||
@check_ondisk_data_compatibility_if_enabled
|
||||
@pytest.mark.xdist_group("compatibility")
|
||||
@pytest.mark.order(after="test_create_snapshot")
|
||||
@skip_on_postgres(
|
||||
PgVersion.V17, "There are no snapshots yet"
|
||||
) # TODO: revert this once we have snapshots
|
||||
def test_backward_compatibility(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
@@ -206,9 +203,6 @@ def test_backward_compatibility(
|
||||
@check_ondisk_data_compatibility_if_enabled
|
||||
@pytest.mark.xdist_group("compatibility")
|
||||
@pytest.mark.order(after="test_create_snapshot")
|
||||
@skip_on_postgres(
|
||||
PgVersion.V17, "There are no snapshots yet"
|
||||
) # TODO: revert this once we have snapshots
|
||||
def test_forward_compatibility(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
|
||||
@@ -27,7 +27,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*basebackup .* failed: invalid basebackup lsn.*",
|
||||
".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected",
|
||||
".*/lsn_lease.*invalid lsn lease request.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -108,7 +108,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
|
||||
assert cur.fetchone() == (1,)
|
||||
|
||||
# Create node at pre-initdb lsn
|
||||
with pytest.raises(Exception, match="invalid basebackup lsn"):
|
||||
with pytest.raises(Exception, match="invalid lsn lease request"):
|
||||
# compute node startup with invalid LSN should fail
|
||||
env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
@@ -167,6 +167,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
return last_flush_lsn
|
||||
|
||||
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
|
||||
"""
|
||||
Trigger GC manually on all pageservers. Then run an `SELECT` query.
|
||||
"""
|
||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||
client = ps.http_client()
|
||||
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
||||
log.info(f"{gc_result=}")
|
||||
|
||||
assert (
|
||||
gc_result["layers_removed"] == 0
|
||||
), "No layers should be removed, old layers are guarded by leases."
|
||||
|
||||
with ep_static.cursor() as cur:
|
||||
cur.execute("SELECT count(*) FROM t0")
|
||||
assert cur.fetchone() == (ROW_COUNT,)
|
||||
|
||||
# Insert some records on main branch
|
||||
with env.endpoints.create_start("main") as ep_main:
|
||||
with ep_main.cursor() as cur:
|
||||
@@ -193,25 +210,31 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
generate_updates_on_main(env, ep_main, i, end=100)
|
||||
|
||||
# Trigger GC
|
||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||
client = ps.http_client()
|
||||
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
||||
log.info(f"{gc_result=}")
|
||||
trigger_gc_and_select(env, ep_static)
|
||||
|
||||
assert (
|
||||
gc_result["layers_removed"] == 0
|
||||
), "No layers should be removed, old layers are guarded by leases."
|
||||
# Trigger Pageserver restarts
|
||||
for ps in env.pageservers:
|
||||
ps.stop()
|
||||
# Static compute should have at least one lease request failure due to connection.
|
||||
time.sleep(LSN_LEASE_LENGTH / 2)
|
||||
ps.start()
|
||||
|
||||
with ep_static.cursor() as cur:
|
||||
cur.execute("SELECT count(*) FROM t0")
|
||||
assert cur.fetchone() == (ROW_COUNT,)
|
||||
trigger_gc_and_select(env, ep_static)
|
||||
|
||||
# Reconfigure pageservers
|
||||
env.pageservers[0].stop()
|
||||
env.storage_controller.node_configure(
|
||||
env.pageservers[0].id, {"availability": "Offline"}
|
||||
)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
trigger_gc_and_select(env, ep_static)
|
||||
|
||||
# Do some update so we can increment latest_gc_cutoff
|
||||
generate_updates_on_main(env, ep_main, i, end=100)
|
||||
|
||||
# Wait for the existing lease to expire.
|
||||
time.sleep(LSN_LEASE_LENGTH)
|
||||
time.sleep(LSN_LEASE_LENGTH + 1)
|
||||
# Now trigger GC again, layers should be removed.
|
||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||
client = ps.http_client()
|
||||
|
||||
@@ -45,10 +45,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
|
||||
tenant_after = http.tenant_status(env.initial_tenant)
|
||||
assert tenant_before != tenant_after
|
||||
gc_blocking = tenant_after["gc_blocking"]
|
||||
assert (
|
||||
gc_blocking
|
||||
== "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }"
|
||||
)
|
||||
assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
|
||||
|
||||
wait_for_another_gc_round()
|
||||
pss.assert_log_contains(gc_skipped_line)
|
||||
|
||||
@@ -772,7 +772,7 @@ class ProposerPostgres(PgProtocol):
|
||||
def initdb(self):
|
||||
"""Run initdb"""
|
||||
|
||||
args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()]
|
||||
args = ["initdb", "--username", "cloud_admin", "--pgdata", self.pg_data_dir_path()]
|
||||
self.pg_bin.run(args)
|
||||
|
||||
def start(self):
|
||||
@@ -2084,8 +2084,13 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
|
||||
|
||||
endpoint.safe_psql("create table t(key int, value text)")
|
||||
|
||||
timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id)
|
||||
timeline_start_lsn = timeline_status.timeline_start_lsn
|
||||
# Note: currently timelines on sks are created by compute and commit of
|
||||
# transaction above is finished when 2/3 sks received it, so there is a
|
||||
# small chance that timeline on this sk is not created/initialized yet,
|
||||
# hence the usage of waiting function to prevent flakiness.
|
||||
timeline_start_lsn = (
|
||||
env.safekeepers[0].http_client().get_non_zero_timeline_start_lsn(tenant_id, timeline_id)
|
||||
)
|
||||
log.info(f"Timeline start LSN: {timeline_start_lsn}")
|
||||
|
||||
current_percent = 0.0
|
||||
|
||||
Reference in New Issue
Block a user