diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index a52e43b4da..d60f97320b 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -3,19 +3,23 @@ name: Prepare benchmarking databases by restoring dumps on: workflow_call: # no inputs needed - + defaults: run: shell: bash -euxo pipefail {0} jobs: setup-databases: + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] database: [ clickbench, tpch, userexample ] - + env: LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib PLATFORM: ${{ matrix.platform }} @@ -23,7 +27,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -32,13 +39,13 @@ jobs: run: | case "${PLATFORM}" in neon) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; aws-rds-postgres) - CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; aws-aurora-serverless-v2-postgres) - CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}" @@ -46,10 +53,17 @@ jobs: ;; esac - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -57,23 +71,23 @@ jobs: path: /tmp/neon/ prefix: latest - # we create a table that has one row for each database that we want to restore with the status whether the restore is done + # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist env: BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} DATABASE_NAME: ${{ matrix.database }} - # to avoid a race condition of multiple jobs trying to create the table at the same time, + # to avoid a race condition of multiple jobs trying to create the table at the same time, # we use an advisory lock run: | ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " - SELECT pg_advisory_lock(4711); + SELECT pg_advisory_lock(4711); CREATE TABLE IF NOT EXISTS benchmark_restore_status ( databasename text primary key, restore_done boolean ); SELECT pg_advisory_unlock(4711); " - + - name: Check if restore is already done id: check-restore-done env: @@ -107,7 +121,7 @@ jobs: DATABASE_NAME: ${{ matrix.database }} run: | mkdir -p /tmp/dumps - aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ + aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ - name: Replace database name in connection string if: steps.check-restore-done.outputs.skip != 'true' @@ -126,17 +140,17 @@ jobs: else new_connstr="${base_connstr}/${DATABASE_NAME}" fi - echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT + echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT - name: Restore dump if: steps.check-restore-done.outputs.skip != 'true' env: DATABASE_NAME: ${{ matrix.database }} DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} - # the following works 
only with larger computes: + # the following works only with larger computes: # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" # we add the || true because: - # the dumps were created with Neon and contain neon extensions that are not + # the dumps were created with Neon and contain neon extensions that are not # available in RDS, so we will always report an error, but we can ignore it run: | ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5ea911eb95..5fc6aa247a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -236,9 +236,7 @@ jobs: # run pageserver tests with different settings for io_engine in std-fs tokio-epoll-uring ; do - for io_buffer_alignment in 0 1 512 ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' done # Run separate tests for real S3 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a4a597acde..32806b89ab 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -12,7 +12,6 @@ on: # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 3 * * *' # run once a day, timezone is utc - workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -59,7 +58,7 @@ jobs: permissions: contents: write statuses: write - id-token: write # Required for OIDC authentication in azure runners + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: 
false matrix: @@ -68,12 +67,10 @@ jobs: PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] - IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -86,7 +83,10 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: ${{ matrix.IMAGE }} + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -164,6 +164,10 @@ jobs: replication-tests: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 16 @@ -174,12 +178,21 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download @@ -267,7 +280,7 @@ jobs: region_id_default=${{ env.DEFAULT_REGION_ID }} runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' - 
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" + image_default="neondatabase/build-tools:pinned" matrix='{ "pg_version" : [ 16 @@ -344,7 +357,7 @@ jobs: permissions: contents: write statuses: write - id-token: write # Required for OIDC authentication in azure runners + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false @@ -371,7 +384,7 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Configure AWS credentials # necessary on Azure runners + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -492,17 +505,15 @@ jobs: permissions: contents: write statuses: write - id-token: write # Required for OIDC authentication in azure runners + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] - IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" @@ -511,13 +522,16 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} runs-on: ${{ matrix.RUNNER }} container: - image: ${{ matrix.IMAGE }} + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -527,17 +541,26 @@ jobs: # instead of using Neon artifacts containing pgbench - name: Install postgresql-16 where pytest expects it run: | + # Just to make it easier to test things locally on macOS (with arm64) + arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 
's/aarch64/arm64/g') + cd /home/nonroot - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb - dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb" + dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg + mkdir -p /tmp/neon/pg_install/v16/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib + + LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" + export LD_LIBRARY_PATH + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} + /tmp/neon/pg_install/v16/bin/pgbench --version /tmp/neon/pg_install/v16/bin/psql --version @@ -559,7 +582,7 @@ jobs: echo "connstr=${CONNSTR}" >> 
$GITHUB_OUTPUT - - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -620,6 +643,10 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: @@ -638,12 +665,22 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -714,6 +751,10 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: @@ -731,12 +772,22 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + 
credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -806,6 +857,10 @@ jobs: user-examples-compare: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: @@ -822,12 +877,22 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 81a9fd99ae..a759efb56c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -341,7 +341,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - SYNC_AFTER_EACH_TEST: true + SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling 
here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -773,7 +773,7 @@ jobs: matrix: version: [ v14, v15, v16, v17 ] env: - VM_BUILDER_VERSION: v0.29.3 + VM_BUILDER_VERSION: v0.35.0 steps: - uses: actions/checkout@v4 @@ -1190,10 +1190,9 @@ jobs: files_to_promote+=("s3://${BUCKET}/${s3_key}") - # TODO Add v17 - for pg_version in v14 v15 v16; do + for pg_version in v14 v15 v16 v17; do # We run less tests for debug builds, so we don't need to promote them - if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then + if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then continue fi diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index cad9764532..5c5423e252 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -102,12 +102,17 @@ jobs: # Default set of platforms to run e2e tests on platforms='["docker", "k8s"]' - # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms. # If the workflow run is not a pull request, add k8s-neonvm to the list. if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do case "$f" in - vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) + # List of directories that contain code which affect compute images. + # + # This isn't exhaustive, just the paths that are most directly compute-related. + # For example, compute_ctl also depends on libs/utils, but we don't trigger + # an e2e run on that. 
+ vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') ;; *) diff --git a/Cargo.lock b/Cargo.lock index d0702e09d4..cde9aa7a77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -90,9 +90,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.0" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" [[package]] name = "anstyle-parse" @@ -269,9 +269,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.3.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" +checksum = "2ac9889352d632214df943e26740c46a0f3da6e329fbd28164fe7ae1b061da7b" dependencies = [ "aws-credential-types", "aws-runtime", @@ -300,9 +300,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" +checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -312,15 +312,16 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.2.1" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" +checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", "aws-smithy-eventstream", "aws-smithy-http", + "aws-smithy-runtime", "aws-smithy-runtime-api", 
"aws-smithy-types", "aws-types", @@ -328,6 +329,7 @@ dependencies = [ "fastrand 2.0.0", "http 0.2.9", "http-body 0.4.5", + "once_cell", "percent-encoding", "pin-project-lite", "tracing", @@ -336,9 +338,9 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.17.0" +version = "1.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b" +checksum = "053df3024ea2ed0431359b3cddecc92dcfadeaedf71dd497292b39e37e597b46" dependencies = [ "aws-credential-types", "aws-runtime", @@ -359,9 +361,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.26.0" +version = "1.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" +checksum = "f571deb0a80c20d21d9f3e8418c1712af9ff4bf399d057e5549a934eca4844e2" dependencies = [ "ahash", "aws-credential-types", @@ -394,9 +396,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.22.0" +version = "1.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" +checksum = "ebb97e44983752cf7e12968c5f569a5d7562dbbc67006755c331d9d9c99580ae" dependencies = [ "aws-credential-types", "aws-runtime", @@ -416,9 +418,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.22.0" +version = "1.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" +checksum = "ad061d977235898e4a97ecbd5d882786cca41b4828943584dc792dcc35eb3d3c" dependencies = [ "aws-credential-types", "aws-runtime", @@ -438,9 +440,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.22.0" +version = "1.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" 
+checksum = "300ce43d1f7f4eb023e57d38b0921d964e8e62bed7f82f6b7849e7eab7a14575" dependencies = [ "aws-credential-types", "aws-runtime", @@ -461,9 +463,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.1" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" +checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -501,9 +503,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.7" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" +checksum = "598b1689d001c4d4dc3cb386adb07d37786783aee3ac4b324bcadac116bf3d23" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -522,9 +524,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.4" +version = "0.60.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" dependencies = [ "aws-smithy-types", "bytes", @@ -533,9 +535,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.8" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -573,9 +575,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.5.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" +checksum = 
"d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -587,6 +589,7 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "http-body 1.0.0", + "httparse", "hyper 0.14.30", "hyper-rustls 0.24.0", "once_cell", @@ -599,9 +602,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.6.0" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" +checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -616,9 +619,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.9" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" +checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b" dependencies = [ "base64-simd", "bytes", @@ -642,24 +645,23 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.8" +version = "0.60.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" +checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.2.0" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "http 0.2.9", "rustc_version", "tracing", ] @@ -671,7 +673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core", + "axum-core 0.3.4", "base64 0.21.1", "bitflags 1.3.2", "bytes", @@ -691,7 +693,7 @@ dependencies = [ "serde_path_to_error", "serde_urlencoded", "sha1", - "sync_wrapper", + "sync_wrapper 0.1.2", "tokio", "tokio-tungstenite", "tower", @@ -699,6 +701,33 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "itoa", + "matchit 0.7.0", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper 1.0.1", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "axum-core" version = "0.3.4" @@ -716,6 +745,26 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 1.0.1", + "tower-layer", + "tower-service", +] + [[package]] name = "azure_core" version = "0.19.0" @@ -918,7 +967,7 @@ dependencies = [ "clang-sys", "itertools 0.12.1", "log", - "prettyplease 0.2.17", + "prettyplease", "proc-macro2", "quote", "regex", @@ -1223,6 +1272,7 @@ dependencies = [ "notify", "num_cpus", "opentelemetry", + "opentelemetry_sdk", "postgres", "regex", "remote_storage", @@ -1321,6 +1371,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "futures", "humantime", "humantime-serde", "hyper 0.14.30", @@ -1875,9 +1926,9 @@ dependencies = [ 
[[package]] name = "env_logger" -version = "0.10.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" dependencies = [ "humantime", "is-terminal", @@ -2035,7 +2086,7 @@ dependencies = [ "futures-core", "futures-sink", "http-body-util", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-util", "pin-project", "rand 0.8.5", @@ -2454,9 +2505,9 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.0" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", @@ -2539,9 +2590,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.2.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" dependencies = [ "bytes", "futures-channel", @@ -2581,7 +2632,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-util", "rustls 0.22.4", "rustls-pki-types", @@ -2592,28 +2643,29 @@ dependencies = [ [[package]] name = "hyper-timeout" -version = "0.4.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793" dependencies = [ - "hyper 0.14.30", + "hyper 1.4.1", + "hyper-util", "pin-project-lite", "tokio", - "tokio-io-timeout", + "tower-service", ] [[package]] name = "hyper-util" 
-version = "0.1.3" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", "futures-util", "http 1.1.0", "http-body 1.0.0", - "hyper 1.2.0", + "hyper 1.4.1", "pin-project-lite", "socket2", "tokio", @@ -3367,55 +3419,56 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.20.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" +checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" dependencies = [ - "opentelemetry_api", - "opentelemetry_sdk", + "futures-core", + "futures-sink", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", ] [[package]] name = "opentelemetry-http" -version = "0.9.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" +checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab" dependencies = [ "async-trait", "bytes", - "http 0.2.9", - "opentelemetry_api", - "reqwest 0.11.19", + "http 1.1.0", + "opentelemetry", + "reqwest 0.12.4", ] [[package]] name = "opentelemetry-otlp" -version = "0.13.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" +checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727" dependencies = [ "async-trait", "futures-core", - "http 0.2.9", + "http 1.1.0", + "opentelemetry", "opentelemetry-http", "opentelemetry-proto", - "opentelemetry-semantic-conventions", - "opentelemetry_api", "opentelemetry_sdk", "prost", - 
"reqwest 0.11.19", + "reqwest 0.12.4", "thiserror", - "tokio", - "tonic", ] [[package]] name = "opentelemetry-proto" -version = "0.3.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" +checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" dependencies = [ - "opentelemetry_api", + "opentelemetry", "opentelemetry_sdk", "prost", "tonic", @@ -3423,46 +3476,25 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.12.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" -dependencies = [ - "opentelemetry", -] - -[[package]] -name = "opentelemetry_api" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" -dependencies = [ - "futures-channel", - "futures-util", - "indexmap 1.9.3", - "js-sys", - "once_cell", - "pin-project-lite", - "thiserror", - "urlencoding", -] +checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05" [[package]] name = "opentelemetry_sdk" -version = "0.20.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" +checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" dependencies = [ "async-trait", - "crossbeam-channel", "futures-channel", "futures-executor", "futures-util", + "glob", "once_cell", - "opentelemetry_api", - "ordered-float 3.9.2", + "opentelemetry", "percent-encoding", "rand 0.8.5", - "regex", "serde_json", "thiserror", "tokio", @@ -3478,15 +3510,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ordered-float" -version = "3.9.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" -dependencies = [ - "num-traits", -] - [[package]] name = "ordered-multimap" version = "0.7.3" @@ -4142,16 +4165,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "prettyplease" -version = "0.1.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" -dependencies = [ - "proc-macro2", - "syn 1.0.109", -] - [[package]] name = "prettyplease" version = "0.2.17" @@ -4224,9 +4237,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.9" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" +checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" dependencies = [ "bytes", "prost-derive", @@ -4234,44 +4247,43 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.9" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" +checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck 0.4.1", - "itertools 0.10.5", - "lazy_static", + "heck 0.5.0", + "itertools 0.12.1", "log", "multimap", + "once_cell", "petgraph", - "prettyplease 0.1.25", + "prettyplease", "prost", "prost-types", "regex", - "syn 1.0.109", + "syn 2.0.52", "tempfile", - "which", ] [[package]] name = "prost-derive" -version = "0.11.9" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" +checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.12.1", "proc-macro2", "quote", - "syn 
1.0.109", + "syn 2.0.52", ] [[package]] name = "prost-types" -version = "0.11.9" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" +checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ "prost", ] @@ -4296,6 +4308,7 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "compute_api", "consumption_metrics", "dashmap", "ecdsa 0.16.9", @@ -4313,7 +4326,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.30", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-util", "indexmap 2.0.1", "ipnet", @@ -4369,7 +4382,6 @@ dependencies = [ "tokio-tungstenite", "tokio-util", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "try-lock", @@ -4680,7 +4692,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sync_wrapper", + "sync_wrapper 0.1.2", "test-context", "tokio", "tokio-stream", @@ -4745,7 +4757,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-rustls 0.26.0", "hyper-util", "ipnet", @@ -4761,7 +4773,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "sync_wrapper", + "sync_wrapper 0.1.2", "tokio", "tokio-rustls 0.25.0", "tokio-util", @@ -4802,7 +4814,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 1.1.0", - "hyper 1.2.0", + "hyper 1.4.1", "parking_lot 0.11.2", "reqwest 0.12.4", "reqwest-middleware", @@ -4814,9 +4826,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.0" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" +checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45" dependencies = [ "anyhow", "async-trait", @@ -5035,6 +5047,21 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls" +version = "0.23.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -5060,6 +5087,19 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pemfile" version = "1.0.2" @@ -5135,6 +5175,7 @@ dependencies = [ "fail", "futures", "hex", + "http 1.1.0", "humantime", "hyper 0.14.30", "metrics", @@ -5691,13 +5732,16 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", + "bytes", "clap", "const_format", "futures", "futures-core", "futures-util", + "http-body-util", "humantime", - "hyper 0.14.30", + "hyper 1.4.1", + "hyper-util", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5899,6 +5943,12 @@ dependencies = [ "futures-core", ] +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "synstructure" version = "0.12.6" @@ -6027,7 +6077,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float 2.10.1", + "ordered-float", ] [[package]] @@ -6129,9 +6179,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.37.0" +version = "1.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" +checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" dependencies = [ "backtrace", "bytes", @@ -6173,9 +6223,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -6242,10 +6292,21 @@ dependencies = [ ] [[package]] -name = "tokio-stream" -version = "0.1.14" +name = "tokio-rustls" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +dependencies = [ + "rustls 0.23.7", + "rustls-pki-types", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" dependencies = [ "futures-core", "pin-project-lite", @@ -6332,29 +6393,30 @@ dependencies = [ [[package]] name = "tonic" -version = "0.9.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ "async-stream", "async-trait", - "axum", - "base64 0.21.1", + "axum 0.7.5", + "base64 0.22.1", "bytes", - "futures-core", - "futures-util", - "h2 0.3.26", - "http 0.2.9", - "http-body 0.4.5", - "hyper 0.14.30", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.4.1", "hyper-timeout", + "hyper-util", "percent-encoding", "pin-project", "prost", - "rustls-native-certs 
0.6.2", - "rustls-pemfile 1.0.2", + "rustls-native-certs 0.8.0", + "rustls-pemfile 2.1.1", + "socket2", "tokio", - "tokio-rustls 0.24.0", + "tokio-rustls 0.26.0", "tokio-stream", "tower", "tower-layer", @@ -6364,15 +6426,16 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.9.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07" +checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" dependencies = [ - "prettyplease 0.1.25", + "prettyplease", "proc-macro2", "prost-build", + "prost-types", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -6409,11 +6472,10 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -6433,9 +6495,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.24" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", @@ -6444,9 +6506,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", @@ -6464,21 +6526,22 @@ dependencies = [ [[package]] name = 
"tracing-log" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ - "lazy_static", "log", + "once_cell", "tracing-core", ] [[package]] name = "tracing-opentelemetry" -version = "0.21.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" +checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" dependencies = [ + "js-sys", "once_cell", "opentelemetry", "opentelemetry_sdk", @@ -6487,6 +6550,7 @@ dependencies = [ "tracing-core", "tracing-log", "tracing-subscriber", + "web-time", ] [[package]] @@ -6501,9 +6565,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "matchers", "once_cell", @@ -6527,6 +6591,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", + "opentelemetry_sdk", "tokio", "tracing", "tracing-opentelemetry", @@ -6776,7 +6841,7 @@ name = "vm_monitor" version = "0.1.0" dependencies = [ "anyhow", - "axum", + "axum 0.6.20", "cgroups-rs", "clap", "futures", @@ -6982,6 +7047,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "webpki-roots" version = "0.25.2" @@ -6997,17 +7072,6 @@ dependencies = [ "rustls-pki-types", ] 
-[[package]] -name = "which" -version = "4.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269" -dependencies = [ - "either", - "libc", - "once_cell", -] - [[package]] name = "whoami" version = "1.5.1" @@ -7242,7 +7306,6 @@ dependencies = [ "aws-smithy-async", "aws-smithy-http", "aws-smithy-types", - "axum", "base64 0.21.1", "base64ct", "bytes", @@ -7251,7 +7314,6 @@ dependencies = [ "chrono", "clap", "clap_builder", - "crossbeam-utils", "crypto-bigint 0.5.5", "der 0.7.8", "deranged", @@ -7270,8 +7332,9 @@ dependencies = [ "hex", "hmac", "hyper 0.14.30", + "hyper 1.4.1", + "hyper-util", "indexmap 1.9.3", - "itertools 0.10.5", "itertools 0.12.1", "lazy_static", "libc", @@ -7283,6 +7346,7 @@ dependencies = [ "num-traits", "once_cell", "parquet", + "prettyplease", "proc-macro2", "prost", "quote", @@ -7290,7 +7354,6 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest 0.11.19", "reqwest 0.12.4", "rustls 0.21.11", "scopeguard", @@ -7303,20 +7366,18 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", - "sync_wrapper", + "sync_wrapper 0.1.2", "tikv-jemalloc-sys", "time", "time-macros", "tokio", - "tokio-rustls 0.24.0", + "tokio-stream", "tokio-util", "toml_edit", "tonic", "tower", "tracing", "tracing-core", - "tracing-log", - "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index a788dcf3cb..abdb978f3f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,14 +53,14 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [ flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.3", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.26" -aws-sdk-iam = "1.15.0" +aws-config = { version = "1.5", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.52" +aws-sdk-iam = "1.46.0" aws-smithy-async = { version = "1.2.1", default-features 
= false, features=["rt-tokio"] } -aws-smithy-types = "1.1.9" +aws-smithy-types = "1.2" aws-credential-types = "1.2.0" -aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } -aws-types = "1.2.0" +aws-sigv4 = { version = "1.2", features = ["sign-http"] } +aws-types = "1.3" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -96,9 +96,12 @@ hmac = "0.12.1" hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } +http-body-util = "0.1.2" humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" +hyper_1 = { package = "hyper", version = "1.4" } +hyper-util = "0.1" tokio-tungstenite = "0.20.0" indexmap = "2" indoc = "2" @@ -116,9 +119,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.20.0" -opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.12.0" +opentelemetry = "0.24" +opentelemetry_sdk = "0.24" +opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.16" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -126,12 +130,12 @@ pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency -prost = "0.11" +prost = "0.13" rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] } 
reqwest-middleware = "0.3.0" reqwest-retry = "0.5" routerify = "3" @@ -174,11 +178,11 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" -tracing-error = "0.2.0" -tracing-opentelemetry = "0.21.0" +tracing-error = "0.2" +tracing-opentelemetry = "0.25" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } @@ -242,7 +246,7 @@ criterion = "0.5.1" rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" -tonic-build = "0.9" +tonic-build = "0.12" [patch.crates-io] diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index c4209c7a12..d8bcacf228 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -13,6 +13,9 @@ RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] # System deps +# +# 'gdb' is included so that we get backtraces of core dumps produced in +# regression tests RUN set -e \ && apt update \ && apt install -y \ @@ -24,6 +27,7 @@ RUN set -e \ cmake \ curl \ flex \ + gdb \ git \ gnupg \ gzip \ diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 2c647a669c..eb4682445c 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -12,10 +12,25 @@ ARG DEBIAN_FLAVOR=bullseye-slim ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps ARG DEBIAN_FLAVOR -RUN apt update && \ + +RUN case $DEBIAN_FLAVOR in \ + # Version-specific installs for Bullseye (PG14-PG16): + # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. + # Install newer version (3.25) from backports. 
+ bullseye*) \ + echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + ;; \ + # Version-specific installs for Bookworm (PG17): + bookworm*) \ + VERSION_INSTALLS="cmake"; \ + ;; \ + esac && \ + apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ + $VERSION_INSTALLS ######################################################################################### # @@ -89,7 +104,7 @@ FROM build-deps AS postgis-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + apt install -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc @@ -200,27 +215,6 @@ FROM build-deps AS h3-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "$(uname -m)" in \ - "x86_64") \ - export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ - ;; \ - "aarch64") \ - export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \ - ;; \ - *) \ - echo "Unsupported architecture '$(uname -m)'. 
Supported are x86_64 and aarch64" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \ - -q -O /tmp/cmake-install.sh \ - && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - RUN case "${PG_VERSION}" in "v17") \ mkdir -p /h3/usr/ && \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -506,8 +500,6 @@ RUN case "${PG_VERSION}" in "v17") \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ esac && \ - apt-get update && \ - apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ @@ -596,7 +588,6 @@ RUN case "${PG_VERSION}" in "v17") \ esac && \ apt-get update && \ apt-get install -y \ - cmake \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ @@ -761,7 +752,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ - apt-get install -y curl libclang-dev cmake && \ + apt-get install -y curl libclang-dev && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -871,6 +862,28 @@ RUN case "${PG_VERSION}" in "v17") \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control +######################################################################################### +# +# Layer "pg-session-jwt-build" +# Compile "pg_session_jwt" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-session-jwt-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in "v17") \ + echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ + echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release + # it's needed to enable extension because it uses untrusted C language + # sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_session_jwt.control && \ + # echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_session_jwt.control + ######################################################################################### # # Layer "wal2json-build" @@ -967,6 +980,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1154,11 +1168,6 @@ RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -# cmake is required for the h3 test -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt-get update && apt-get install -y cmake RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ esac && \ @@ -1185,7 +1194,6 @@ ENV PGDATABASE=postgres ######################################################################################### FROM debian:$DEBIAN_FLAVOR ARG DEBIAN_FLAVOR -ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1258,7 +1266,7 @@ RUN apt update && \ libxml2 \ libxslt1.1 \ libzstd1 \ - libcurl4-openssl-dev \ + libcurl4 \ locales \ procps \ ca-certificates \ diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml index acb17d3cc0..92da0cdbdd 100644 --- a/compute/etc/neon_collector.yml +++ b/compute/etc/neon_collector.yml @@ -94,6 +94,68 @@ metrics: query: | select sum(pg_database_size(datname)) as total from pg_database; +- metric_name: getpage_wait_seconds_count + type: counter + help: 'Number of getpage requests' + values: [getpage_wait_seconds_count] + query_ref: neon_perf_counters + +- metric_name: getpage_wait_seconds_sum + type: counter + help: 'Time spent in getpage requests' + values: [getpage_wait_seconds_sum] + query_ref: neon_perf_counters + +- metric_name: getpage_prefetch_requests_total + type: counter + help: 'Number of getpage issued for prefetching' + values: [getpage_prefetch_requests_total] + query_ref: neon_perf_counters + +- metric_name: getpage_sync_requests_total + type: counter + help: 'Number of synchronous getpage issued' + values: [getpage_sync_requests_total] + query_ref: neon_perf_counters + +- metric_name: getpage_prefetch_misses_total + type: counter + help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' + values: [getpage_prefetch_misses_total] + query_ref: neon_perf_counters + +- metric_name: getpage_prefetch_discards_total + type: counter + help: 'Number of prefetch responses issued but 
not used' + values: [getpage_prefetch_discards_total] + query_ref: neon_perf_counters + +- metric_name: pageserver_requests_sent_total + type: counter + help: 'Number of all requests sent to the pageserver (not just GetPage requests)' + values: [pageserver_requests_sent_total] + query_ref: neon_perf_counters + +- metric_name: pageserver_disconnects_total + type: counter + help: 'Number of times that the connection to the pageserver was lost' + values: [pageserver_disconnects_total] + query_ref: neon_perf_counters + +- metric_name: pageserver_send_flushes_total + type: counter + help: 'Number of flushes to the pageserver connection' + values: [pageserver_send_flushes_total] + query_ref: neon_perf_counters + +- metric_name: getpage_wait_seconds_bucket + type: counter + help: 'Histogram buckets of getpage request latency' + key_labels: + - bucket_le + values: [value] + query_ref: getpage_wait_seconds_buckets + # DEPRECATED - metric_name: lfc_approximate_working_set_size type: gauge @@ -244,3 +306,26 @@ metrics: SELECT slot_name, CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost FROM pg_replication_slots; + +queries: + - query_name: neon_perf_counters + query: | + WITH c AS ( + SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters + ) + SELECT d.* + FROM pg_catalog.jsonb_to_record((select jb from c)) as d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric + ); + + - query_name: getpage_wait_seconds_buckets + query: | + SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec.yaml index 0af44745e5..50fcd62e4f 100644 --- 
a/compute/vm-image-spec.yaml +++ b/compute/vm-image-spec.yaml @@ -11,6 +11,10 @@ commands: user: root sysvInitAction: sysinit shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -30,11 +34,12 @@ commands: shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - - filename: compute_ctl-resize-swap + - filename: compute_ctl-sudoers content: | # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap - # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -100,7 +105,7 @@ merge: | && apt install --no-install-recommends -y \ sudo \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers COPY cgconfig.conf /etc/cgconfig.conf diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 00a82e4be6..b6d84d7eff 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -21,6 +21,7 @@ nix.workspace = true notify.workspace = true num_cpus.workspace = true opentelemetry.workspace = true +opentelemetry_sdk.workspace = true postgres.workspace = true regex.workspace = true serde_json.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9499a7186e..109d315d67 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ 
b/compute_tools/src/bin/compute_ctl.rs @@ -44,6 +44,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use compute_tools::disk_quota::set_disk_quota; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; @@ -151,6 +152,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); + let set_disk_quota_for_fs = matches.get_one::("set-disk-quota-for-fs"); Ok(ProcessCliResult { connstr, @@ -161,6 +163,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result { spec_json, spec_path, resize_swap_on_bind, + set_disk_quota_for_fs, }) } @@ -173,6 +176,7 @@ struct ProcessCliResult<'clap> { spec_json: Option<&'clap String>, spec_path: Option<&'clap String>, resize_swap_on_bind: bool, + set_disk_quota_for_fs: Option<&'clap String>, } fn startup_context_from_env() -> Option { @@ -214,7 +218,7 @@ fn startup_context_from_env() -> Option { } if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; - use opentelemetry::sdk::propagation::TraceContextPropagator; + use opentelemetry_sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() .extract(&startup_tracing_carrier) .attach(); @@ -293,6 +297,7 @@ fn wait_spec( pgbin, ext_remote_storage, resize_swap_on_bind, + set_disk_quota_for_fs, http_port, .. 
}: ProcessCliResult, @@ -373,6 +378,7 @@ fn wait_spec( compute, http_port, resize_swap_on_bind, + set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), }) } @@ -381,6 +387,7 @@ struct WaitSpecResult { // passed through from ProcessCliResult http_port: u16, resize_swap_on_bind: bool, + set_disk_quota_for_fs: Option, } fn start_postgres( @@ -390,6 +397,7 @@ fn start_postgres( compute, http_port, resize_swap_on_bind, + set_disk_quota_for_fs, }: WaitSpecResult, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. @@ -403,6 +411,7 @@ fn start_postgres( ); // before we release the mutex, fetch the swap size (if any) for later. let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; + let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes; drop(state); // Launch remaining service threads @@ -422,8 +431,8 @@ fn start_postgres( // OOM-killed during startup because swap wasn't available yet. match resize_swap(size_bytes) { Ok(()) => { - let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%size_bytes, %size_gib, "resized swap"); + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_mib, "resized swap"); } Err(err) => { let err = err.context("failed to resize swap"); @@ -432,10 +441,29 @@ fn start_postgres( // Mark compute startup as failed; don't try to start postgres, and report this // error to the control plane when it next asks. 
prestartup_failed = true; - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{err:?}")); - state.status = ComputeStatus::Failed; - compute.state_changed.notify_all(); + compute.set_failed_status(err); + delay_exit = true; + } + } + } + + // Set disk quota if the compute spec says so + if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = + (disk_quota_bytes, set_disk_quota_for_fs) + { + match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + Ok(()) => { + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%disk_quota_bytes, %size_mib, "set disk quota"); + } + Err(err) => { + let err = err.context("failed to set disk quota"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. + prestartup_failed = true; + compute.set_failed_status(err); delay_exit = true; } } @@ -450,16 +478,7 @@ fn start_postgres( Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. 
- compute.state_changed.notify_all(); - drop(state); // unlock + compute.set_failed_status(err); delay_exit = true; None } @@ -750,6 +769,11 @@ fn cli() -> clap::Command { .long("resize-swap-on-bind") .action(clap::ArgAction::SetTrue), ) + .arg( + Arg::new("set-disk-quota-for-fs") + .long("set-disk-quota-for-fs") + .value_name("SET_DISK_QUOTA_FOR_FS") + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1f47bb58a3..2f6e2bdb2c 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; use std::sync::{Condvar, Mutex, RwLock}; use std::thread; +use std::time::Duration; use std::time::Instant; use anyhow::{Context, Result}; @@ -305,6 +306,13 @@ impl ComputeNode { self.state_changed.notify_all(); } + pub fn set_failed_status(&self, err: anyhow::Error) { + let mut state = self.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + self.state_changed.notify_all(); + } + pub fn get_status(&self) -> ComputeStatus { self.state.lock().unwrap().status } @@ -710,7 +718,7 @@ impl ComputeNode { info!("running initdb"); let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); Command::new(initdb_bin) - .args(["-D", pgdata]) + .args(["--pgdata", pgdata]) .output() .expect("cannot start initdb process"); @@ -1052,19 +1060,26 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... 
- config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override( + pgdata_path, + "neon.max_cluster_size=-1", + || { + self.pg_reload_conf()?; + + self.apply_config(&compute_state)?; + + Ok(()) + }, + )?; self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - })?; - self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); @@ -1123,6 +1138,9 @@ impl ComputeNode { // // Use that as a default location and pattern, except macos where core dumps are written // to /cores/ directory by default. + // + // With default Linux settings, the core dump file is called just "core", so check for + // that too. pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), @@ -1134,8 +1152,17 @@ impl ComputeNode { let files = fs::read_dir(core_dump_dir)?; let cores = files.filter_map(|entry| { let entry = entry.ok()?; - let _ = entry.file_name().to_str()?.strip_prefix("core.")?; - Some(entry.path()) + + let is_core_dump = match entry.file_name().to_str()? { + n if n.starts_with("core.") => true, + "core" => true, + _ => false, + }; + if is_core_dump { + Some(entry.path()) + } else { + None + } }); // Print backtrace for each core dump @@ -1386,6 +1413,36 @@ LIMIT 100", } Ok(remote_ext_metrics) } + + /// Waits until current thread receives a state changed notification and + /// the pageserver connection strings has changed. + /// + /// The operation will time out after a specified duration. 
+ pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) { + let state = self.state.lock().unwrap(); + let old_pageserver_connstr = state + .pspec + .as_ref() + .expect("spec must be set") + .pageserver_connstr + .clone(); + let mut unchanged = true; + let _ = self + .state_changed + .wait_timeout_while(state, duration, |s| { + let pageserver_connstr = &s + .pspec + .as_ref() + .expect("spec must be set") + .pageserver_connstr; + unchanged = pageserver_connstr == &old_pageserver_connstr; + unchanged + }) + .unwrap(); + if !unchanged { + info!("Pageserver config changed"); + } + } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index 274a221ac7..7bd0e4938d 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -11,9 +11,17 @@ use crate::compute::ComputeNode; fn configurator_main_loop(compute: &Arc) { info!("waiting for reconfiguration requests"); loop { - let state = compute.state.lock().unwrap(); - let mut state = compute.state_changed.wait(state).unwrap(); + let mut state = compute.state.lock().unwrap(); + // We have to re-check the status after re-acquiring the lock because it could be that + // the status has changed while we were waiting for the lock, and we might not need to + // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e. + // we are waiting for a condition variable that will never be signaled. 
+ if state.status != ComputeStatus::ConfigurationPending { + state = compute.state_changed.wait(state).unwrap(); + } + + // Re-check the status after waking up if state.status == ComputeStatus::ConfigurationPending { info!("got configuration request"); state.status = ComputeStatus::Configuration; diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs new file mode 100644 index 0000000000..e838c5b9fd --- /dev/null +++ b/compute_tools/src/disk_quota.rs @@ -0,0 +1,25 @@ +use anyhow::Context; + +pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota"; + +/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes. +/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set. +pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { + let size_kb = size_bytes / 1024; + // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` + let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(DISK_QUOTA_BIN) + .arg(size_kb.to_string()) + .arg(fs_mountpoint) + .spawn(); + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow::anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`")) +} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index c402d63305..c5b4ca632c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -10,6 +10,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod disk_quota; pub mod extension_server; pub mod lsn_lease; mod migration; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 84be5b0809..00be5c13f9 100644 --- a/compute_tools/src/logger.rs +++ 
b/compute_tools/src/logger.rs @@ -1,4 +1,3 @@ -use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -23,8 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = - tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl"); // Put it all together tracing_subscriber::registry() diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 7e5917c55f..3061d387a5 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -57,10 +57,10 @@ fn lsn_lease_bg_task( .max(valid_duration / 2); info!( - "Succeeded, sleeping for {} seconds", + "Request succeeded, sleeping for {} seconds", sleep_duration.as_secs() ); - thread::sleep(sleep_duration); + compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration); } } @@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry( .map(|connstr| { let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); if let Some(storage_auth_token) = &spec.storage_auth_token { - info!("Got storage auth token from spec file"); config.password(storage_auth_token.clone()); - } else { - info!("Storage auth token not set"); } config }) @@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry( bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff"); } Err(e) => { - warn!("Failed to acquire lsn lease: {e} (attempt {attempts}"); + warn!("Failed to acquire lsn lease: {e} (attempt {attempts})"); - thread::sleep(Duration::from_millis(retry_period_ms as u64)); + compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis( + retry_period_ms as u64, + )); retry_period_ms *= 1.5; retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS); } diff --git 
a/control_plane/Cargo.toml b/control_plane/Cargo.toml index df87c181bf..355eca0fe5 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,6 +9,7 @@ anyhow.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +futures.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 92f609761a..624936620d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -6,7 +6,7 @@ //! rely on `neon_local` to set up the environment for each test. //! use anyhow::{anyhow, bail, Context, Result}; -use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; +use clap::Parser; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{ @@ -56,10 +56,627 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "16"; +const DEFAULT_PG_VERSION: u32 = 16; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; +#[derive(clap::Parser)] +#[command(version = GIT_VERSION, about, name = "Neon CLI")] +struct Cli { + #[command(subcommand)] + command: NeonLocalCmd, +} + +#[derive(clap::Subcommand)] +enum NeonLocalCmd { + Init(InitCmdArgs), + + #[command(subcommand)] + Tenant(TenantCmd), + #[command(subcommand)] + Timeline(TimelineCmd), + #[command(subcommand)] + Pageserver(PageserverCmd), + #[command(subcommand)] + #[clap(alias = "storage_controller")] + StorageController(StorageControllerCmd), + #[command(subcommand)] + #[clap(alias = "storage_broker")] + StorageBroker(StorageBrokerCmd), + #[command(subcommand)] + Safekeeper(SafekeeperCmd), + #[command(subcommand)] + Endpoint(EndpointCmd), + #[command(subcommand)] + Mappings(MappingsCmd), + + Start(StartCmdArgs), + 
Stop(StopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")] +struct InitCmdArgs { + #[clap(long, help("How many pageservers to create (default 1)"))] + num_pageservers: Option, + + #[clap(long)] + config: Option, + + #[clap(long, help("Force initialization even if the repository is not empty"))] + #[arg(value_parser)] + #[clap(default_value = "must-not-exist")] + force: InitForceMode, +} + +#[derive(clap::Args)] +#[clap(about = "Start pageserver and safekeepers")] +struct StartCmdArgs { + #[clap(long = "start-timeout", default_value = "10s")] + timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop pageserver and safekeepers")] +struct StopCmdArgs { + #[arg(value_enum)] + #[clap(long, default_value_t = StopMode::Fast)] + mode: StopMode, +} + +#[derive(Clone, Copy, clap::ValueEnum)] +enum StopMode { + Fast, + Immediate, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage tenants")] +enum TenantCmd { + List, + Create(TenantCreateCmdArgs), + SetDefault(TenantSetDefaultCmdArgs), + Config(TenantConfigCmdArgs), + Import(TenantImportCmdArgs), +} + +#[derive(clap::Args)] +struct TenantCreateCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. 
Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap( + long, + help = "Use a specific timeline id when creating a tenant and its initial timeline" + )] + timeline_id: Option, + + #[clap(short = 'c')] + config: Vec, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version to use for the initial timeline")] + pg_version: u32, + + #[clap( + long, + help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified" + )] + set_default: bool, + + #[clap(long, help = "Number of shards in the new tenant")] + #[arg(default_value_t = 0)] + shard_count: u8, + #[clap(long, help = "Sharding stripe size in pages")] + shard_stripe_size: Option, + + #[clap(long, help = "Placement policy shards in this tenant")] + #[arg(value_parser = parse_placement_policy)] + placement_policy: Option, +} + +fn parse_placement_policy(s: &str) -> anyhow::Result { + Ok(serde_json::from_str::(s)?) +} + +#[derive(clap::Args)] +#[clap( + about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified" +)] +struct TenantSetDefaultCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: TenantId, +} + +#[derive(clap::Args)] +struct TenantConfigCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(short = 'c')] + config: Vec, +} + +#[derive(clap::Args)] +#[clap( + about = "Import a tenant that is present in remote storage, and create branches for its timelines" +)] +struct TenantImportCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. 
Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: TenantId, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage timelines")] +enum TimelineCmd { + List(TimelineListCmdArgs), + Branch(TimelineBranchCmdArgs), + Create(TimelineCreateCmdArgs), + Import(TimelineImportCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "List all timelines available to this pageserver")] +struct TimelineListCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_shard_id: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Create a new timeline, branching off from another timeline")] +struct TimelineBranchCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(long, help = "New timeline's ID")] + timeline_id: Option, + + #[clap(long, help = "Human-readable alias for the new timeline")] + branch_name: String, + + #[clap( + long, + help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name." + )] + ancestor_branch_name: Option, + + #[clap( + long, + help = "When using another timeline as base, use a specific Lsn in it instead of the latest one" + )] + ancestor_start_lsn: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Create a new blank timeline")] +struct TimelineCreateCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. 
Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(long, help = "New timeline's ID")] + timeline_id: Option, + + #[clap(long, help = "Human-readable alias for the new timeline")] + branch_name: String, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version")] + pg_version: u32, +} + +#[derive(clap::Args)] +#[clap(about = "Import timeline from a basebackup directory")] +struct TimelineImportCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(long, help = "New timeline's ID")] + timeline_id: TimelineId, + + #[clap(long, help = "Human-readable alias for the new timeline")] + branch_name: String, + + #[clap(long, help = "Basebackup tarfile to import")] + base_tarfile: PathBuf, + + #[clap(long, help = "Lsn the basebackup starts at")] + base_lsn: Lsn, + + #[clap(long, help = "Wal to add after base")] + wal_tarfile: Option, + + #[clap(long, help = "Lsn the basebackup ends at")] + end_lsn: Option, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version of the backup being imported")] + pg_version: u32, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage pageservers")] +enum PageserverCmd { + Status(PageserverStatusCmdArgs), + Start(PageserverStartCmdArgs), + Stop(PageserverStopCmdArgs), + Restart(PageserverRestartCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Show status of a local pageserver")] +struct PageserverStatusCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Start local pageserver")] +struct PageserverStartCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] 
+#[clap(about = "Stop local pageserver")] +struct PageserverStopCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, + + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + #[arg(value_enum, default_value = "fast")] + stop_mode: StopMode, +} + +#[derive(clap::Args)] +#[clap(about = "Restart local pageserver")] +struct PageserverRestartCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage storage controller")] +enum StorageControllerCmd { + Start(StorageControllerStartCmdArgs), + Stop(StorageControllerStopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Start storage controller")] +struct StorageControllerStartCmdArgs { + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, + + #[clap( + long, + help = "Identifier used to distinguish storage controller instances" + )] + #[arg(default_value_t = 1)] + instance_id: u8, + + #[clap( + long, + help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)" + )] + base_port: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Stop storage controller")] +struct StorageControllerStopCmdArgs { + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + #[arg(value_enum, default_value = "fast")] + stop_mode: StopMode, + + #[clap( + long, + help = "Identifier used to distinguish storage controller instances" + )] + #[arg(default_value_t = 1)] + instance_id: u8, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage storage broker")] +enum StorageBrokerCmd { + Start(StorageBrokerStartCmdArgs), + Stop(StorageBrokerStopCmdArgs), +} + 
+#[derive(clap::Args)] +#[clap(about = "Start broker")] +struct StorageBrokerStartCmdArgs { + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "stop broker")] +struct StorageBrokerStopCmdArgs { + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + #[arg(value_enum, default_value = "fast")] + stop_mode: StopMode, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage safekeepers")] +enum SafekeeperCmd { + Start(SafekeeperStartCmdArgs), + Stop(SafekeeperStopCmdArgs), + Restart(SafekeeperRestartCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Start local safekeeper")] +struct SafekeeperStartCmdArgs { + #[clap(help = "safekeeper id")] + #[arg(default_value_t = NodeId(1))] + id: NodeId, + + #[clap( + short = 'e', + long = "safekeeper-extra-opt", + help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" + )] + extra_opt: Vec, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop local safekeeper")] +struct SafekeeperStopCmdArgs { + #[clap(help = "safekeeper id")] + #[arg(default_value_t = NodeId(1))] + id: NodeId, + + #[arg(value_enum, default_value = "fast")] + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + stop_mode: StopMode, +} + +#[derive(clap::Args)] +#[clap(about = "Restart local safekeeper")] +struct SafekeeperRestartCmdArgs { + #[clap(help = "safekeeper id")] + #[arg(default_value_t = NodeId(1))] + id: NodeId, + + #[arg(value_enum, default_value = "fast")] + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + stop_mode: StopMode, + + #[clap( + short = 'e', + long = "safekeeper-extra-opt", + help = "Additional 
safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" + )] + extra_opt: Vec, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage Postgres instances")] +enum EndpointCmd { + List(EndpointListCmdArgs), + Create(EndpointCreateCmdArgs), + Start(EndpointStartCmdArgs), + Reconfigure(EndpointReconfigureCmdArgs), + Stop(EndpointStopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "List endpoints")] +struct EndpointListCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_shard_id: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Create a compute endpoint")] +struct EndpointCreateCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(help = "Postgres endpoint id")] + endpoint_id: Option, + #[clap(long, help = "Name of the branch the endpoint will run on")] + branch_name: Option, + #[clap( + long, + help = "Specify Lsn on the timeline to start from. 
By default, end of the timeline would be used" + )] + lsn: Option, + #[clap(long)] + pg_port: Option, + #[clap(long)] + http_port: Option, + #[clap(long = "pageserver-id")] + endpoint_pageserver_id: Option, + + #[clap( + long, + help = "Don't do basebackup, create endpoint directory with only config files", + action = clap::ArgAction::Set, + default_value_t = false + )] + config_only: bool, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version")] + pg_version: u32, + + #[clap( + long, + help = "If set, the node will be a hot replica on the specified timeline", + action = clap::ArgAction::Set, + default_value_t = false + )] + hot_standby: bool, + + #[clap(long, help = "If set, will set up the catalog for neon_superuser")] + update_catalog: bool, + + #[clap( + long, + help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." + )] + allow_multiple: bool, +} + +#[derive(clap::Args)] +#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")] +struct EndpointStartCmdArgs { + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, + #[clap(long = "pageserver-id")] + endpoint_pageserver_id: Option, + + #[clap(long)] + safekeepers: Option, + + #[clap( + long, + help = "Configure the remote extensions storage proxy gateway to request for extensions." + )] + remote_ext_config: Option, + + #[clap( + long, + help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`" + )] + create_test_user: bool, + + #[clap( + long, + help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." 
+ )] + allow_multiple: bool, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Reconfigure an endpoint")] +struct EndpointReconfigureCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, + #[clap(long = "pageserver-id")] + endpoint_pageserver_id: Option, + + #[clap(long)] + safekeepers: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Stop an endpoint")] +struct EndpointStopCmdArgs { + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, + + #[clap( + long, + help = "Also delete data directory (now optional, should be default in future)" + )] + destroy: bool, + + #[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m \"")] + #[arg(value_parser(["smart", "fast", "immediate"]))] + #[arg(default_value = "fast")] + mode: String, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage neon_local branch name mappings")] +enum MappingsCmd { + Map(MappingsMapCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Create new mapping which cannot exist already")] +struct MappingsMapCmdArgs { + #[clap( + long, + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: TenantId, + #[clap( + long, + help = "Timeline id. Represented as a hexadecimal string 32 symbols length" + )] + timeline_id: TimelineId, + #[clap(long, help = "Branch name to give to the timeline")] + branch_name: String, +} + /// /// Timelines tree element used as a value in the HashMap. 
/// @@ -80,19 +697,13 @@ struct TimelineTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - let matches = cli().get_matches(); - - let (sub_name, sub_args) = match matches.subcommand() { - Some(subcommand_data) => subcommand_data, - None => bail!("no subcommand provided"), - }; + let cli = Cli::parse(); // Check for 'neon init' command first. - let subcommand_result = if sub_name == "init" { - handle_init(sub_args).map(|env| Some(Cow::Owned(env))) + let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command { + handle_init(&args).map(|env| Some(Cow::Owned(env))) } else { // all other commands need an existing config - let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); let env = Box::leak(Box::new(env)); @@ -101,19 +712,20 @@ fn main() -> Result<()> { .build() .unwrap(); - let subcommand_result = match sub_name { - "tenant" => rt.block_on(handle_tenant(sub_args, env)), - "timeline" => rt.block_on(handle_timeline(sub_args, env)), - "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))), - "stop" => rt.block_on(handle_stop_all(sub_args, env)), - "pageserver" => rt.block_on(handle_pageserver(sub_args, env)), - "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)), - "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)), - "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)), - "endpoint" => rt.block_on(handle_endpoint(sub_args, env)), - "mappings" => handle_mappings(sub_args, env), - "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), - _ => bail!("unexpected subcommand {sub_name}"), + let subcommand_result = match cli.command { + NeonLocalCmd::Init(_) => unreachable!("init was handled earlier already"), + NeonLocalCmd::Start(args) => rt.block_on(handle_start_all(&args, env)), + NeonLocalCmd::Stop(args) => 
rt.block_on(handle_stop_all(&args, env)), + NeonLocalCmd::Tenant(subcmd) => rt.block_on(handle_tenant(&subcmd, env)), + NeonLocalCmd::Timeline(subcmd) => rt.block_on(handle_timeline(&subcmd, env)), + NeonLocalCmd::Pageserver(subcmd) => rt.block_on(handle_pageserver(&subcmd, env)), + NeonLocalCmd::StorageController(subcmd) => { + rt.block_on(handle_storage_controller(&subcmd, env)) + } + NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)), + NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)), + NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)), + NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; if &original_env != env { @@ -263,10 +875,13 @@ async fn get_timeline_infos( .collect()) } -// Helper function to parse --tenant_id option, or get the default from config file -fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { - if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { - tenant_id_from_arguments +/// Helper function to get tenant id from an optional --tenant_id option or from the config file +fn get_tenant_id( + tenant_id_arg: Option, + env: &local_env::LocalEnv, +) -> anyhow::Result { + if let Some(tenant_id_from_arguments) = tenant_id_arg { + Ok(tenant_id_from_arguments) } else if let Some(default_id) = env.default_tenant_id { Ok(default_id) } else { @@ -274,13 +889,14 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -// Helper function to parse --tenant_id option, for commands that accept a shard suffix +/// Helper function to get tenant-shard ID from an optional --tenant_id option or from the config file, +/// for commands that accept a shard suffix fn get_tenant_shard_id( - sub_match: &ArgMatches, + tenant_shard_id_arg: Option, env: &local_env::LocalEnv, ) -> anyhow::Result { - if let Some(tenant_id_from_arguments) = 
parse_tenant_shard_id(sub_match).transpose() { - tenant_id_from_arguments + if let Some(tenant_id_from_arguments) = tenant_shard_id_arg { + Ok(tenant_id_from_arguments) } else if let Some(default_id) = env.default_tenant_id { Ok(TenantShardId::unsharded(default_id)) } else { @@ -288,41 +904,11 @@ fn get_tenant_shard_id( } } -fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("tenant-id") - .map(|tenant_id| TenantId::from_str(tenant_id)) - .transpose() - .context("Failed to parse tenant id from the argument string") -} - -fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("tenant-id") - .map(|id_str| TenantShardId::from_str(id_str)) - .transpose() - .context("Failed to parse tenant shard id from the argument string") -} - -fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("timeline-id") - .map(|timeline_id| TimelineId::from_str(timeline_id)) - .transpose() - .context("Failed to parse timeline id from the argument string") -} - -fn handle_init(init_match: &ArgMatches) -> anyhow::Result { - let num_pageservers = init_match.get_one::("num-pageservers"); - - let force = init_match.get_one("force").expect("we set a default value"); - +fn handle_init(args: &InitCmdArgs) -> anyhow::Result { // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. - let init_conf: NeonLocalInitConf = if let Some(config_path) = - init_match.get_one::("config") - { + let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config { // User (likely the Python test suite) provided a description of the environment. 
- if num_pageservers.is_some() { + if args.num_pageservers.is_some() { bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); } // load and parse the file @@ -346,7 +932,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, ..Default::default() }], - pageservers: (0..num_pageservers.copied().unwrap_or(1)) + pageservers: (0..args.num_pageservers.unwrap_or(1)) .map(|i| { let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; @@ -369,7 +955,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { } }; - LocalEnv::init(init_conf, force) + LocalEnv::init(init_conf, &args.force) .context("materialize initial neon_local environment on disk")?; Ok(LocalEnv::load_config(&local_env::base_path()) .expect("freshly written config should be loadable")) @@ -387,19 +973,16 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -async fn handle_tenant( - tenant_match: &ArgMatches, - env: &mut local_env::LocalEnv, -) -> anyhow::Result<()> { +async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { let pageserver = get_default_pageserver(env); - match tenant_match.subcommand() { - Some(("list", _)) => { + match subcmd { + TenantCmd::List => { for t in pageserver.tenant_list().await? 
{ println!("{} {:?}", t.id, t.state); } } - Some(("import", import_match)) => { - let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + TenantCmd::Import(args) => { + let tenant_id = args.tenant_id; let storage_controller = StorageController::from_env(env); let create_response = storage_controller.tenant_import(tenant_id).await?; @@ -446,31 +1029,14 @@ async fn handle_tenant( env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; } } - Some(("create", create_match)) => { - let tenant_conf: HashMap<_, _> = create_match - .get_many::("config") - .map(|vals: clap::parser::ValuesRef<'_, String>| { - vals.flat_map(|c| c.split_once(':')).collect() - }) - .unwrap_or_default(); - - let shard_count: u8 = create_match - .get_one::("shard-count") - .cloned() - .unwrap_or(0); - - let shard_stripe_size: Option = - create_match.get_one::("shard-stripe-size").cloned(); - - let placement_policy = match create_match.get_one::("placement-policy") { - Some(s) if !s.is_empty() => serde_json::from_str::(s)?, - _ => PlacementPolicy::Attached(0), - }; + TenantCmd::Create(args) => { + let tenant_conf: HashMap<_, _> = + args.config.iter().flat_map(|c| c.split_once(':')).collect(); let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one - let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); + let tenant_id = args.tenant_id.unwrap_or_else(TenantId::generate); // We must register the tenant with the storage controller, so // that when the pageserver restarts, it will be re-attached. 
@@ -478,29 +1044,26 @@ async fn handle_tenant( storage_controller .tenant_create(TenantCreateRequest { // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the - // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest - // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards) + // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest + // type is used both in the storage controller (for creating tenants) and in the pageserver (for + // creating shards) new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { - count: ShardCount::new(shard_count), - stripe_size: shard_stripe_size + count: ShardCount::new(args.shard_count), + stripe_size: args + .shard_stripe_size .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, - placement_policy: Some(placement_policy), + placement_policy: args.placement_policy.clone(), config: tenant_conf, }) .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant - let new_timeline_id = - parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate()); - let pg_version = create_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; + let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate()); // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have // different shards picking different start lsns. 
Maybe we have to teach storage controller @@ -513,7 +1076,7 @@ async fn handle_tenant( ancestor_timeline_id: None, ancestor_start_lsn: None, existing_initdb_timeline_id: None, - pg_version: Some(pg_version), + pg_version: Some(args.pg_version), }, ) .await?; @@ -526,23 +1089,19 @@ async fn handle_tenant( println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",); - if create_match.get_flag("set-default") { + if args.set_default { println!("Setting tenant {tenant_id} as a default one"); env.default_tenant_id = Some(tenant_id); } } - Some(("set-default", set_default_match)) => { - let tenant_id = - parse_tenant_id(set_default_match)?.context("No tenant id specified")?; - println!("Setting tenant {tenant_id} as a default one"); - env.default_tenant_id = Some(tenant_id); + TenantCmd::SetDefault(args) => { + println!("Setting tenant {} as a default one", args.tenant_id); + env.default_tenant_id = Some(args.tenant_id); } - Some(("config", create_match)) => { - let tenant_id = get_tenant_id(create_match, env)?; - let tenant_conf: HashMap<_, _> = create_match - .get_many::("config") - .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) - .unwrap_or_default(); + TenantCmd::Config(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let tenant_conf: HashMap<_, _> = + args.config.iter().flat_map(|c| c.split_once(':')).collect(); pageserver .tenant_config(tenant_id, tenant_conf) @@ -550,36 +1109,25 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), - None => bail!("no tenant subcommand provided"), } Ok(()) } -async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = 
get_default_pageserver(env); - match timeline_match.subcommand() { - Some(("list", list_match)) => { + match cmd { + TimelineCmd::List(args) => { // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. - let tenant_shard_id = get_tenant_shard_id(list_match, env)?; + let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; let timelines = pageserver.timeline_list(&tenant_shard_id).await?; print_timelines_tree(timelines, env.timeline_name_mappings())?; } - Some(("create", create_match)) => { - let tenant_id = get_tenant_id(create_match, env)?; - let new_branch_name = create_match - .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; - - let pg_version = create_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let new_timeline_id_opt = parse_timeline_id(create_match)?; + TimelineCmd::Create(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let new_branch_name = &args.branch_name; + let new_timeline_id_opt = args.timeline_id; let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); let storage_controller = StorageController::from_env(env); @@ -588,7 +1136,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_timeline_id: None, existing_initdb_timeline_id: None, ancestor_start_lsn: None, - pg_version: Some(pg_version), + pg_version: Some(args.pg_version), }; let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) @@ -602,67 +1150,42 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local timeline_info.timeline_id ); } - Some(("import", import_match)) => { - let tenant_id = get_tenant_id(import_match, env)?; - let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); - let branch_name = import_match 
- .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; + TimelineCmd::Import(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let timeline_id = args.timeline_id; + let branch_name = &args.branch_name; // Parse base inputs - let base_tarfile = import_match - .get_one::("base-tarfile") - .ok_or_else(|| anyhow!("No base-tarfile provided"))? - .to_owned(); - let base_lsn = Lsn::from_str( - import_match - .get_one::("base-lsn") - .ok_or_else(|| anyhow!("No base-lsn provided"))?, - )?; - let base = (base_lsn, base_tarfile); + let base = (args.base_lsn, args.base_tarfile.clone()); // Parse pg_wal inputs - let wal_tarfile = import_match.get_one::("wal-tarfile").cloned(); - let end_lsn = import_match - .get_one::("end-lsn") - .map(|s| Lsn::from_str(s).unwrap()); + let wal_tarfile = args.wal_tarfile.clone(); + let end_lsn = args.end_lsn; // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); - let pg_version = import_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - println!("Importing timeline into pageserver ..."); pageserver - .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) + .timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version) .await?; env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } - Some(("branch", branch_match)) => { - let tenant_id = get_tenant_id(branch_match, env)?; - let new_timeline_id = - parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate()); - let new_branch_name = branch_match - .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; - let ancestor_branch_name = branch_match - .get_one::("ancestor-branch-name") - .map(|s| s.as_str()) - .unwrap_or(DEFAULT_BRANCH_NAME); + TimelineCmd::Branch(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let new_timeline_id = 
args.timeline_id.unwrap_or(TimelineId::generate()); + let new_branch_name = &args.branch_name; + let ancestor_branch_name = args + .ancestor_branch_name + .clone() + .unwrap_or(DEFAULT_BRANCH_NAME.to_owned()); let ancestor_timeline_id = env - .get_branch_timeline_id(ancestor_branch_name, tenant_id) + .get_branch_timeline_id(&ancestor_branch_name, tenant_id) .ok_or_else(|| { anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; - let start_lsn = branch_match - .get_one::("ancestor-start-lsn") - .map(|lsn_str| Lsn::from_str(lsn_str)) - .transpose() - .context("Failed to parse ancestor start Lsn from the request")?; + let start_lsn = args.ancestor_start_lsn; let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, @@ -684,25 +1207,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local timeline_info.timeline_id ); } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), - None => bail!("no tenant subcommand provided"), } Ok(()) } -async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match ep_match.subcommand() { - Some(ep_subcommand_data) => ep_subcommand_data, - None => bail!("no endpoint subcommand provided"), - }; +async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; - match sub_name { - "list" => { + match subcmd { + EndpointCmd::List(args) => { // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. 
- let tenant_shard_id = get_tenant_shard_id(sub_args, env)?; + let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; let timeline_infos = get_timeline_infos(env, &tenant_shard_id) .await .unwrap_or_else(|e| { @@ -766,52 +1283,29 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re println!("{table}"); } - "create" => { - let tenant_id = get_tenant_id(sub_args, env)?; - let branch_name = sub_args - .get_one::("branch-name") - .map(|s| s.as_str()) - .unwrap_or(DEFAULT_BRANCH_NAME); - let endpoint_id = sub_args - .get_one::("endpoint_id") - .map(String::to_string) + EndpointCmd::Create(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let branch_name = args + .branch_name + .clone() + .unwrap_or(DEFAULT_BRANCH_NAME.to_owned()); + let endpoint_id = args + .endpoint_id + .clone() .unwrap_or_else(|| format!("ep-{branch_name}")); - let update_catalog = sub_args - .get_one::("update-catalog") - .cloned() - .unwrap_or_default(); - let lsn = sub_args - .get_one::("lsn") - .map(|lsn_str| Lsn::from_str(lsn_str)) - .transpose() - .context("Failed to parse Lsn from the request")?; let timeline_id = env - .get_branch_timeline_id(branch_name, tenant_id) + .get_branch_timeline_id(&branch_name, tenant_id) .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; - let pg_port: Option = sub_args.get_one::("pg-port").copied(); - let http_port: Option = sub_args.get_one::("http-port").copied(); - let pg_version = sub_args - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let hot_standby = sub_args - .get_one::("hot-standby") - .copied() - .unwrap_or(false); - - let allow_multiple = sub_args.get_flag("allow-multiple"); - - let mode = match (lsn, hot_standby) { + let mode = match (args.lsn, args.hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, (None, false) => ComputeMode::Primary, 
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - match (mode, hot_standby) { + match (mode, args.hot_standby) { (ComputeMode::Static(_), true) => { bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") } @@ -821,7 +1315,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - if !allow_multiple { + if !args.allow_multiple { cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; } @@ -829,34 +1323,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &endpoint_id, tenant_id, timeline_id, - pg_port, - http_port, - pg_version, + args.pg_port, + args.http_port, + args.pg_version, mode, - !update_catalog, + !args.update_catalog, )?; } - "start" => { - let endpoint_id = sub_args - .get_one::("endpoint_id") - .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; - - let pageserver_id = - if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - Some(NodeId( - id_str.parse().context("while parsing pageserver id")?, - )) - } else { - None - }; - - let remote_ext_config = sub_args.get_one::("remote-ext-config"); - - let allow_multiple = sub_args.get_flag("allow-multiple"); + EndpointCmd::Start(args) => { + let endpoint_id = &args.endpoint_id; + let pageserver_id = args.endpoint_pageserver_id; + let remote_ext_config = &args.remote_ext_config; // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. - let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? { + let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? 
{ safekeepers } else { env.safekeepers.iter().map(|sk| sk.id).collect() @@ -867,12 +1348,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get(endpoint_id.as_str()) .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; - let create_test_user = sub_args - .get_one::("create-test-user") - .cloned() - .unwrap_or_default(); - - if !allow_multiple { + if !args.allow_multiple { cplane.check_conflicting_endpoints( endpoint.mode, endpoint.tenant_id, @@ -894,17 +1370,27 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re // to pass these on to postgres. let storage_controller = StorageController::from_env(env); let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; - let pageservers = locate_result - .shards - .into_iter() - .map(|shard| { - ( + let pageservers = futures::future::try_join_all( + locate_result.shards.into_iter().map(|shard| async move { + if let ComputeMode::Static(lsn) = endpoint.mode { + // Initialize LSN leases for static computes. 
+ let conf = env.get_pageserver_conf(shard.node_id).unwrap(); + let pageserver = PageServerNode::from_env(env, conf); + + pageserver + .http_client + .timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn) + .await?; + } + + anyhow::Ok(( Host::parse(&shard.listen_pg_addr) .expect("Storage controller reported bad hostname"), shard.listen_pg_port, - ) - }) - .collect::>(); + )) + }), + ) + .await?; let stripe_size = locate_result.shard_params.stripe_size; (pageservers, stripe_size) @@ -926,72 +1412,61 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &auth_token, safekeepers, pageservers, - remote_ext_config, + remote_ext_config.as_ref(), stripe_size.0 as usize, - create_test_user, + args.create_test_user, ) .await?; } - "reconfigure" => { - let endpoint_id = sub_args - .get_one::("endpoint_id") - .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?; + EndpointCmd::Reconfigure(args) => { + let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let pageservers = - if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?); - let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?); - vec![( - pageserver.pg_connection_config.host().clone(), - pageserver.pg_connection_config.port(), - )] - } else { - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_locate(endpoint.tenant_id) - .await? 
- .shards - .into_iter() - .map(|shard| { - ( - Host::parse(&shard.listen_pg_addr) - .expect("Storage controller reported malformed host"), - shard.listen_pg_port, - ) - }) - .collect::>() - }; + let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id { + let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?); + vec![( + pageserver.pg_connection_config.host().clone(), + pageserver.pg_connection_config.port(), + )] + } else { + let storage_controller = StorageController::from_env(env); + storage_controller + .tenant_locate(endpoint.tenant_id) + .await? + .shards + .into_iter() + .map(|shard| { + ( + Host::parse(&shard.listen_pg_addr) + .expect("Storage controller reported malformed host"), + shard.listen_pg_port, + ) + }) + .collect::>() + }; // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. - let safekeepers = parse_safekeepers(sub_args)?; + let safekeepers = parse_safekeepers(&args.safekeepers)?; endpoint.reconfigure(pageservers, None, safekeepers).await?; } - "stop" => { - let endpoint_id = sub_args - .get_one::("endpoint_id") - .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; - let destroy = sub_args.get_flag("destroy"); - let mode = sub_args.get_one::("mode").expect("has a default"); - + EndpointCmd::Stop(args) => { + let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints - .get(endpoint_id.as_str()) + .get(endpoint_id) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(mode, destroy)?; + endpoint.stop(&args.mode, args.destroy)?; } - - _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), } Ok(()) } /// Parse --safekeepers as list of safekeeper ids. 
-fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { +fn parse_safekeepers(safekeepers_str: &Option) -> Result>> { + if let Some(safekeepers_str) = safekeepers_str { let mut safekeepers: Vec = Vec::new(); for sk_id in safekeepers_str.split(',').map(str::trim) { let sk_id = NodeId( @@ -1006,44 +1481,25 @@ fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { } } -fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(ep_subcommand_data) => ep_subcommand_data, - None => bail!("no mappings subcommand provided"), - }; - - match sub_name { - "map" => { - let branch_name = sub_args - .get_one::("branch-name") - .expect("branch-name argument missing"); - - let tenant_id = sub_args - .get_one::("tenant-id") - .map(|x| TenantId::from_str(x)) - .expect("tenant-id argument missing") - .expect("malformed tenant-id arg"); - - let timeline_id = sub_args - .get_one::("timeline-id") - .map(|x| TimelineId::from_str(x)) - .expect("timeline-id argument missing") - .expect("malformed timeline-id arg"); - - env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?; +fn handle_mappings(subcmd: &MappingsCmd, env: &mut local_env::LocalEnv) -> Result<()> { + match subcmd { + MappingsCmd::Map(args) => { + env.register_branch_mapping( + args.branch_name.to_owned(), + args.tenant_id, + args.timeline_id, + )?; Ok(()) } - other => unimplemented!("mappings subcommand {other}"), } } -fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result { - let node_id = if let Some(id_str) = args.get_one::("pageserver-id") { - NodeId(id_str.parse().context("while parsing pageserver id")?) 
- } else { - DEFAULT_PAGESERVER_ID - }; +fn get_pageserver( + env: &local_env::LocalEnv, + pageserver_id_arg: Option, +) -> Result { + let node_id = pageserver_id_arg.unwrap_or(DEFAULT_PAGESERVER_ID); Ok(PageServerNode::from_env( env, @@ -1051,48 +1507,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result &Duration { - let humantime_duration = args - .get_one::("start-timeout") - .expect("invalid value for start-timeout"); - humantime_duration.as_ref() -} - -fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs { - let maybe_instance_id = args.get_one::("instance-id"); - - let base_port = args.get_one::("base-port"); - - if maybe_instance_id.is_some() && base_port.is_none() { - panic!("storage-controller start specificied instance-id but did not provide base-port"); - } - - let start_timeout = args - .get_one::("start-timeout") - .expect("invalid value for start-timeout"); - - NeonStorageControllerStartArgs { - instance_id: maybe_instance_id.copied().unwrap_or(1), - base_port: base_port.copied(), - start_timeout: *start_timeout, - } -} - -fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs { - let maybe_instance_id = args.get_one::("instance-id"); - let immediate = args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - - NeonStorageControllerStopArgs { - instance_id: maybe_instance_id.copied().unwrap_or(1), - immediate, - } -} - -async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - match sub_match.subcommand() { - Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)? - .start(get_start_timeout(subcommand_args)) +async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> Result<()> { + match subcmd { + PageserverCmd::Start(args) => { + if let Err(e) = get_pageserver(env, args.pageserver_id)? 
+ .start(&args.start_timeout) .await { eprintln!("pageserver start failed: {e}"); @@ -1100,34 +1519,36 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } - Some(("stop", subcommand_args)) => { - let immediate = subcommand_args - .get_one::("stop-mode") - .map(|s| s.as_str()) - == Some("immediate"); - - if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) { + PageserverCmd::Stop(args) => { + let immediate = match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; + if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) { eprintln!("pageserver stop failed: {}", e); exit(1); } } - Some(("restart", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; + PageserverCmd::Restart(args) => { + let pageserver = get_pageserver(env, args.pageserver_id)?; //TODO what shutdown strategy should we use here? if let Err(e) = pageserver.stop(false) { eprintln!("pageserver stop failed: {}", e); exit(1); } - if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await { + if let Err(e) = pageserver.start(&args.start_timeout).await { eprintln!("pageserver start failed: {e}"); exit(1); } } - Some(("status", subcommand_args)) => { - match get_pageserver(env, subcommand_args)?.check_status().await { + PageserverCmd::Status(args) => { + match get_pageserver(env, args.pageserver_id)? 
+ .check_status() + .await + { Ok(_) => println!("Page server is up and running"), Err(err) => { eprintln!("Page server is not available: {}", err); @@ -1135,34 +1556,42 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } } - - Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), - None => bail!("no pageserver subcommand provided"), } Ok(()) } async fn handle_storage_controller( - sub_match: &ArgMatches, + subcmd: &StorageControllerCmd, env: &local_env::LocalEnv, ) -> Result<()> { let svc = StorageController::from_env(env); - match sub_match.subcommand() { - Some(("start", start_match)) => { - if let Err(e) = svc.start(storage_controller_start_args(start_match)).await { + match subcmd { + StorageControllerCmd::Start(args) => { + let start_args = NeonStorageControllerStartArgs { + instance_id: args.instance_id, + base_port: args.base_port, + start_timeout: args.start_timeout, + }; + + if let Err(e) = svc.start(start_args).await { eprintln!("start failed: {e}"); exit(1); } } - Some(("stop", stop_match)) => { - if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await { + StorageControllerCmd::Stop(args) => { + let stop_args = NeonStorageControllerStopArgs { + instance_id: args.instance_id, + immediate: match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }, + }; + if let Err(e) = svc.stop(stop_args).await { eprintln!("stop failed: {}", e); exit(1); } } - Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name), - None => bail!("no storage_controller subcommand provided"), } Ok(()) } @@ -1175,111 +1604,77 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result Vec { - init_match - .get_many::("safekeeper-extra-opt") - .into_iter() - .flatten() - .map(|s| s.to_owned()) - .collect() -} +async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> Result<()> { + match subcmd { + 
SafekeeperCmd::Start(args) => { + let safekeeper = get_safekeeper(env, args.id)?; -async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(safekeeper_command_data) => safekeeper_command_data, - None => bail!("no safekeeper subcommand provided"), - }; - - // All the commands take an optional safekeeper name argument - let sk_id = if let Some(id_str) = sub_args.get_one::("id") { - NodeId(id_str.parse().context("while parsing safekeeper id")?) - } else { - DEFAULT_SAFEKEEPER_ID - }; - let safekeeper = get_safekeeper(env, sk_id)?; - - match sub_name { - "start" => { - let extra_opts = safekeeper_extra_opts(sub_args); - - if let Err(e) = safekeeper - .start(extra_opts, get_start_timeout(sub_args)) - .await - { + if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { eprintln!("safekeeper start failed: {}", e); exit(1); } } - "stop" => { - let immediate = - sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - + SafekeeperCmd::Stop(args) => { + let safekeeper = get_safekeeper(env, args.id)?; + let immediate = match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {}", e); exit(1); } } - "restart" => { - let immediate = - sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + SafekeeperCmd::Restart(args) => { + let safekeeper = get_safekeeper(env, args.id)?; + let immediate = match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {}", e); exit(1); } - let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper - .start(extra_opts, get_start_timeout(sub_args)) - .await - { + if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { 
eprintln!("safekeeper start failed: {}", e); exit(1); } } - - _ => { - bail!("Unexpected safekeeper subcommand '{}'", sub_name) - } } Ok(()) } -async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(broker_command_data) => broker_command_data, - None => bail!("no broker subcommand provided"), - }; - - match sub_name { - "start" => { - if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await { +async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> { + match subcmd { + StorageBrokerCmd::Start(args) => { + if let Err(e) = broker::start_broker_process(env, &args.start_timeout).await { eprintln!("broker start failed: {e}"); exit(1); } } - "stop" => { + StorageBrokerCmd::Stop(_args) => { + // FIXME: stop_mode unused if let Err(e) = broker::stop_broker_process(env) { eprintln!("broker stop failed: {e}"); exit(1); } } - - _ => bail!("Unexpected broker subcommand '{}'", sub_name), } Ok(()) } async fn handle_start_all( + args: &StartCmdArgs, env: &'static local_env::LocalEnv, - retry_timeout: &Duration, ) -> anyhow::Result<()> { - let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else { - neon_start_status_check(env, retry_timeout) + // FIXME: this was called "retry_timeout", is it right? + let Err(errors) = handle_start_all_impl(env, args.timeout).await else { + neon_start_status_check(env, args.timeout.as_ref()) .await .context("status check after successful startup of all services")?; return Ok(()); @@ -1304,7 +1699,7 @@ async fn handle_start_all( /// Otherwise, returns the list of errors that occurred during startup. 
async fn handle_start_all_impl( env: &'static local_env::LocalEnv, - retry_timeout: Duration, + retry_timeout: humantime::Duration, ) -> Result<(), Vec> { // Endpoints are not started automatically @@ -1324,7 +1719,7 @@ async fn handle_start_all_impl( let storage_controller = StorageController::from_env(env); storage_controller .start(NeonStorageControllerStartArgs::with_default_instance_id( - retry_timeout.into(), + retry_timeout, )) .await .map_err(|e| e.context("start storage_controller")) @@ -1345,7 +1740,7 @@ async fn handle_start_all_impl( js.spawn(async move { let safekeeper = SafekeeperNode::from_env(env, node); safekeeper - .start(vec![], &retry_timeout) + .start(&[], &retry_timeout) .await .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) }); @@ -1425,9 +1820,11 @@ async fn neon_start_status_check( anyhow::bail!("\nNeon passed status check") } -async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let immediate = - sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); +async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Result<()> { + let immediate = match args.mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; try_stop_all(env, immediate).await; @@ -1485,400 +1882,3 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } } } - -fn cli() -> Command { - let timeout_arg = Arg::new("start-timeout") - .long("start-timeout") - .short('t') - .global(true) - .help("timeout until we fail the command, e.g. 
30s") - .value_parser(value_parser!(humantime::Duration)) - .default_value("10s") - .required(false); - - let branch_name_arg = Arg::new("branch-name") - .long("branch-name") - .help("Name of the branch to be created or used as an alias for other services") - .required(false); - - let endpoint_id_arg = Arg::new("endpoint_id") - .help("Postgres endpoint id") - .required(false); - - let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); - - // --id, when using a pageserver command - let pageserver_id_arg = Arg::new("pageserver-id") - .long("id") - .global(true) - .help("pageserver id") - .required(false); - // --pageserver-id when using a non-pageserver command - let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id") - .long("pageserver-id") - .required(false); - - let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt") - .short('e') - .long("safekeeper-extra-opt") - .num_args(1) - .action(ArgAction::Append) - .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo") - .required(false); - - let tenant_id_arg = Arg::new("tenant-id") - .long("tenant-id") - .help("Tenant id. Represented as a hexadecimal string 32 symbols length") - .required(false); - - let timeline_id_arg = Arg::new("timeline-id") - .long("timeline-id") - .help("Timeline id. 
Represented as a hexadecimal string 32 symbols length") - .required(false); - - let pg_version_arg = Arg::new("pg-version") - .long("pg-version") - .help("Postgres version to use for the initial tenant") - .required(false) - .value_parser(value_parser!(u32)) - .default_value(DEFAULT_PG_VERSION); - - let pg_port_arg = Arg::new("pg-port") - .long("pg-port") - .required(false) - .value_parser(value_parser!(u16)) - .value_name("pg-port"); - - let http_port_arg = Arg::new("http-port") - .long("http-port") - .required(false) - .value_parser(value_parser!(u16)) - .value_name("http-port"); - - let safekeepers_arg = Arg::new("safekeepers") - .long("safekeepers") - .required(false) - .value_name("safekeepers"); - - let stop_mode_arg = Arg::new("stop-mode") - .short('m') - .value_parser(["fast", "immediate"]) - .default_value("fast") - .help("If 'immediate', don't flush repository data at shutdown") - .required(false) - .value_name("stop-mode"); - - let remote_ext_config_args = Arg::new("remote-ext-config") - .long("remote-ext-config") - .num_args(1) - .help("Configure the remote extensions storage proxy gateway to request for extensions.") - .required(false); - - let lsn_arg = Arg::new("lsn") - .long("lsn") - .help("Specify Lsn on the timeline to start from. 
By default, end of the timeline would be used.") - .required(false); - - let hot_standby_arg = Arg::new("hot-standby") - .value_parser(value_parser!(bool)) - .long("hot-standby") - .help("If set, the node will be a hot replica on the specified timeline") - .required(false); - - let force_arg = Arg::new("force") - .value_parser(value_parser!(InitForceMode)) - .long("force") - .default_value( - InitForceMode::MustNotExist - .to_possible_value() - .unwrap() - .get_name() - .to_owned(), - ) - .help("Force initialization even if the repository is not empty") - .required(false); - - let num_pageservers_arg = Arg::new("num-pageservers") - .value_parser(value_parser!(u16)) - .long("num-pageservers") - .help("How many pageservers to create (default 1)"); - - let update_catalog = Arg::new("update-catalog") - .value_parser(value_parser!(bool)) - .long("update-catalog") - .help("If set, will set up the catalog for neon_superuser") - .required(false); - - let create_test_user = Arg::new("create-test-user") - .value_parser(value_parser!(bool)) - .long("create-test-user") - .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") - .required(false); - - let allow_multiple = Arg::new("allow-multiple") - .help("Allow multiple primary endpoints running on the same branch. 
Shouldn't be used normally, but useful for tests.") - .long("allow-multiple") - .action(ArgAction::SetTrue) - .required(false); - - let instance_id = Arg::new("instance-id") - .long("instance-id") - .help("Identifier used to distinguish storage controller instances (default 1)") - .value_parser(value_parser!(u8)) - .required(false); - - let base_port = Arg::new("base-port") - .long("base-port") - .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)") - .value_parser(value_parser!(u16)) - .required(false); - - Command::new("Neon CLI") - .arg_required_else_help(true) - .version(GIT_VERSION) - .subcommand( - Command::new("init") - .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(num_pageservers_arg.clone()) - .arg( - Arg::new("config") - .long("config") - .required(false) - .value_parser(value_parser!(PathBuf)) - .value_name("config") - ) - .arg(force_arg) - ) - .subcommand( - Command::new("timeline") - .about("Manage timelines") - .arg_required_else_help(true) - .subcommand(Command::new("list") - .about("List all timelines, available to this pageserver") - .arg(tenant_id_arg.clone())) - .subcommand(Command::new("branch") - .about("Create a new timeline, using another timeline as a base, copying its data") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") - .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. 
The timeline gets resolved by its branch name.").required(false)) - .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn") - .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) - .subcommand(Command::new("create") - .about("Create a new blank timeline") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(pg_version_arg.clone()) - ) - .subcommand(Command::new("import") - .about("Import timeline from basebackup directory") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(Arg::new("base-tarfile") - .long("base-tarfile") - .value_parser(value_parser!(PathBuf)) - .help("Basebackup tarfile to import") - ) - .arg(Arg::new("base-lsn").long("base-lsn") - .help("Lsn the basebackup starts at")) - .arg(Arg::new("wal-tarfile") - .long("wal-tarfile") - .value_parser(value_parser!(PathBuf)) - .help("Wal to add after base") - ) - .arg(Arg::new("end-lsn").long("end-lsn") - .help("Lsn the basebackup ends at")) - .arg(pg_version_arg.clone()) - ) - ).subcommand( - Command::new("tenant") - .arg_required_else_help(true) - .about("Manage tenants") - .subcommand(Command::new("list")) - .subcommand(Command::new("create") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) - .arg(pg_version_arg.clone()) - .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false) - .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) - .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) - 
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) - .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant")) - ) - .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) - .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) - .subcommand(Command::new("config") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) - .about("Import a tenant that is present in remote storage, and create branches for its timelines")) - ) - .subcommand( - Command::new("pageserver") - .arg_required_else_help(true) - .about("Manage pageserver") - .arg(pageserver_id_arg) - .subcommand(Command::new("status")) - .subcommand(Command::new("start") - .about("Start local pageserver") - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("stop") - .about("Stop local pageserver") - .arg(stop_mode_arg.clone()) - ) - .subcommand(Command::new("restart") - .about("Restart local pageserver") - .arg(timeout_arg.clone()) - ) - ) - .subcommand( - Command::new("storage_controller") - .arg_required_else_help(true) - .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start storage controller") - .arg(timeout_arg.clone()) - .arg(instance_id.clone()) - .arg(base_port)) - .subcommand(Command::new("stop").about("Stop storage controller") - .arg(stop_mode_arg.clone()) - .arg(instance_id)) - ) - .subcommand( - Command::new("storage_broker") - .arg_required_else_help(true) - .about("Manage broker") - .subcommand(Command::new("start") - .about("Start broker") - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("stop") - .about("Stop 
broker") - .arg(stop_mode_arg.clone()) - ) - ) - .subcommand( - Command::new("safekeeper") - .arg_required_else_help(true) - .about("Manage safekeepers") - .subcommand(Command::new("start") - .about("Start local safekeeper") - .arg(safekeeper_id_arg.clone()) - .arg(safekeeper_extra_opt_arg.clone()) - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("stop") - .about("Stop local safekeeper") - .arg(safekeeper_id_arg.clone()) - .arg(stop_mode_arg.clone()) - ) - .subcommand(Command::new("restart") - .about("Restart local safekeeper") - .arg(safekeeper_id_arg) - .arg(stop_mode_arg.clone()) - .arg(safekeeper_extra_opt_arg) - .arg(timeout_arg.clone()) - ) - ) - .subcommand( - Command::new("endpoint") - .arg_required_else_help(true) - .about("Manage postgres instances") - .subcommand(Command::new("list").arg(tenant_id_arg.clone())) - .subcommand(Command::new("create") - .about("Create a compute endpoint") - .arg(endpoint_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(lsn_arg.clone()) - .arg(pg_port_arg.clone()) - .arg(http_port_arg.clone()) - .arg(endpoint_pageserver_id_arg.clone()) - .arg( - Arg::new("config-only") - .help("Don't do basebackup, create endpoint directory with only config files") - .long("config-only") - .required(false)) - .arg(pg_version_arg.clone()) - .arg(hot_standby_arg.clone()) - .arg(update_catalog) - .arg(allow_multiple.clone()) - ) - .subcommand(Command::new("start") - .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") - .arg(endpoint_id_arg.clone()) - .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg.clone()) - .arg(remote_ext_config_args) - .arg(create_test_user) - .arg(allow_multiple.clone()) - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("reconfigure") - .about("Reconfigure the endpoint") - .arg(endpoint_pageserver_id_arg) - .arg(safekeepers_arg) - .arg(endpoint_id_arg.clone()) - .arg(tenant_id_arg.clone()) - ) - .subcommand( - 
Command::new("stop") - .arg(endpoint_id_arg) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - .action(ArgAction::SetTrue) - .required(false) - ) - .arg( - Arg::new("mode") - .help("Postgres shutdown mode, passed to \"pg_ctl -m \"") - .long("mode") - .action(ArgAction::Set) - .required(false) - .value_parser(["smart", "fast", "immediate"]) - .default_value("fast") - ) - ) - - ) - .subcommand( - Command::new("mappings") - .arg_required_else_help(true) - .about("Manage neon_local branch name mappings") - .subcommand( - Command::new("map") - .about("Create new mapping which cannot exist already") - .arg(branch_name_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - ) - ) - // Obsolete old name for 'endpoint'. We now just print an error if it's used. - .subcommand( - Command::new("pg") - .hide(true) - .arg(Arg::new("ignore-rest").allow_hyphen_values(true).num_args(0..).required(false)) - .trailing_var_arg(true) - ) - .subcommand( - Command::new("start") - .about("Start page server and safekeepers") - .arg(timeout_arg.clone()) - ) - .subcommand( - Command::new("stop") - .about("Stop page server and safekeepers") - .arg(stop_mode_arg) - ) -} - -#[test] -fn verify_cli() { - cli().debug_assert(); -} diff --git a/control_plane/src/branch_mappings.rs b/control_plane/src/branch_mappings.rs new file mode 100644 index 0000000000..e89313df39 --- /dev/null +++ b/control_plane/src/branch_mappings.rs @@ -0,0 +1,94 @@ +//! Branch mappings for convenience + +use std::collections::HashMap; +use std::fs; +use std::path::Path; + +use anyhow::{bail, Context}; +use serde::{Deserialize, Serialize}; + +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user. 
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct BranchMappings { + /// Default tenant ID to use with the 'neon_local' command line utility, when + /// --tenant_id is not explicitly specified. This comes from the branches. + pub default_tenant_id: Option, + + // A `HashMap>` would be more appropriate here, + // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. + // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub mappings: HashMap>, +} + +impl BranchMappings { + pub fn register_branch_mapping( + &mut self, + branch_name: String, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let existing_values = self.mappings.entry(branch_name.clone()).or_default(); + + let existing_ids = existing_values + .iter() + .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); + + if let Some((_, old_timeline_id)) = existing_ids { + if old_timeline_id == &timeline_id { + Ok(()) + } else { + bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); + } + } else { + existing_values.push((tenant_id, timeline_id)); + Ok(()) + } + } + + pub fn get_branch_timeline_id( + &self, + branch_name: &str, + tenant_id: TenantId, + ) -> Option { + // If it looks like a timeline ID, return it as it is + if let Ok(timeline_id) = branch_name.parse::() { + return Some(timeline_id); + } + + self.mappings + .get(branch_name)? 
+ .iter() + .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) + .map(|&(_, timeline_id)| timeline_id) + .map(TimelineId::from) + } + + pub fn timeline_name_mappings(&self) -> HashMap { + self.mappings + .iter() + .flat_map(|(name, tenant_timelines)| { + tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { + (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) + }) + }) + .collect() + } + + pub fn persist(&self, path: &Path) -> anyhow::Result<()> { + let content = &toml::to_string_pretty(self)?; + fs::write(path, content).with_context(|| { + format!( + "Failed to write branch information into path '{}'", + path.display() + ) + }) + } + + pub fn load(path: &Path) -> anyhow::Result { + let branches_file_contents = fs::read_to_string(path)?; + Ok(toml::from_str(branches_file_contents.as_str())?) + } +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7554a03a68..18f396b886 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -561,6 +561,7 @@ impl Endpoint { operation_uuid: None, features: self.features.clone(), swap_size_bytes: None, + disk_quota_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d616154af6..9dc2a0c36b 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -168,6 +168,9 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub heartbeat_interval: Duration, + + #[serde(with = "humantime_serde")] + pub long_reconcile_threshold: Option, } impl NeonStorageControllerConf { @@ -190,6 +193,7 @@ impl Default for NeonStorageControllerConf { split_threshold: None, max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, + long_reconcile_threshold: None, } } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 
573f1688d5..7a019bce88 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -113,7 +113,7 @@ impl SafekeeperNode { pub async fn start( &self, - extra_opts: Vec, + extra_opts: &[String], retry_timeout: &Duration, ) -> anyhow::Result<()> { print!( @@ -196,7 +196,7 @@ impl SafekeeperNode { ]); } - args.extend(extra_opts); + args.extend_from_slice(extra_opts); background_process::start_process( &format!("safekeeper-{id}"), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 0c0e67dff0..6d07c43af0 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -347,7 +347,7 @@ impl StorageController { if !tokio::fs::try_exists(&pg_data_path).await? { let initdb_args = [ - "-D", + "--pgdata", pg_data_path.as_ref(), "--username", &username(), @@ -517,6 +517,13 @@ impl StorageController { args.push(format!("--max-secondary-lag-bytes={lag}")) } + if let Some(threshold) = self.config.long_reconcile_threshold { + args.push(format!( + "--long-reconcile-threshold={}", + humantime::Duration::from(threshold) + )) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() diff --git a/docs/rfcs/038-aux-file-v2.md b/docs/rfcs/038-aux-file-v2.md new file mode 100644 index 0000000000..9c3c336008 --- /dev/null +++ b/docs/rfcs/038-aux-file-v2.md @@ -0,0 +1,112 @@ +# AUX file v2 + +## Summary + +This is a retrospective RFC describing a new storage strategy for AUX files. + +## Motivation + +The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`. +Every time the compute node streams a `neon-file` record to the pageserver, it will +update the aux file hash map, and then write the serialized hash map into the key. +This creates serious space bloat. There was a fix to log delta records (i.e., update +a key in the hash map) to the aux file key. In this way, the pageserver only stores +the deltas at each of the LSNs. 
However, this improved v1 storage strategy still +requires us to store everything in an aux file cache in memory, because we cannot +fetch a single key (or file) from the compound `AUX_FILES_KEY`. + +### Prior art + +For storing a large number of small files, we can use a key-value store where the key +is the filename and the value is the file content. + +## Requirements + +- No space bloat, fixed space amplification. +- No write bloat, fixed write amplification. + +## Impacted Components + +pageserver + +## Sparse Keyspace + +In pageserver, we had assumed the keyspaces are always contiguous. For example, if the keyspace 0x0000-0xFFFF +exists in the pageserver, every single key in the key range would exist in the storage. Based on the prior +assumption, there is code that traverses the keyspace by iterating every single key. + +```rust +loop { + // do something + key = key.next(); +} +``` + +If a keyspace is very large, for example, containing `2^64` keys, this loop will take infinite time to run. +Therefore, we introduce the concept of sparse keyspace in this RFC. For a sparse keyspace, not every key would +exist in the key range. Developers should not attempt to iterate every single key in the keyspace. Instead, +they should fetch all the layer files in the key range, and then do a merge of them. + +In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`. + +## AUX v2 Keyspace and Key Mapping + +Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the +keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash +of the filename for the remaining bits of the key. The encoding scheme is defined in `encode_aux_file_key`.
+ +For example, `pg_logical/mappings/test1` will be encoded as: + +``` +62 0000 01 01 7F8B83D94F7081693471ABF91C +^ aux prefix + ^ assigned prefix of pg_logical/ + ^ assigned prefix of mappings/ + ^ 13B FNV hash of test1 + ^ not used due to key representation +``` + +The prefixes of the directories should be assigned every time we add a new type of aux file into the storage within `aux_file.rs`. Files in directories without an assigned prefix are put into the `0xFFFF` keyspace. + +Note that inside pageserver, there are two representations of the keys: the 18B full key representation +and the 16B compact key representation. For the 18B representation, some fields have restricted ranges +of values. Therefore, the aux keys only use the 16B compact portion of the full key. + +It is possible that two files get mapped to the same key due to hash collision. Therefore, the value of +each aux key is an array that contains all filenames and file content that should be stored in +this key. + +We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before, +and we do not need additional code to support reconstructing the value. We simply get the latest image from +the storage. + +## Inbound Logical Replication Key Mapping + +For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store the data. +This file is not directly stored in the pageserver using the aux v2 mechanism. It is constructed while +generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace. + +## Sparse Keyspace Read Path + +There are two places we need to read the aux files from the pageserver: + +* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that. +* We use the vectored get API to retrieve all aux files while generating the basebackup.
Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectored get API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger a missing-key error. + +## Compaction and Image Layer Generation + +With the addition of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage. + +* L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes. +* Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. Image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated for sparse keyspaces. The created image layer always covers the full aux key range for now, and could be optimized later. + +## Migration + +We decided not to make the new aux storage strategy (v2) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while keeping old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore, we need to do manual migrations for users using aux v1 by using the [migration script](https://github.com/neondatabase/aux_v2_migration). + +During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached +with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2). + +If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for it. Otherwise, the default aux file policy for new timelines is aux v2.
Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. Users who tried setting up inbound replication (which was not supported at that time) may also have created some file entries in the aux v1 store, even if they did not enroll in the logical replication testing program. + +The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For all these projects, it puts the computes into maintenance mode (suspending all of them), calls the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restarts all the computes. diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 883c624f71..83515a00a0 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -50,6 +50,16 @@ pub struct ComputeSpec { #[serde(default)] pub swap_size_bytes: Option, + /// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs + /// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the + /// spec is first received. + /// + /// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of disk quota. + #[serde(default)] + pub disk_quota_bytes: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, @@ -268,6 +278,22 @@ pub struct GenericOption { /// declare a `trait` on it. pub type GenericOptions = Option>; +/// Configures the local-proxy application with the relevant JWKS and roles it should +/// use for authorizing connect requests using JWT.
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct LocalProxySpec { + pub jwks: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct JwksSettings { + pub id: String, + pub role_names: Vec, + pub jwks_url: String, + pub provider_name: String, + pub jwt_audience: Option, +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index e274d24585..085540e7b9 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String { } fn log_query_error(query: &str, e: &QueryError) { + // If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`. match e { QueryError::Disconnected(ConnectionError::Io(io_error)) => { if is_expected_io_error(io_error) { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 5c0abda522..9524a5149b 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -93,9 +93,9 @@ impl Conf { ); let output = self .new_pg_command("initdb")? 
- .arg("-D") + .arg("--pgdata") .arg(&self.datadir) - .args(["-U", "postgres", "--no-instructions", "--no-sync"]) + .args(["--username", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); ensure!( diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 05eb538d42..66f21cd1ef 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -6,12 +6,14 @@ license.workspace = true [dependencies] hyper.workspace = true -opentelemetry = { workspace = true, features=["rt-tokio"] } -opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry = { workspace = true, features = ["trace"] } +opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true [dev-dependencies] tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 9cf2495771..c4aad53cdb 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -10,7 +10,6 @@ //! //! ```rust,no_run //! use tracing_subscriber::prelude::*; -//! use tracing_opentelemetry::OpenTelemetryLayer; //! //! #[tokio::main] //! async fn main() { @@ -22,7 +21,7 @@ //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces -//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! let otlp_layer = tracing_utils::init_tracing("my_application").await; //! //! // Put it all together //! 
tracing_subscriber::registry() @@ -35,15 +34,15 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use opentelemetry::sdk::Resource; -use opentelemetry::KeyValue; -use opentelemetry_otlp::WithExportConfig; -use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; - -pub use tracing_opentelemetry::OpenTelemetryLayer; - pub mod http; +use opentelemetry::trace::TracerProvider; +use opentelemetry::KeyValue; +use opentelemetry_sdk::Resource; +use tracing::Subscriber; +use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::Layer; + /// Set up OpenTelemetry exporter, using configuration from environment variables. /// /// `service_name` is set as the OpenTelemetry 'service.name' resource (see @@ -71,7 +70,10 @@ pub mod http; /// /// This doesn't block, but is marked as 'async' to hint that this must be called in /// asynchronous execution context. -pub async fn init_tracing(service_name: &str) -> Option { +pub async fn init_tracing(service_name: &str) -> Option> +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; @@ -80,9 +82,10 @@ pub async fn init_tracing(service_name: &str) -> Option Option { +pub fn init_tracing_without_runtime(service_name: &str) -> Option> +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; @@ -113,54 +116,36 @@ pub fn init_tracing_without_runtime( Some(init_tracing_internal(service_name.to_string())) } -fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { - // Set up exporter from the OTEL_EXPORTER_* environment variables - let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); +fn init_tracing_internal(service_name: String) -> impl Layer +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ + // Sets up exporter from the OTEL_EXPORTER_* environment 
variables. + let exporter = opentelemetry_otlp::new_exporter().http(); - // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the - // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the - // OpenTelemetry spec at - // , - // the full exporter URL is formed by appending "/v1/traces" to the value - // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does - // that with the grpc-tonic exporter. Other exporters, like the HTTP - // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without - // appending "/v1/traces". - // - // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 - // - // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting - // the endpoint url with the "/v1/traces" path ourselves. If the bug is - // fixed in a later version, we can remove this code. But if we don't - // remember to remove this, it won't do any harm either, as the crate will - // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint - // is set directly with `with_endpoint`. - if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { - if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { - if !endpoint.ends_with('/') { - endpoint.push('/'); - } - endpoint.push_str("v1/traces"); - exporter = exporter.with_endpoint(endpoint); - } - } + // TODO: opentelemetry::global::set_error_handler() with custom handler that + // bypasses default tracing layers, but logs regular looking log + // messages. // Propagate trace information in the standard W3C TraceContext format. 
opentelemetry::global::set_text_map_propagator( - opentelemetry::sdk::propagation::TraceContextPropagator::new(), + opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - opentelemetry_otlp::new_pipeline() + let tracer = opentelemetry_otlp::new_pipeline() .tracing() .with_exporter(exporter) - .with_trace_config( - opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( + Resource::new(vec![KeyValue::new( opentelemetry_semantic_conventions::resource::SERVICE_NAME, service_name, - )])), - ) - .install_batch(opentelemetry::runtime::Tokio) + )]), + )) + .install_batch(opentelemetry_sdk::runtime::Tokio) .expect("could not initialize opentelemetry exporter") + .tracer("global"); + + tracing_opentelemetry::layer().with_tracer(tracer) } // Shutdown trace pipeline gracefully, so that it has a chance to send any diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 2d95ac42e6..592f1ded0d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -736,4 +736,22 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn timeline_init_lsn_lease( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease", + self.mgmt_api_endpoint, + ); + + self.request(Method::POST, &uri, LsnLeaseRequest { lsn }) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d15a0e47a4..e9e52acee6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::config::PageserverIdentity; -use pageserver::control_plane_client::ControlPlaneClient; +use pageserver::controller_upcall_client::ControllerUpcallClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; @@ -396,7 +396,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - ControlPlaneClient::new(conf, &shutdown_pageserver), + ControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); if let Some(deletion_workers) = deletion_workers { diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/controller_upcall_client.rs similarity index 80% rename from pageserver/src/control_plane_client.rs rename to pageserver/src/controller_upcall_client.rs index d0a967b920..73fc6dc3ab 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; use pageserver_api::config::NodeMetadata; -/// The Pageserver's client for using the control plane API: this is a small subset -/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) -pub struct ControlPlaneClient { +/// The Pageserver's client for using the storage controller upcall API: this is a small API +/// for 
dealing with generations (see docs/rfcs/025-generation-numbers.md). +/// +/// The server presenting this API may either be the storage controller or some other +/// service (such as the Neon control plane) providing a store of generation numbers. +pub struct ControllerUpcallClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, @@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi { ) -> impl Future, RetryForeverError>> + Send; } -impl ControlPlaneClient { +impl ControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { @@ -114,7 +117,7 @@ impl ControlPlaneClient { } } -impl ControlPlaneGenerationsApi for ControlPlaneClient { +impl ControlPlaneGenerationsApi for ControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down async fn re_attach( &self, @@ -216,29 +219,38 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .join("validate") .expect("Failed to build validate path"); - let request = ValidateRequest { - tenants: tenants - .into_iter() - .map(|(id, gen)| ValidateRequestTenant { - id, - gen: gen - .into() - .expect("Generation should always be valid for a Tenant doing deletions"), - }) - .collect(), - }; + // When sending validate requests, break them up into chunks so that we + // avoid possible edge cases of generating any HTTP requests that + // require database I/O across many thousands of tenants. 
+ let mut result: HashMap = HashMap::with_capacity(tenants.len()); + for tenant_chunk in (tenants).chunks(128) { + let request = ValidateRequest { + tenants: tenant_chunk + .iter() + .map(|(id, generation)| ValidateRequestTenant { + id: *id, + gen: (*generation).into().expect( + "Generation should always be valid for a Tenant doing deletions", + ), + }) + .collect(), + }; - failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); - if self.cancel.is_cancelled() { - return Err(RetryForeverError::ShuttingDown); + failpoint_support::sleep_millis_async!( + "control-plane-client-validate-sleep", + &self.cancel + ); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } + + let response: ValidateResponse = + self.retry_http_forever(&re_attach_path, request).await?; + for rt in response.tenants { + result.insert(rt.id, rt.valid); + } } - let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; - - Ok(response - .tenants - .into_iter() - .map(|rt| (rt.id, rt.valid)) - .collect()) + Ok(result.into_iter().collect()) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 22f7d5b824..73bdc90213 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; @@ -622,7 +622,7 @@ impl DeletionQueue { /// If remote_storage is None, then the returned workers will also be None. 
pub fn new( remote_storage: GenericRemoteStorage, - control_plane_client: Option, + controller_upcall_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) where @@ -662,7 +662,7 @@ impl DeletionQueue { conf, backend_rx, executor_tx, - control_plane_client, + controller_upcall_client, lsn_table.clone(), cancel.clone(), ), @@ -704,7 +704,7 @@ mod test { use tokio::task::JoinHandle; use crate::{ - control_plane_client::RetryForeverError, + controller_upcall_client::RetryForeverError, repository::Key, tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index d215fd2b7d..1d55581ebd 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -25,8 +25,8 @@ use tracing::info; use tracing::warn; use crate::config::PageServerConf; -use crate::control_plane_client::ControlPlaneGenerationsApi; -use crate::control_plane_client::RetryForeverError; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::RetryForeverError; use crate::metrics; use crate::virtual_file::MaybeFatalIo; @@ -61,7 +61,7 @@ where tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes - control_plane_client: Option, + controller_upcall_client: Option, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. 
@@ -94,7 +94,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - control_plane_client: Option, + controller_upcall_client: Option, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -102,7 +102,7 @@ where conf, rx, tx, - control_plane_client, + controller_upcall_client, lsn_table, pending_lists: Vec::new(), validated_lists: Vec::new(), @@ -145,8 +145,8 @@ where return Ok(()); } - let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client { - match control_plane_client + let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { + match controller_upcall_client .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) .await { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ba38120bf1..94375e62b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,6 +56,7 @@ use utils::http::endpoint::request_span; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -80,7 +81,6 @@ use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, @@ -824,7 +824,7 @@ async fn get_lsn_by_timestamp_handler( let lease = if with_lease { timeline - .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) + .init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) 
.inspect_err(|_| { warn!("fail to grant a lease to {}", lsn); }) @@ -1692,9 +1692,18 @@ async fn lsn_lease_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let result = timeline - .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) - .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + let result = async { + timeline + .init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) + .map_err(|e| { + ApiError::InternalServerError( + e.context(format!("invalid lsn lease request at {lsn}")), + ) + }) + } + .instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await?; json_response(StatusCode::OK, result) } @@ -1710,8 +1719,13 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let gc_result = state + .tenant_manager + .immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx) + .await?; json_response(StatusCode::OK, gc_result) } @@ -1728,6 +1742,10 @@ async fn timeline_compact_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + + if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { + flags |= CompactFlags::ForceL0Compaction; + } if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } @@ -1774,6 +1792,9 @@ async fn timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? 
{ + flags |= CompactFlags::ForceL0Compaction; + } if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 7a9cf495c7..08abfbd647 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -6,7 +6,7 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; -pub mod control_plane_client; +pub mod controller_upcall_client; pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 366bd82903..b76efa5b48 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,6 +8,8 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; +use postgres_backend::{is_expected_io_error, QueryError}; +use pq_proto::framed::ConnectionError; use strum::{EnumCount, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; @@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { pub(crate) struct BasebackupQueryTime { ok: Histogram, error: Histogram, + client_error: Histogram, } pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { @@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| BasebackupQueryTime { ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), error: vec.get_metric_with_label_values(&["error"]).unwrap(), + client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(), } }); @@ -1557,7 +1561,7 @@ impl BasebackupQueryTime { } impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { - pub(crate) fn observe(self, res: &Result) { + pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self .ctx @@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { elapsed } }; - let metric = if res.is_ok() { - 
&self.parent.ok - } else { - &self.parent.error + // If you want to change the categorization of a specific error, also change it in `log_query_error`. + let metric = match res { + Ok(_) => &self.parent.ok, + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) + if is_expected_io_error(io_error) => + { + &self.parent.client_error + } + Err(_) => &self.parent.error, }; metric.observe(ex_throttled.as_secs_f64()); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9261b7481d..8fa6b9a7f0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -273,10 +273,20 @@ async fn page_service_conn_main( info!("Postgres client disconnected ({io_error})"); Ok(()) } else { - Err(io_error).context("Postgres connection error") + let tenant_id = conn_handler.timeline_handles.tenant_id(); + Err(io_error).context(format!( + "Postgres connection error for tenant_id={:?} client at peer_addr={}", + tenant_id, peer_addr + )) } } - other => other.context("Postgres query error"), + other => { + let tenant_id = conn_handler.timeline_handles.tenant_id(); + other.context(format!( + "Postgres query error for tenant_id={:?} client peer_addr={}", + tenant_id, peer_addr + )) + } } } @@ -340,6 +350,10 @@ impl TimelineHandles { } }) } + + fn tenant_id(&self) -> Option { + self.wrapper.tenant_id.get().copied() + } } pub(crate) struct TenantManagerWrapper { @@ -819,7 +833,7 @@ impl PageServerHandler { set_tracing_field_shard_id(&timeline); let lease = timeline - .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) + .renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) .inspect_err(|e| { warn!("{e}"); }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2aebf4f999..29f682c62a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use
pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; @@ -96,6 +97,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walingest::WalLagCooldown; use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; @@ -182,27 +184,54 @@ pub struct TenantSharedResources { pub(super) struct AttachedTenantConf { tenant_conf: TenantConfOpt, location: AttachedLocationConfig, + /// The deadline before which we are blocked from GC so that + /// leases have a chance to be renewed. + lsn_lease_deadline: Option, } impl AttachedTenantConf { fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + // Sets a deadline before which we cannot proceed to GC due to lsn lease. + // + // We do this as the leases mapping are not persisted to disk. By delaying GC by lease + // length, we guarantee that all the leases we granted before will have a chance to renew + // when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. + let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single { + Some( + tokio::time::Instant::now() + + tenant_conf + .lsn_lease_length + .unwrap_or(LsnLease::DEFAULT_LENGTH), + ) + } else { + // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale + // because we don't do GC in these modes. 
+ None + }; + Self { tenant_conf, location, + lsn_lease_deadline, } } fn try_from(location_conf: LocationConf) -> anyhow::Result { match &location_conf.mode { - LocationMode::Attached(attach_conf) => Ok(Self { - tenant_conf: location_conf.tenant_conf, - location: *attach_conf, - }), + LocationMode::Attached(attach_conf) => { + Ok(Self::new(location_conf.tenant_conf, *attach_conf)) + } LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") } } } + + fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { + self.lsn_lease_deadline + .map(|d| tokio::time::Instant::now() < d) + .unwrap_or(false) + } } struct TimelinePreload { timeline_id: TimelineId, @@ -291,6 +320,9 @@ pub struct Tenant { /// background warmup. pub(crate) activate_now_sem: tokio::sync::Semaphore, + /// Time it took for the tenant to activate. Zero if not active yet. + attach_wal_lag_cooldown: Arc>, + // Cancellation token fires when we have entered shutdown(). This is a parent of // Timelines' cancellation token. pub(crate) cancel: CancellationToken, @@ -972,11 +1004,15 @@ impl Tenant { // Remote preload is complete. 
drop(remote_load_completion); + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_start = std::time::Instant::now(); let attached = { let _attach_timer = Some(TENANT.attach.start_timer()); tenant_clone.attach(preload, &ctx).await }; + let attach_duration = attach_start.elapsed(); + _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration)); match attached { Ok(()) => { @@ -1822,6 +1858,11 @@ impl Tenant { info!("Skipping GC in location state {:?}", conf.location); return Ok(GcResult::default()); } + + if conf.is_gc_blocked_by_lsn_lease_deadline() { + info!("Skipping GC because lsn lease deadline is not reached"); + return Ok(GcResult::default()); + } } let _guard = match self.gc_block.start().await { @@ -2630,6 +2671,8 @@ impl Tenant { Arc::new(AttachedTenantConf { tenant_conf: new_tenant_conf.clone(), location: inner.location, + // Attached location is not changed, no need to update lsn lease deadline. 
+ lsn_lease_deadline: inner.lsn_lease_deadline, }) }); @@ -2719,6 +2762,7 @@ impl Tenant { pg_version, state, last_aux_file_policy, + self.attach_wal_lag_cooldown.clone(), self.cancel.child_token(), ); @@ -2825,6 +2869,7 @@ impl Tenant { Some(Duration::from_secs(3600 * 24)), )), activate_now_sem: tokio::sync::Semaphore::new(0), + attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), gate: Gate::default(), timeline_get_throttle: Arc::new(throttle::Throttle::new( @@ -3887,9 +3932,9 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; let initdb_command = tokio::process::Command::new(&initdb_bin_path) - .args(["-D", initdb_target_dir.as_ref()]) - .args(["-U", &conf.superuser]) - .args(["-E", "utf8"]) + .args(["--pgdata", initdb_target_dir.as_ref()]) + .args(["--username", &conf.superuser]) + .args(["--encoding", "utf8"]) .arg("--no-instructions") .arg("--no-sync") .env_clear() @@ -4461,13 +4506,17 @@ mod tests { tline.freeze_and_flush().await.map_err(|e| e.into()) } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") .await? .load() .await; + // Advance to the lsn lease deadline so that GC is not blocked by + // initial transition into AttachedSingle. 
+ tokio::time::advance(tenant.get_lsn_lease_length()).await; + tokio::time::resume(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7244,9 +7293,17 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease") + .await + .unwrap() + .load() + .await; + // Advance to the lsn lease deadline so that GC is not blocked by + // initial transition into AttachedSingle. + tokio::time::advance(tenant.get_lsn_lease_length()).await; + tokio::time::resume(); let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -7274,24 +7331,33 @@ mod tests { let leased_lsns = [0x30, 0x50, 0x70]; let mut leases = Vec::new(); - let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| { - leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?); - Ok(()) + leased_lsns.iter().for_each(|n| { + leases.push( + timeline + .init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx) + .expect("lease request should succeed"), + ); }); - // Renewing with shorter lease should not change the lease. - let updated_lease_0 = - timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?; - assert_eq!(updated_lease_0.valid_until, leases[0].valid_until); + let updated_lease_0 = timeline + .renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx) + .expect("lease renewal should succeed"); + assert_eq!( + updated_lease_0.valid_until, leases[0].valid_until, + " Renewing with shorter lease should not change the lease." + ); - // Renewing with a long lease should renew lease with later expiration time. 
- let updated_lease_1 = timeline.make_lsn_lease( - Lsn(leased_lsns[1]), - timeline.get_lsn_lease_length() * 2, - &ctx, - )?; - - assert!(updated_lease_1.valid_until > leases[1].valid_until); + let updated_lease_1 = timeline + .renew_lsn_lease( + Lsn(leased_lsns[1]), + timeline.get_lsn_lease_length() * 2, + &ctx, + ) + .expect("lease renewal should succeed"); + assert!( + updated_lease_1.valid_until > leases[1].valid_until, + "Renewing with a long lease should renew lease with later expiration time." + ); // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. info!( @@ -7308,7 +7374,8 @@ mod tests { &CancellationToken::new(), &ctx, ) - .await?; + .await + .unwrap(); // Keeping everything <= Lsn(0x80) b/c leases: // 0/10: initdb layer @@ -7322,13 +7389,16 @@ mod tests { // Make lease on a already GC-ed LSN. // 0/80 does not have a valid lease + is below latest_gc_cutoff assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); - let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx); - assert!(res.is_err()); + timeline + .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx) + .expect_err("lease request on GC-ed LSN should fail"); // Should still be able to renew a currently valid lease // Assumption: original lease to is still valid for 0/50. - let _ = - timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?; + // (use `Timeline::init_lsn_lease` for testing so it always does validation) + timeline + .init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx) + .expect("lease renewal with validation should succeed"); Ok(()) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 547b43a399..502cb62fe8 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,7 +8,6 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
-use anyhow::bail; pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithmSettings; @@ -441,29 +440,6 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { } } -impl TryFrom for TenantConfOpt { - type Error = anyhow::Error; - - fn try_from(item: toml_edit::Item) -> Result { - match item { - toml_edit::Item::Value(value) => { - let d = value.into_deserializer(); - return serde_path_to_error::deserialize(d) - .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); - } - toml_edit::Item::Table(table) => { - let deserializer = - toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table)); - return serde_path_to_error::deserialize(deserializer) - .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); - } - _ => { - bail!("expected non-inline table but found {item}") - } - } - } -} - /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index 1271d25b76..f7a7836a12 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,29 +1,12 @@ -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; -use super::remote_timeline_client::index::GcBlockingReason; -use tokio::time::Instant; use utils::id::TimelineId; -type TimelinesBlocked = HashMap>; +use super::remote_timeline_client::index::GcBlockingReason; -#[derive(Default)] -struct Storage { - timelines_blocked: TimelinesBlocked, - /// The deadline before which we are blocked from GC so that - /// leases have a chance to be renewed. 
- lsn_lease_deadline: Option, -} +type Storage = HashMap>; -impl Storage { - fn is_blocked_by_lsn_lease_deadline(&self) -> bool { - self.lsn_lease_deadline - .map(|d| Instant::now() < d) - .unwrap_or(false) - } -} - -/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc -/// blocking. +/// GcBlock provides persistent (per-timeline) gc blocking. #[derive(Default)] pub(crate) struct GcBlock { /// The timelines which have current reasons to block gc. @@ -66,17 +49,6 @@ impl GcBlock { } } - /// Sets a deadline before which we cannot proceed to GC due to lsn lease. - /// - /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease - /// length, we guarantee that all the leases we granted before will have a chance to renew - /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. - pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) { - let deadline = Instant::now() + lsn_lease_length; - let mut g = self.reasons.lock().unwrap(); - g.lsn_lease_deadline = Some(deadline); - } - /// Describe the current gc blocking reasons. /// /// TODO: make this json serializable. @@ -102,7 +74,7 @@ impl GcBlock { ) -> anyhow::Result { let (added, uploaded) = { let mut g = self.reasons.lock().unwrap(); - let set = g.timelines_blocked.entry(timeline.timeline_id).or_default(); + let set = g.entry(timeline.timeline_id).or_default(); let added = set.insert(reason); // LOCK ORDER: intentionally hold the lock, see self.reasons. 
@@ -133,7 +105,7 @@ impl GcBlock { let (remaining_blocks, uploaded) = { let mut g = self.reasons.lock().unwrap(); - match g.timelines_blocked.entry(timeline.timeline_id) { + match g.entry(timeline.timeline_id) { Entry::Occupied(mut oe) => { let set = oe.get_mut(); set.remove(reason); @@ -147,7 +119,7 @@ impl GcBlock { } } - let remaining_blocks = g.timelines_blocked.len(); + let remaining_blocks = g.len(); // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons let uploaded = timeline @@ -172,11 +144,11 @@ impl GcBlock { pub(crate) fn before_delete(&self, timeline: &super::Timeline) { let unblocked = { let mut g = self.reasons.lock().unwrap(); - if g.timelines_blocked.is_empty() { + if g.is_empty() { return; } - g.timelines_blocked.remove(&timeline.timeline_id); + g.remove(&timeline.timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; @@ -187,11 +159,10 @@ impl GcBlock { } /// Initialize with the non-deleted timelines of this tenant. - pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) { + pub(crate) fn set_scanned(&self, scanned: Storage) { let mut g = self.reasons.lock().unwrap(); - assert!(g.timelines_blocked.is_empty()); - g.timelines_blocked - .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { tracing::info!(summary=?reasons, "initialized with gc blocked"); @@ -205,7 +176,6 @@ pub(super) struct Guard<'a> { #[derive(Debug)] pub(crate) struct BlockingReasons { - tenant_blocked_by_lsn_lease_deadline: bool, timelines: usize, reasons: enumset::EnumSet, } @@ -214,8 +184,8 @@ impl std::fmt::Display for BlockingReasons { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}", - self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons + "{} timelines 
block for {:?}", + self.timelines, self.reasons ) } } @@ -223,15 +193,13 @@ impl std::fmt::Display for BlockingReasons { impl BlockingReasons { fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { let mut reasons = enumset::EnumSet::empty(); - g.timelines_blocked.retain(|_key, value| { + g.retain(|_key, value| { reasons = reasons.union(*value); !value.is_empty() }); - let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); - if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline { + if !g.is_empty() { Some(BlockingReasons { - tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, - timelines: g.timelines_blocked.len(), + timelines: g.len(), reasons, }) } else { @@ -240,17 +208,14 @@ impl BlockingReasons { } fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { - let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); - if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline { + if g.is_empty() { None } else { let reasons = g - .timelines_blocked .values() .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); Some(BlockingReasons { - tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, - timelines: g.timelines_blocked.len(), + timelines: g.len(), reasons, }) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 1e7c1e10a5..9d9852c525 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::control_plane_client::{ - ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, +use crate::controller_upcall_client::{ + ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; @@ -122,7 +122,7 @@ pub(crate) 
enum ShardSelector { Known(ShardIndex), } -/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// A convenience for use with the re_attach ControllerUpcallClient function: rather /// than the serializable struct, we build this enum that encapsulates /// the invariant that attached tenants always have generations. /// @@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { match attach_conf.generation.cmp(&tenant.generation) { Ordering::Equal => { - if attach_conf.attach_mode == AttachmentMode::Single { - tenant - .gc_block - .set_lsn_lease_deadline(tenant.get_lsn_lease_length()); - } - // A transition from Attached to Attached in the same generation, we may // take our fast path and just provide the updated configuration // to the tenant. @@ -2199,6 +2197,82 @@ impl TenantManager { Ok((wanted_bytes, shard_count as u32)) } + + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] + pub(crate) async fn immediate_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + gc_req: TimelineGcRequest, + cancel: CancellationToken, + ctx: &RequestContext, + ) -> Result { + let tenant = { + let guard = self.tenants.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? 
+ }; + + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); + // Use tenant's pitr setting + let pitr = tenant.get_pitr_interval(); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + // Run in task_mgr to avoid race with tenant_detach operation + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; + + fail::fail_point!("immediate_gc_task_pre"); + + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. + + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); + } + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().map(|x| &x.remote_client); + + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) + } } #[derive(Debug, thiserror::Error)] @@ -2343,7 +2417,7 @@ enum 
TenantSlotDropError { /// Errors that can happen any time we are walking the tenant map to try and acquire /// the TenantSlot for a particular tenant. #[derive(Debug, thiserror::Error)] -pub enum TenantMapError { +pub(crate) enum TenantMapError { // Tried to read while initializing #[error("tenant map is still initializing")] StillInitializing, @@ -2373,7 +2447,7 @@ pub enum TenantMapError { /// The `old_value` may be dropped before the SlotGuard is dropped, by calling /// `drop_old_value`. It is an error to call this without shutting down /// the conents of `old_value`. -pub struct SlotGuard { +pub(crate) struct SlotGuard { tenant_shard_id: TenantShardId, old_value: Option, upserted: bool, @@ -2766,81 +2840,6 @@ use { utils::http::error::ApiError, }; -#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] -pub(crate) async fn immediate_gc( - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - gc_req: TimelineGcRequest, - cancel: CancellationToken, - ctx: &RequestContext, -) -> Result { - let tenant = { - let guard = TENANTS.read().unwrap(); - guard - .get(&tenant_shard_id) - .cloned() - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))? 
- }; - - let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - - tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - - // Run in task_mgr to avoid race with tenant_detach operation - let ctx: RequestContext = - ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - - let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - - fail::fail_point!("immediate_gc_task_pre"); - - #[allow(unused_mut)] - let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. - - #[cfg(feature = "testing")] - { - // we need to synchronize with drop completion for python tests without polling for - // log messages - if let Ok(result) = result.as_mut() { - let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!( - total = js.len(), - "starting to wait for the gc'd layers to be dropped" - ); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } - - let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().map(|x| &x.remote_client); - - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care about the shutdown error, just exit fast - drop(rtc.wait_completion().await); - } - } - - result.map_err(|e| match e { - GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, - GcError::TimelineNotFound => { - ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) - } - other => ApiError::InternalServerError(anyhow::anyhow!(other)), - }) -} - #[cfg(test)] mod tests { use std::collections::BTreeMap; diff 
--git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 9fbe2f0da5..97506b7e9a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -178,6 +178,7 @@ async fn download_object<'a>( destination_file .flush() .await + .maybe_fatal_err("download_object sync_all") .with_context(|| format!("flush source file at {dst_path}")) .map_err(DownloadError::Other)?; @@ -185,6 +186,7 @@ async fn download_object<'a>( destination_file .sync_all() .await + .maybe_fatal_err("download_object sync_all") .with_context(|| format!("failed to fsync source file at {dst_path}")) .map_err(DownloadError::Other)?; @@ -232,6 +234,7 @@ async fn download_object<'a>( destination_file .sync_all() .await + .maybe_fatal_err("download_object sync_all") .with_context(|| format!("failed to fsync source file at {dst_path}")) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2b212cfed5..6f9eda85f5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -40,11 +40,11 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadCoalesceMode, VectoredReadPlanner, + VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; @@ -589,7 +589,9 @@ impl DeltaLayerWriterInner { 
); // fsync the file - file.sync_all().await?; + file.sync_all() + .await + .maybe_fatal_err("delta_layer sync_all")?; trace!("created delta layer {}", self.path); @@ -1133,7 +1135,7 @@ impl DeltaLayerInner { ctx: &RequestContext, ) -> anyhow::Result { use crate::tenant::vectored_blob_io::{ - BlobMeta, VectoredReadBuilder, VectoredReadExtended, + BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; use futures::stream::TryStreamExt; @@ -1183,8 +1185,8 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; - let mut read_builder: Option = None; - let read_mode = VectoredReadCoalesceMode::get(); + let mut read_builder: Option = None; + let align = virtual_file::get_io_buffer_alignment(); let max_read_size = self .max_vectored_read_bytes @@ -1228,12 +1230,12 @@ impl DeltaLayerInner { { None } else { - read_builder.replace(VectoredReadBuilder::new( + read_builder.replace(ChunkedVectoredReadBuilder::new( offsets.start.pos(), offsets.end.pos(), meta, max_read_size, - read_mode, + align, )) } } else { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 940d169db0..3dcd7bc962 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,7 +41,7 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; @@ -889,7 +889,9 @@ impl ImageLayerWriterInner { // set inner.file here. The first read will have to re-open it. 
// fsync the file - file.sync_all().await?; + file.sync_all() + .await + .maybe_fatal_err("image_layer sync_all")?; trace!("created image layer {}", self.path); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f0e2ca5c83..2923bd3558 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -442,11 +442,13 @@ impl Layer { // Visibility was modified to Visible: maybe log about this match ctx.task_kind() { TaskKind::CalculateSyntheticSize + | TaskKind::OndemandLogicalSizeCalculation | TaskKind::GarbageCollector | TaskKind::MgmtRequest => { // This situation is expected in code paths do binary searches of the LSN space to resolve // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size, - // and on-demand for certain HTTP API requests. + // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included + // because it is run as a sub-task of synthetic size. } _ => { // In all other contexts, it is unusual to do I/O involving layers which are not visible at @@ -457,7 +459,7 @@ impl Layer { // which was covered by a concurrent compaction. tracing::info!( "Layer {} became visible as a result of access", - self.0.desc.key() + self.0.desc.layer_name() ); } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3f0f8a21c8..547739e773 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -330,7 +330,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; - tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length()); loop { tokio::select! 
{ _ = cancel.cancelled() => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 157c6ab91e..1d79b2b74b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -48,7 +48,6 @@ use utils::{ sync::gate::{Gate, GateGuard}, }; -use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; @@ -62,14 +61,17 @@ use std::{ collections::btree_map::Entry, ops::{Deref, Range}, }; +use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::AttachmentMode, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, }, + walingest::WalLagCooldown, walredo, }; use crate::{ @@ -428,6 +430,8 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, pub(crate) handles: handle::PerTimelineState, + + pub(crate) attach_wal_lag_cooldown: Arc>, } pub struct WalReceiverInfo { @@ -736,6 +740,7 @@ pub enum GetLogicalSizePriority { pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, + ForceL0Compaction, EnhancedGcBottomMostCompaction, DryRun, } @@ -1324,16 +1329,38 @@ impl Timeline { Ok(()) } - /// Obtains a temporary lease blocking garbage collection for the given LSN. - /// - /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also - /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if - /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and - /// the requesting lease. - pub(crate) fn make_lsn_lease( + /// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`. 
+ pub(crate) fn init_lsn_lease( &self, lsn: Lsn, length: Duration, + ctx: &RequestContext, + ) -> anyhow::Result { + self.make_lsn_lease(lsn, length, true, ctx) + } + + /// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period. + pub(crate) fn renew_lsn_lease( + &self, + lsn: Lsn, + length: Duration, + ctx: &RequestContext, + ) -> anyhow::Result { + self.make_lsn_lease(lsn, length, false, ctx) + } + + /// Obtains a temporary lease blocking garbage collection for the given LSN. + /// + /// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error + /// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present. + /// + /// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease. + /// The returned lease is therefore the maximum between the existing lease and the requesting lease. + fn make_lsn_lease( + &self, + lsn: Lsn, + length: Duration, + init: bool, _ctx: &RequestContext, ) -> anyhow::Result { let lease = { @@ -1347,8 +1374,8 @@ impl Timeline { let entry = gc_info.leases.entry(lsn); - let lease = { - if let Entry::Occupied(mut occupied) = entry { + match entry { + Entry::Occupied(mut occupied) => { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; @@ -1360,20 +1387,28 @@ impl Timeline { } existing_lease.clone() - } else { - // Reject already GC-ed LSN (lsn < latest_gc_cutoff) - let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); - if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + Entry::Vacant(vacant) => { + // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and + // not blocked by the lsn lease deadline. 
+ let validate = { + let conf = self.tenant_conf.load(); + conf.location.attach_mode == AttachmentMode::Single + && !conf.is_gc_blocked_by_lsn_lease_deadline() + }; + + if init || validate { + let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } } let dt: DateTime = valid_until.into(); info!("lease created, valid until {}", dt); - entry.or_insert(LsnLease { valid_until }).clone() + vacant.insert(LsnLease { valid_until }).clone() } - }; - - lease + } }; Ok(lease) @@ -1950,8 +1985,6 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } - // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072 - #[allow(unused)] pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2101,6 +2134,7 @@ impl Timeline { pg_version: u32, state: TimelineState, aux_file_policy: Option, + attach_wal_lag_cooldown: Arc>, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2242,6 +2276,8 @@ impl Timeline { l0_flush_global_state: resources.l0_flush_global_state, handles: Default::default(), + + attach_wal_lag_cooldown, }; if aux_file_policy == Some(AuxFilePolicy::V1) { diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index cd61418f3d..6009b0b79a 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -11,6 +11,7 @@ pub(crate) struct RangeAnalysis { has_image: bool, num_of_deltas_above_image: usize, total_num_of_deltas: usize, + num_of_l0: usize, } impl Timeline { @@ -20,8 +21,10 @@ impl Timeline { let mut delta_ranges = Vec::new(); let mut image_ranges = Vec::new(); + let num_of_l0; let all_layer_files = { let guard = 
self.layers.read().await; + num_of_l0 = guard.layer_map().unwrap().level0_deltas().len(); guard.all_persistent_layers() }; let lsn = self.get_last_record_lsn(); @@ -82,6 +85,7 @@ impl Timeline { has_image: image_layer.is_some(), num_of_deltas_above_image: maybe_delta_layers.len(), total_num_of_deltas: pitr_delta_layers.len(), + num_of_l0, }); } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 3de386a2d5..9f64471432 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -353,7 +353,13 @@ impl Timeline { // 2. Compact let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self.compact_level0(target_file_size, ctx).await?; + let fully_compacted = self + .compact_level0( + target_file_size, + flags.contains(CompactFlags::ForceL0Compaction), + ctx, + ) + .await?; timer.stop_and_record(); let mut partitioning = dense_partitioning; @@ -658,6 +664,7 @@ impl Timeline { async fn compact_level0( self: &Arc, target_file_size: u64, + force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { let CompactLevel0Phase1Result { @@ -679,9 +686,15 @@ impl Timeline { let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? + self.compact_level0_phase1( + phase1_layers_locked, + stats, + target_file_size, + force_compaction_ignore_threshold, + &ctx, + ) + .instrument(phase1_span) + .await? 
}; if new_layers.is_empty() && deltas_to_compact.is_empty() { @@ -700,6 +713,7 @@ impl Timeline { guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, + force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = @@ -711,11 +725,26 @@ impl Timeline { // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); + if force_compaction_ignore_threshold { + if !level0_deltas.is_empty() { + info!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact, but forcing compaction" + ); + } else { + info!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact, cannot force compaction" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + } else { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } } let mut level0_deltas = level0_deltas diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index aa37a45898..1faa6bab99 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -185,171 +185,7 @@ pub(crate) enum VectoredReadExtended { No, } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum VectoredReadCoalesceMode { - /// Only coalesce exactly adjacent reads. - AdjacentOnly, - /// In addition to adjacent reads, also consider reads whose corresponding - /// `end` and `start` offsets reside at the same chunk. 
- Chunked(usize), -} - -impl VectoredReadCoalesceMode { - /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0, - /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. - pub(crate) fn get() -> Self { - let align = virtual_file::get_io_buffer_alignment_raw(); - if align == 0 { - VectoredReadCoalesceMode::AdjacentOnly - } else { - VectoredReadCoalesceMode::Chunked(align) - } - } -} - -pub(crate) enum VectoredReadBuilder { - Adjacent(AdjacentVectoredReadBuilder), - Chunked(ChunkedVectoredReadBuilder), -} - -impl VectoredReadBuilder { - fn new_impl( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - max_read_size: Option, - mode: VectoredReadCoalesceMode, - ) -> Self { - match mode { - VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent( - AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size), - ), - VectoredReadCoalesceMode::Chunked(chunk_size) => { - Self::Chunked(ChunkedVectoredReadBuilder::new( - start_offset, - end_offset, - meta, - max_read_size, - chunk_size, - )) - } - } - } - - pub(crate) fn new( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - max_read_size: usize, - mode: VectoredReadCoalesceMode, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode) - } - - pub(crate) fn new_streaming( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - mode: VectoredReadCoalesceMode, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, None, mode) - } - - pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { - match self { - VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta), - VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta), - } - } - - pub(crate) fn build(self) -> VectoredRead { - match self { - VectoredReadBuilder::Adjacent(builder) => builder.build(), - VectoredReadBuilder::Chunked(builder) => builder.build(), - } - } 
- - pub(crate) fn size(&self) -> usize { - match self { - VectoredReadBuilder::Adjacent(builder) => builder.size(), - VectoredReadBuilder::Chunked(builder) => builder.size(), - } - } -} - -pub(crate) struct AdjacentVectoredReadBuilder { - /// Start offset of the read. - start: u64, - // End offset of the read. - end: u64, - /// Start offset and metadata for each blob in this read - blobs_at: VecMap, - max_read_size: Option, -} - -impl AdjacentVectoredReadBuilder { - /// Start building a new vectored read. - /// - /// Note that by design, this does not check against reading more than `max_read_size` to - /// support reading larger blobs than the configuration value. The builder will be single use - /// however after that. - pub(crate) fn new( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - max_read_size: Option, - ) -> Self { - let mut blobs_at = VecMap::default(); - blobs_at - .append(start_offset, meta) - .expect("First insertion always succeeds"); - - Self { - start: start_offset, - end: end_offset, - blobs_at, - max_read_size, - } - } - /// Attempt to extend the current read with a new blob if the start - /// offset matches with the current end of the vectored read - /// and the resuting size is below the max read size - pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { - tracing::trace!(start, end, "trying to extend"); - let size = (end - start) as usize; - let not_limited_by_max_read_size = { - if let Some(max_read_size) = self.max_read_size { - self.size() + size <= max_read_size - } else { - true - } - }; - - if self.end == start && not_limited_by_max_read_size { - self.end = end; - self.blobs_at - .append(start, meta) - .expect("LSNs are ordered within vectored reads"); - - return VectoredReadExtended::Yes; - } - - VectoredReadExtended::No - } - - pub(crate) fn size(&self) -> usize { - (self.end - self.start) as usize - } - - pub(crate) fn build(self) -> VectoredRead { - VectoredRead { - start: 
self.start, - end: self.end, - blobs_at: self.blobs_at, - } - } -} - +/// A vectored read builder that tries to coalesce all reads that fits in a chunk. pub(crate) struct ChunkedVectoredReadBuilder { /// Start block number start_blk_no: usize, @@ -373,7 +209,7 @@ impl ChunkedVectoredReadBuilder { /// Note that by design, this does not check against reading more than `max_read_size` to /// support reading larger blobs than the configuration value. The builder will be single use /// however after that. - pub(crate) fn new( + fn new_impl( start_offset: u64, end_offset: u64, meta: BlobMeta, @@ -396,6 +232,25 @@ impl ChunkedVectoredReadBuilder { } } + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + align: usize, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align) + } + + pub(crate) fn new_streaming( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + align: usize, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, None, align) + } + /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. /// /// The resulting size also must be below the max read size. 
@@ -474,17 +329,17 @@ pub struct VectoredReadPlanner { max_read_size: usize, - mode: VectoredReadCoalesceMode, + align: usize, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { - let mode = VectoredReadCoalesceMode::get(); + let align = virtual_file::get_io_buffer_alignment(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, - mode, + align, } } @@ -545,7 +400,7 @@ impl VectoredReadPlanner { } pub fn finish(self) -> Vec { - let mut current_read_builder: Option = None; + let mut current_read_builder: Option = None; let mut reads = Vec::new(); for (key, blobs_for_key) in self.blobs { @@ -558,12 +413,12 @@ impl VectoredReadPlanner { }; if extended == VectoredReadExtended::No { - let next_read_builder = VectoredReadBuilder::new( + let next_read_builder = ChunkedVectoredReadBuilder::new( start_offset, end_offset, BlobMeta { key, lsn }, self.max_read_size, - self.mode, + self.align, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -688,7 +543,7 @@ impl<'a> VectoredBlobReader<'a> { /// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { - read_builder: Option, + read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64)>, /// Max read size per batch. This is not a strict limit. 
If there are [0, 100) and [100, 200), while the `max_read_size` is 150, @@ -699,21 +554,21 @@ pub struct StreamingVectoredReadPlanner { /// Size of the current batch cnt: usize, - mode: VectoredReadCoalesceMode, + align: usize, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); - let mode = VectoredReadCoalesceMode::get(); + let align = virtual_file::get_io_buffer_alignment(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, - mode, + align, } } @@ -762,11 +617,11 @@ impl StreamingVectoredReadPlanner { } None => { self.read_builder = { - Some(VectoredReadBuilder::new_streaming( + Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, BlobMeta { key, lsn }, - self.mode, + self.align, )) }; } @@ -1092,7 +947,7 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); - let mode = VectoredReadCoalesceMode::get(); + let align = virtual_file::get_io_buffer_alignment(); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -1104,7 +959,8 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode); + let read_builder = + ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 57856eea80..5b7b279888 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -466,6 +466,7 @@ impl VirtualFile { &[] }; utils::crashsafe::overwrite(&final_path, &tmp_path, content) + .maybe_fatal_err("crashsafe_overwrite") }) .await .expect("blocking task is never aborted") @@ -475,7 +476,7 @@ 
impl VirtualFile { pub async fn sync_all(&self) -> Result<(), Error> { with_file!(self, StorageIoOperation::Fsync, |file_guard| { let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; - res + res.maybe_fatal_err("sync_all") }) } @@ -483,7 +484,7 @@ impl VirtualFile { pub async fn sync_data(&self) -> Result<(), Error> { with_file!(self, StorageIoOperation::Fsync, |file_guard| { let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; - res + res.maybe_fatal_err("sync_data") }) } @@ -1147,7 +1148,9 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) panic!("virtual_file::init called twice"); } if set_io_buffer_alignment(io_buffer_alignment).is_err() { - panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two"); + panic!( + "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}" + ); } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); @@ -1174,14 +1177,16 @@ fn get_open_files() -> &'static OpenFiles { static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); -/// Returns true if `x` is zero or a power of two. -fn is_zero_or_power_of_two(x: usize) -> bool { - (x == 0) || ((x & (x - 1)) == 0) +/// Returns true if the alignment is a power of two and is greater or equal to 512. +fn is_valid_io_buffer_alignment(align: usize) -> bool { + align.is_power_of_two() && align >= 512 } +/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is +/// not a power of two or less than 512 bytes. 
#[allow(unused)] pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { - if is_zero_or_power_of_two(align) { + if is_valid_io_buffer_alignment(align) { IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); Ok(()) } else { @@ -1189,19 +1194,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { } } -/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified. +/// Gets the io buffer alignment. /// -/// This function should be used to check the raw config value. -pub(crate) fn get_io_buffer_alignment_raw() -> usize { +/// This function should be used for getting the actual alignment value to use. +pub(crate) fn get_io_buffer_alignment() -> usize { let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); if cfg!(test) { let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; if let Some(test_align) = utils::env::var(env_var_name) { - if is_zero_or_power_of_two(test_align) { + if is_valid_io_buffer_alignment(test_align) { test_align } else { - panic!("IO buffer alignment ({test_align}) is not a power of two"); + panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}"); } } else { align @@ -1211,14 +1216,6 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize { } } -/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero. -/// -/// This function should be used for getting the actual alignment value to use. -pub(crate) fn get_io_buffer_alignment() -> usize { - let align = get_io_buffer_alignment_raw(); - align.max(1) -} - #[cfg(test)] mod tests { use crate::context::DownloadBehavior; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 229c01a681..95d1f76920 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,7 +21,10 @@ //! redo Postgres process, but some records it can handle directly with //! 
bespoken Rust code. +use std::sync::Arc; +use std::sync::OnceLock; use std::time::Duration; +use std::time::Instant; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; @@ -69,7 +72,29 @@ impl CheckPoint { } } +/// Temporary limitation of WAL lag warnings after attach +/// +/// After tenant attach, we want to limit WAL lag warnings because +/// we don't look at the WAL until the attach is complete, which +/// might take a while. +pub struct WalLagCooldown { + /// Until when should this limitation apply at all + active_until: std::time::Instant, + /// The maximum lag to suppress. Lags above this limit get reported anyways. + max_lag: Duration, +} + +impl WalLagCooldown { + pub fn new(attach_start: Instant, attach_duration: Duration) -> Self { + Self { + active_until: attach_start + attach_duration * 3 + Duration::from_secs(120), + max_lag: attach_duration * 2 + Duration::from_secs(60), + } + } +} + pub struct WalIngest { + attach_wal_lag_cooldown: Arc>, shard: ShardIdentity, checkpoint: CheckPoint, checkpoint_modified: bool, @@ -103,6 +128,7 @@ impl WalIngest { shard: *timeline.get_shard_identity(), checkpoint, checkpoint_modified: false, + attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(), warn_ingest_lag: WarnIngestLag { lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), @@ -1429,6 +1455,13 @@ impl WalIngest { Ok(lag) => { if lag > conf.wait_lsn_timeout { rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { + if let Some(cooldown) = self.attach_wal_lag_cooldown.get() { + if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag { + return; + } + } else { + // Still loading? 
We shouldn't be here + } let lag = humantime::format_duration(lag); warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); }) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 2b461c8641..892a272252 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -42,6 +42,7 @@ #include "hll.h" #include "bitmap.h" +#include "neon.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -173,7 +174,9 @@ lfc_disable(char const *op) * If the reason of error is ENOSPC, then truncation of file may * help to reclaim some space */ + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE); int rc = ftruncate(lfc_desc, 0); + pgstat_report_wait_end(); if (rc < 0) elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); @@ -769,8 +772,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ); rc = preadv(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + pgstat_report_wait_end(); if (rc != (BLCKSZ * blocks_in_chunk)) { @@ -944,8 +949,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + pgstat_report_wait_end(); + if (rc != BLCKSZ * blocks_in_chunk) { lfc_disable("write"); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 07a19a7114..0ca8a70d6d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -490,7 +490,7 @@ pageserver_connect(shardno_t shard_no, int elevel) WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, PQsocket(shard->conn), 0, - PG_WAIT_EXTENSION); + 
WAIT_EVENT_NEON_PS_STARTING); elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); if (rc & WL_LATCH_SET) { @@ -512,7 +512,7 @@ pageserver_connect(shardno_t shard_no, int elevel) WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, PQsocket(shard->conn), 0, - PG_WAIT_EXTENSION); + WAIT_EVENT_NEON_PS_STARTING); elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); if (rc & WL_LATCH_SET) { @@ -608,7 +608,8 @@ pageserver_connect(shardno_t shard_no, int elevel) WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + WAIT_EVENT_NEON_PS_CONFIGURING); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -656,7 +657,8 @@ static int call_PQgetCopyData(shardno_t shard_no, char **buffer) { int ret; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); @@ -666,7 +668,8 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -937,7 +940,7 @@ PagestoreShmemInit(void) LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); pagestore_shared = ShmemInitStruct("libpagestore shared state", - PagestoreShmemSize(), + sizeof(PagestoreShmemState), &found); if (!found) { diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index fe8e276d1c..c3ed96710a 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -41,6 +41,9 @@ #include "pagestore_client.h" #include "control_plane_connector.h" #include "walsender_hooks.h" +#if PG_MAJORVERSION_NUM >= 16 +#include "storage/ipc.h" +#endif PG_MODULE_MAGIC; void _PG_init(void); @@ -49,6 +52,23 @@ static int 
logical_replication_max_snap_files = 300; static int running_xacts_overflow_policy; +#if PG_MAJORVERSION_NUM >= 16 +static shmem_startup_hook_type prev_shmem_startup_hook; + +static void neon_shmem_startup_hook(void); +#endif +#if PG_MAJORVERSION_NUM >= 17 +uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; +uint32 WAIT_EVENT_NEON_LFC_READ; +uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; +uint32 WAIT_EVENT_NEON_LFC_WRITE; +uint32 WAIT_EVENT_NEON_PS_STARTING; +uint32 WAIT_EVENT_NEON_PS_CONFIGURING; +uint32 WAIT_EVENT_NEON_PS_SEND; +uint32 WAIT_EVENT_NEON_PS_READ; +uint32 WAIT_EVENT_NEON_WAL_DL; +#endif + enum RunningXactsOverflowPolicies { OP_IGNORE, OP_SKIP, @@ -635,6 +655,9 @@ _PG_init(void) */ #if PG_VERSION_NUM >= 160000 load_file("$libdir/neon_rmgr", false); + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = neon_shmem_startup_hook; #endif pg_init_libpagestore(); @@ -721,3 +744,25 @@ backpressure_throttling_time(PG_FUNCTION_ARGS) { PG_RETURN_UINT64(BackpressureThrottlingTime()); } + +#if PG_MAJORVERSION_NUM >= 16 +static void +neon_shmem_startup_hook(void) +{ + /* Initialize */ + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + +#if PG_MAJORVERSION_NUM >= 17 + WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance"); + WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); + WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); + WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); + WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); + WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); + WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); + WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO"); + WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download"); +#endif +} +#endif diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 0b36bdbb65..af69116e21
100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,8 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -# TODO: bump default version to 1.5, after we are certain that we don't -# need to rollback the compute image -default_version = '1.4' +default_version = '1.5' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 5c653fc6c6..79aa88b8d3 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -12,6 +12,7 @@ #ifndef NEON_H #define NEON_H #include "access/xlogreader.h" +#include "utils/wait_event.h" /* GUCs */ extern char *neon_auth_token; @@ -22,6 +23,28 @@ extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern int wal_acceptor_connection_timeout; +#if PG_MAJORVERSION_NUM >= 17 +extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; +extern uint32 WAIT_EVENT_NEON_LFC_READ; +extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; +extern uint32 WAIT_EVENT_NEON_LFC_WRITE; +extern uint32 WAIT_EVENT_NEON_PS_STARTING; +extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING; +extern uint32 WAIT_EVENT_NEON_PS_SEND; +extern uint32 WAIT_EVENT_NEON_PS_READ; +extern uint32 WAIT_EVENT_NEON_WAL_DL; +#else +#define WAIT_EVENT_NEON_LFC_MAINTENANCE PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ +#define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE +#define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE +#define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_PS_READ PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ +#endif + extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index de653826c0..9bce81bf2e 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c 
@@ -27,7 +27,8 @@ NeonPerfCountersShmemSize(void) { Size size = 0; - size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters))); + size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS, + sizeof(neon_per_backend_counters))); return size; } @@ -39,7 +40,7 @@ NeonPerfCountersShmemInit(void) neon_per_backend_counters_shared = ShmemInitStruct("Neon perf counters", - mul_size(MaxBackends, + mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)), &found); Assert(found == IsUnderPostmaster); @@ -137,7 +138,7 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) metrics[i].is_bucket = false; metrics[i].value = (double) counters->pageserver_requests_sent_total; i++; - metrics[i].name = "pageserver_requests_disconnects_total"; + metrics[i].name = "pageserver_disconnects_total"; metrics[i].is_bucket = false; metrics[i].value = (double) counters->pageserver_disconnects_total; i++; @@ -192,7 +193,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) /* We put all the tuples into a tuplestore in one go. 
*/ InitMaterializedSRF(fcinfo, 0); - for (int procno = 0; procno < MaxBackends; procno++) + for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++) { PGPROC *proc = GetPGProcByNumber(procno); int pid = proc->pid; @@ -231,7 +232,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) InitMaterializedSRF(fcinfo, 0); /* Aggregate the counters across all backends */ - for (int procno = 0; procno < MaxBackends; procno++) + for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 02163ada55..49d477c4f8 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -96,6 +96,14 @@ typedef struct /* Pointer to the shared memory array of neon_per_backend_counters structs */ extern neon_per_backend_counters *neon_per_backend_counters_shared; +/* + * Size of the perf counters array in shared memory. One slot for each backend + * and aux process. IOW one for each PGPROC slot, except for slots reserved + * for prepared transactions, because they're not real processes and cannot do + * I/O. + */ +#define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS) + #if PG_VERSION_NUM >= 170000 #define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber]) #else diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1c87f4405c..155756f8b3 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1773,6 +1773,20 @@ neon_init(void) if (MyPState != NULL) return; + /* + * Sanity check that the perf counters array is sized correctly. We got + * this wrong once, and the formula for max number of backends and aux + * processes might well change in the future, so better safe than sorry. + * This is a very cheap check so we do it even without assertions.
On + * v14, this gets called before initializing MyProc, so we cannot perform + * the check here. That's OK, we don't expect the logic to change in old + * releases. + */ +#if PG_VERSION_NUM>=150000 + if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS]) + elog(ERROR, "MyNeonCounters points past end of array"); +#endif + prfs_size = offsetof(PrefetchState, prf_buffer) + sizeof(PrefetchRequest) * readahead_buffer_size; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c1914421ec..78402a29d5 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -213,7 +213,7 @@ WalProposerPoll(WalProposer *wp) rc = wp->api.wait_event_set(wp, timeout, &sk, &events); /* Exit loop if latch is set (we got new WAL) */ - if ((rc == 1 && events & WL_LATCH_SET)) + if (rc == 1 && (events & WL_LATCH_SET)) break; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 4d0d06e6de..01f88a5ab3 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -422,6 +422,9 @@ backpressure_throttling_impl(void) TimestampTz start, stop; bool retry = false; + char *new_status = NULL; + const char *old_status; + int len; if (PointerIsValid(PrevProcessInterruptsCallback)) retry = PrevProcessInterruptsCallback(); @@ -442,14 +445,24 @@ backpressure_throttling_impl(void) if (lag == 0) return retry; - /* Suspend writers until replicas catch up */ - set_ps_display("backpressure throttling"); + + old_status = get_ps_display(&len); + new_status = (char *) palloc(len + 64 + 1); + memcpy(new_status, old_status, len); + snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); + set_ps_display(new_status); + new_status[len] = '\0'; /* truncate off " backpressure ..." 
to later reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); pg_usleep(BACK_PRESSURE_DELAY); stop = GetCurrentTimestamp(); pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); + + /* Reset ps display */ + set_ps_display(new_status); + pfree(new_status); + return true; } @@ -1473,11 +1486,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, { NeonWALReadResult res; - res = NeonWALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id()); +#if PG_MAJORVERSION_NUM >= 17 + if (!sk->wp->config->syncSafekeepers) + { + Size rbytes; + rbytes = WALReadFromBuffers(buf, startptr, count, + walprop_pg_get_timeline_id()); + + startptr += rbytes; + count -= rbytes; + } +#endif + + if (count == 0) + { + res = NEON_WALREAD_SUCCESS; + } + else + { + Assert(count > 0); + + /* Now read the remaining WAL from the WAL file */ + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + } if (res == NEON_WALREAD_SUCCESS) { @@ -1779,7 +1814,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH) */ - if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger) + if ((rc == 1 && (event.events & WL_LATCH_SET)) || late_cv_trigger) { /* Reset our latch */ ResetLatch(MyLatch); @@ -1791,7 +1826,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 * If the event contains something about the socket, it means we got an * event from a safekeeper socket. 
*/ - if (rc == 1 && (event.events & (WL_SOCKET_MASK))) + if (rc == 1 && (event.events & WL_SOCKET_MASK)) { *sk = (Safekeeper *) event.user_data; *events = event.events; diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index bd3856e9d9..575dddef02 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -160,7 +160,7 @@ NeonWALPageRead( WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, sock, timeout_ms, - WAIT_EVENT_WAL_SENDER_MAIN); + WAIT_EVENT_NEON_WAL_DL); } } } diff --git a/poetry.lock b/poetry.lock index 48943a73e9..07f30d10e7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2064,73 +2064,80 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "psycopg2-binary" -version = "2.9.6" +version = "2.9.9" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"}, - 
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"}, - {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"}, - {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"}, - {file = 
"psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"}, + {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"}, + {file = 
"psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, + 
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = 
"sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, + {file = 
"psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, ] [[package]] @@ -2577,7 +2584,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2702,13 +2708,13 @@ files = [ [[package]] name = "requests" -version = "2.32.0" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, - {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -3131,16 +3137,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3378,4 +3374,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" +content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 501ce050e0..bfeb845583 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -24,6 +24,7 @@ bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap.workspace = true +compute_api.workspace = true consumption_metrics.workspace = true dashmap.workspace = true env_logger.workspace = true @@ -81,7 +82,6 @@ tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true 
tracing.workspace = true diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 7c408f817c..13639af3aa 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -73,6 +73,9 @@ pub(crate) enum AuthErrorImpl { #[error("Authentication timed out")] UserTimeout(Elapsed), + + #[error("Disconnected due to inactivity after {0}.")] + ConfirmationTimeout(humantime::Duration), } #[derive(Debug, Error)] @@ -103,6 +106,10 @@ impl AuthError { pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { AuthErrorImpl::UserTimeout(elapsed).into() } + + pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { + AuthErrorImpl::ConfirmationTimeout(timeout).into() + } } impl> From for AuthError { @@ -125,6 +132,7 @@ impl UserFacingError for AuthError { AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), AuthErrorImpl::TooManyConnections => self.to_string(), AuthErrorImpl::UserTimeout(_) => self.to_string(), + AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), } } } @@ -143,6 +151,7 @@ impl ReportableError for AuthError { AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, + AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4e9f4591ad..0eeed27fb2 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -80,6 +80,14 @@ pub(crate) trait TestBackend: Send + Sync + 'static { fn get_allowed_ips_and_secret( &self, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; + fn dyn_clone(&self) -> Box; +} + +#[cfg(test)] +impl Clone for Box { + fn clone(&self) -> Self { + TestBackend::dyn_clone(&**self) + } } impl std::fmt::Display for Backend<'_, (), ()> { @@ -557,7 +565,7 @@ mod tests { stream::{PqStream, Stream}, }; - use super::{auth_quirks, AuthRateLimiter}; + use 
super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; struct Auth { ips: Vec, @@ -585,6 +593,14 @@ mod tests { )) } + async fn get_endpoint_jwks( + &self, + _ctx: &RequestMonitoring, + _endpoint: crate::EndpointId, + ) -> anyhow::Result> { + unimplemented!() + } + async fn wake_compute( &self, _ctx: &RequestMonitoring, @@ -595,12 +611,16 @@ mod tests { } static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + jwks_cache: JwkCache::default(), thread_pool: ThreadPool::new(1), scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_auth_broker: false, + accept_jwts: false, + webauth_confirmation_timeout: std::time::Duration::from_secs(5), }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 94e5999a5f..b62a11ccb2 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -8,11 +8,14 @@ use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{Deserialize, Deserializer}; +use serde::{de::Visitor, Deserialize, Deserializer}; use signature::Verifier; use tokio::time::Instant; -use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName}; +use crate::{ + context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, + RoleName, +}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -27,7 +30,6 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - role_name: RoleName, ) -> impl Future>> + Send; } @@ -35,10 +37,11 @@ pub(crate) struct AuthRule { pub(crate) id: String, pub(crate) jwks_url: url::Url, pub(crate) audience: Option, + pub(crate) role_names: Vec, } #[derive(Default)] -pub(crate) struct JwkCache { +pub struct JwkCache { client: reqwest::Client, map: DashMap<(EndpointId, RoleName), Arc>, @@ -54,18 +57,28 @@ pub(crate) struct JwkCacheEntry { } impl JwkCacheEntry { - fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> { - self.key_sets.values().find_map(|key_set| { - key_set - .find_key(key_id) - .map(|jwk| (jwk, key_set.audience.as_deref())) - }) + fn find_jwk_and_audience( + &self, + key_id: &str, + role_name: &RoleName, + ) -> Option<(&jose_jwk::Jwk, Option<&str>)> { + self.key_sets + .values() + // make sure our requested role has access to the key set + .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name)) + // try and find the requested key-id in the key set + .find_map(|key_set| { + key_set + .find_key(key_id) + .map(|jwk| (jwk, key_set.audience.as_deref())) + }) } } struct KeySet { jwks: jose_jwk::JwkSet, audience: Option, + role_names: Vec, } impl KeySet { @@ -106,7 +119,6 @@ impl JwkCacheEntryLock { ctx: &RequestMonitoring, client: &reqwest::Client, endpoint: EndpointId, - role_name: RoleName, auth_rules: &F, ) -> anyhow::Result> { // double check that no one beat us to updating the cache. 
@@ -119,11 +131,10 @@ impl JwkCacheEntryLock { } } - let rules = auth_rules - .fetch_auth_rules(ctx, endpoint, role_name) - .await?; + let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?; let mut key_sets = ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new()); + // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) for rule in rules { @@ -151,6 +162,7 @@ impl JwkCacheEntryLock { KeySet { jwks, audience: rule.audience, + role_names: rule.role_names, }, ); } @@ -173,7 +185,6 @@ impl JwkCacheEntryLock { ctx: &RequestMonitoring, client: &reqwest::Client, endpoint: EndpointId, - role_name: RoleName, fetch: &F, ) -> Result, anyhow::Error> { let now = Instant::now(); @@ -183,9 +194,7 @@ impl JwkCacheEntryLock { let Some(cached) = guard else { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; - return self - .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) - .await; + return self.renew_jwks(permit, ctx, client, endpoint, fetch).await; }; let last_update = now.duration_since(cached.last_retrieved); @@ -196,9 +205,7 @@ impl JwkCacheEntryLock { let permit = self.acquire_permit().await; // it's been too long since we checked the keys. wait for them to update. - return self - .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) - .await; + return self.renew_jwks(permit, ctx, client, endpoint, fetch).await; } // every 5 minutes we should spawn a job to eagerly update the token. 
@@ -212,7 +219,7 @@ impl JwkCacheEntryLock { let ctx = ctx.clone(); tokio::spawn(async move { if let Err(e) = entry - .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch) + .renew_jwks(permit, &ctx, &client, endpoint, &fetch) .await { tracing::warn!(error=?e, "could not fetch JWKs in background job"); @@ -232,7 +239,7 @@ impl JwkCacheEntryLock { jwt: &str, client: &reqwest::Client, endpoint: EndpointId, - role_name: RoleName, + role_name: &RoleName, fetch: &F, ) -> Result<(), anyhow::Error> { // JWT compact form is defined to be @@ -254,30 +261,22 @@ impl JwkCacheEntryLock { let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; - ensure!(header.typ == "JWT"); let kid = header.key_id.context("missing key id")?; let mut guard = self - .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch) + .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) .await?; // get the key from the JWKs if possible. If not, wait for the keys to update. 
let (jwk, expected_audience) = loop { - match guard.find_jwk_and_audience(kid) { + match guard.find_jwk_and_audience(kid, role_name) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; guard = self - .renew_jwks( - permit, - ctx, - client, - endpoint.clone(), - role_name.clone(), - fetch, - ) + .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } _ => { @@ -296,7 +295,7 @@ impl JwkCacheEntryLock { verify_ec_signature(header_payload.as_bytes(), &sig, key)?; } jose_jwk::Key::Rsa(key) => { - verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?; + verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => bail!("unsupported key type {key:?}"), }; @@ -308,23 +307,24 @@ impl JwkCacheEntryLock { tracing::debug!(?payload, "JWT signature valid with claims"); - match (expected_audience, payload.audience) { - // check the audience matches - (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"), - // the audience is expected but is missing - (Some(_), None) => bail!("invalid JWT token audience"), - // we don't care for the audience field - (None, _) => {} + if let Some(aud) = expected_audience { + ensure!( + payload.audience.0.iter().any(|s| s == aud), + "invalid JWT token audience" + ); } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY); + ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); } if let Some(nbf) = payload.not_before { - ensure!(nbf < now + CLOCK_SKEW_LEEWAY); + ensure!( + nbf < now + CLOCK_SKEW_LEEWAY, + "JWT token is not yet ready to use" + ); } Ok(()) @@ -336,7 +336,7 @@ impl JwkCache { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - role_name: RoleName, + role_name: &RoleName, fetch: &F, jwt: &str, ) -> Result<(), anyhow::Error> { @@ -377,7 
+377,7 @@ fn verify_rsa_signature( data: &[u8], sig: &[u8], key: &jose_jwk::Rsa, - alg: &Option, + alg: &jose_jwa::Algorithm, ) -> anyhow::Result<()> { use jose_jwa::{Algorithm, Signing}; use rsa::{ @@ -388,7 +388,7 @@ fn verify_rsa_signature( let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; match alg { - Some(Algorithm::Signing(Signing::Rs256)) => { + Algorithm::Signing(Signing::Rs256) => { let key = VerifyingKey::::new(key); let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; @@ -402,9 +402,6 @@ fn verify_rsa_signature( /// #[derive(serde::Deserialize, serde::Serialize)] struct JwtHeader<'a> { - /// must be "JWT" - #[serde(rename = "typ")] - typ: &'a str, /// must be a supported alg #[serde(rename = "alg")] algorithm: jose_jwa::Algorithm, @@ -414,11 +411,12 @@ struct JwtHeader<'a> { } /// -#[derive(serde::Deserialize, serde::Serialize, Debug)] +#[derive(serde::Deserialize, Debug)] +#[allow(dead_code)] struct JwtPayload<'a> { /// Audience - Recipient for which the JWT is intended - #[serde(rename = "aud")] - audience: Option<&'a str>, + #[serde(rename = "aud", default)] + audience: OneOrMany, /// Expiration - Time after which the JWT expires #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] expiration: Option, @@ -441,6 +439,59 @@ struct JwtPayload<'a> { session_id: Option<&'a str>, } +/// `OneOrMany` supports parsing either a single item or an array of items. +/// +/// Needed for +/// +/// > The "aud" (audience) claim identifies the recipients that the JWT is +/// > intended for. Each principal intended to process the JWT MUST +/// > identify itself with a value in the audience claim. If the principal +/// > processing the claim does not identify itself with a value in the +/// > "aud" claim when this claim is present, then the JWT MUST be +/// > rejected. In the general case, the "aud" value is **an array of case- +/// > sensitive strings**, each containing a StringOrURI value. 
In the +/// > special case when the JWT has one audience, the "aud" value MAY be a +/// > **single case-sensitive string** containing a StringOrURI value. The +/// > interpretation of audience values is generally application specific. +/// > Use of this claim is OPTIONAL. +#[derive(Default, Debug)] +struct OneOrMany(Vec); + +impl<'de> Deserialize<'de> for OneOrMany { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct OneOrManyVisitor; + impl<'de> Visitor<'de> for OneOrManyVisitor { + type Value = OneOrMany; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a single string or an array of strings") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(OneOrMany(vec![v.to_owned()])) + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let mut v = vec![]; + while let Some(s) = seq.next_element()? { + v.push(s); + } + Ok(OneOrMany(v)) + } + } + deserializer.deserialize_any(OneOrManyVisitor) + } +} + fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { let d = >::deserialize(d)?; Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) @@ -534,7 +585,6 @@ mod tests { key: jose_jwk::Key::Ec(pk), prm: jose_jwk::Parameters { kid: Some(kid), - alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), ..Default::default() }, }; @@ -548,7 +598,6 @@ mod tests { key: jose_jwk::Key::Rsa(pk), prm: jose_jwk::Parameters { kid: Some(kid), - alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), ..Default::default() }, }; @@ -557,7 +606,6 @@ mod tests { fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { let header = JwtHeader { - typ: "JWT", algorithm: jose_jwa::Algorithm::Signing(sig), key_id: Some(&kid), }; @@ -572,7 +620,7 @@ mod tests { format!("{header}.{body}") } - fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String { + fn new_ec_jwt(kid: 
String, key: &p256::SecretKey) -> String { use p256::ecdsa::{Signature, SigningKey}; let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); @@ -660,11 +708,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let (ec1, jwk3) = new_ec_jwk("3".into()); let (ec2, jwk4) = new_ec_jwk("4".into()); - let jwt1 = new_rsa_jwt("1".into(), rs1); - let jwt2 = new_rsa_jwt("2".into(), rs2); - let jwt3 = new_ec_jwt("3".into(), ec1); - let jwt4 = new_ec_jwt("4".into(), ec2); - let foo_jwks = jose_jwk::JwkSet { keys: vec![jwk1, jwk3], }; @@ -706,47 +749,98 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let client = reqwest::Client::new(); #[derive(Clone)] - struct Fetch(SocketAddr); + struct Fetch(SocketAddr, Vec); impl FetchAuthRules for Fetch { async fn fetch_auth_rules( &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - _role_name: RoleName, ) -> anyhow::Result> { Ok(vec![ AuthRule { id: "foo".to_owned(), jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), audience: None, + role_names: self.1.clone(), }, AuthRule { id: "bar".to_owned(), jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), audience: None, + role_names: self.1.clone(), }, ]) } } - let role_name = RoleName::from("user"); + let role_name1 = RoleName::from("anonymous"); + let role_name2 = RoleName::from("authenticated"); + + let fetch = Fetch( + addr, + vec![ + RoleNameInt::from(&role_name1), + RoleNameInt::from(&role_name2), + ], + ); + let endpoint = EndpointId::from("ep"); let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - for token in [jwt1, jwt2, jwt3, jwt4] { - jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &token, - &client, - endpoint.clone(), - role_name.clone(), - &Fetch(addr), - ) - .await - .unwrap(); + let jwt1 = new_rsa_jwt("1".into(), rs1); + let jwt2 = new_rsa_jwt("2".into(), rs2); + let jwt3 = new_ec_jwt("3".into(), &ec1); + let jwt4 = new_ec_jwt("4".into(), &ec2); + + // had the wrong kid, therefore will have the 
wrong ecdsa signature + let bad_jwt = new_ec_jwt("3".into(), &ec2); + // this role_name is not accepted + let bad_role_name = RoleName::from("cloud_admin"); + + let err = jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &bad_jwt, + &client, + endpoint.clone(), + &role_name1, + &fetch, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("signature error")); + + let err = jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &jwt1, + &client, + endpoint.clone(), + &bad_role_name, + &fetch, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("jwk not found")); + + let tokens = [jwt1, jwt2, jwt3, jwt4]; + let role_names = [role_name1, role_name2]; + for role in &role_names { + for token in &tokens { + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + token, + &client, + endpoint.clone(), + role, + &fetch, + ) + .await + .unwrap(); + } } } } diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 2ff2ca00f0..f56b0a0a6d 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, net::SocketAddr}; +use std::net::SocketAddr; use anyhow::Context; use arc_swap::ArcSwapOption; @@ -10,21 +10,19 @@ use crate::{ NodeInfo, }, context::RequestMonitoring, - intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag}, - EndpointId, RoleName, + intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, + EndpointId, }; -use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; +use super::jwt::{AuthRule, FetchAuthRules}; pub struct LocalBackend { - pub(crate) jwks_cache: JwkCache, pub(crate) node_info: NodeInfo, } impl LocalBackend { pub fn new(postgres_addr: SocketAddr) -> Self { LocalBackend { - jwks_cache: JwkCache::default(), node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); @@ -48,26 +46,17 @@ impl LocalBackend { #[derive(Clone, Copy)] pub(crate) struct StaticAuthRules; -pub static JWKS_ROLE_MAP: 
ArcSwapOption = ArcSwapOption::const_empty(); - -#[derive(Debug, Clone)] -pub struct JwksRoleSettings { - pub roles: HashMap, - pub project_id: ProjectIdInt, - pub branch_id: BranchIdInt, -} +pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); impl FetchAuthRules for StaticAuthRules { async fn fetch_auth_rules( &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - role_name: RoleName, ) -> anyhow::Result> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .and_then(|m| m.roles.get(&role_name)) .context("JWKs settings for this role were not configured")?; let mut rules = vec![]; for setting in &role_mappings.jwks { @@ -75,6 +64,7 @@ impl FetchAuthRules for StaticAuthRules { id: setting.id.clone(), jwks_url: setting.jwks_url.clone(), audience: setting.jwt_audience.clone(), + role_names: setting.role_names.clone(), }); } diff --git a/proxy/src/auth/backend/web.rs b/proxy/src/auth/backend/web.rs index 05f437355e..45710d244d 100644 --- a/proxy/src/auth/backend/web.rs +++ b/proxy/src/auth/backend/web.rs @@ -89,7 +89,12 @@ pub(super) async fn authenticate( // Wait for web console response (see `mgmt`). info!(parent: &span, "waiting for console's reply..."); - let db_info = waiter.await.map_err(WebAuthError::from)?; + let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter) + .await + .map_err(|_elapsed| { + auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into()) + })? 
+ .map_err(WebAuthError::from)?; if auth_config.ip_allowlist_check_enabled { if let Some(allowed_ips) = &db_info.allowed_ips { diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 94365ddf05..b18810adbe 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,34 +1,38 @@ -use std::{ - net::SocketAddr, - path::{Path, PathBuf}, - pin::pin, - sync::Arc, - time::Duration, -}; +use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; -use anyhow::{bail, ensure}; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use compute_api::spec::LocalProxySpec; use dashmap::DashMap; -use futures::{future::Either, FutureExt}; +use futures::future::Either; use proxy::{ - auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP}, + auth::backend::{ + jwt::JwkCache, + local::{LocalBackend, JWKS_ROLE_MAP}, + }, cancellation::CancellationHandlerMain, config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - console::{locks::ApiLocks, messages::JwksRoleMapping}, + console::{ + locks::ApiLocks, + messages::{EndpointJwksResponse, JwksSettings}, + }, http::health_server::AppMetrics, + intern::RoleNameInt, metrics::{Metrics, ThreadPoolMetrics}, rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, scram::threadpool::ThreadPool, serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, + RoleName, }; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, task::JoinSet}; +use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = 
tikv_jemallocator::Jemalloc; @@ -72,9 +76,12 @@ struct LocalProxyCliArgs { /// Address of the postgres server #[clap(long, default_value = "127.0.0.1:5432")] compute: SocketAddr, - /// File address of the local proxy config file + /// Path of the local proxy config file #[clap(long, default_value = "./localproxy.json")] - config_path: PathBuf, + config_path: Utf8PathBuf, + /// Path of the local proxy PID file + #[clap(long, default_value = "./localproxy.pid")] + pid_path: Utf8PathBuf, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -126,6 +133,24 @@ async fn main() -> anyhow::Result<()> { let args = LocalProxyCliArgs::parse(); let config = build_config(&args)?; + // before we bind to any ports, write the process ID to a file + // so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. + // + // This also claims a "lock" that makes sure only one instance + // of local-proxy runs at a time. + let _process_guard = loop { + match pid_file::claim_for_current_process(&args.pid_path) { + Ok(guard) => break guard, + Err(e) => { + // compute-ctl might have tried to read the pid-file to let us + // know about some config change. We should try again. + error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + }; + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; let http_listener = TcpListener::bind(args.http).await?; let shutdown = CancellationToken::new(); @@ -139,12 +164,30 @@ async fn main() -> anyhow::Result<()> { 16, )); - refresh_config(args.config_path.clone()).await; + // write the process ID to a file so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. 
+ let pid = std::process::id(); + info!("process running in PID {pid}"); + std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?; let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || { - refresh_config(args.config_path.clone()).map(Ok) + + let refresh_config_notify = Arc::new(Notify::new()); + maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), { + let refresh_config_notify = Arc::clone(&refresh_config_notify); + move || { + refresh_config_notify.notify_one(); + } })); + + // trigger the first config load **after** setting up the signal hook + // to avoid the race condition where: + // 1. No config file registered when local-proxy starts up + // 2. The config file is written but the signal hook is not yet received + // 3. local-proxy completes startup but has no config loaded, despite there being a registerd config. + refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + maintenance_tasks.spawn(proxy::http::health_server::task_main( metrics_listener, AppMetrics { @@ -227,14 +270,18 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig allow_self_signed_compute: false, http_config, authentication_config: AuthenticationConfig { + jwks_cache: JwkCache::default(), thread_pool: ThreadPool::new(0), scram_protocol_timeout: Duration::from_secs(10), rate_limiter_enabled: false, rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_auth_broker: false, + accept_jwts: true, + webauth_confirmation_timeout: Duration::ZERO, }, - require_client_ip: false, + proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), region: "local".into(), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, @@ -245,81 +292,84 @@ fn build_config(args: &LocalProxyCliArgs) -> 
anyhow::Result<&'static ProxyConfig }))) } -async fn refresh_config(path: PathBuf) { - match refresh_config_inner(&path).await { - Ok(()) => {} - Err(e) => { - error!(error=?e, ?path, "could not read config file"); +async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { + loop { + rx.notified().await; + + match refresh_config_inner(&path).await { + Ok(()) => {} + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } } } } -async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> { +async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> { let bytes = tokio::fs::read(&path).await?; - let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - let mut settings = None; + let mut jwks_set = vec![]; - for mapping in data.roles.values_mut() { - for jwks in &mut mapping.jwks { - ensure!( - jwks.jwks_url.has_authority() - && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); + for jwks in data.jwks { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - ensure!( - jwks.jwks_url - .host() - .is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); - // clear username, password and ports - jwks.jwks_url.set_username("").expect( + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. 
should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( "url can be a base and has a valid host and is not a file. should not error", ); - jwks.jwks_url.set_password(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks.jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks.jwks_url.set_fragment(None); - jwks.jwks_url.query_pairs_mut().clear().finish(); - - if jwks.jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. - if cfg!(not(feature = "testing")) { - jwks.jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id)); - ensure!( - *pr == jwks.project_id, - "inconsistent project IDs configured" - ); - ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured"); } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. 
+ if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + jwks_set.push(JwksSettings { + id: jwks.id, + jwks_url, + provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) } - if let Some((project_id, branch_id)) = settings { - JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings { - roles: data.roles, - project_id, - branch_id, - }))); - } + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); Ok(()) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 20d2d3df9a..53f1586abe 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -133,9 +133,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async { - Ok(()) - })); + let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. 
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 2ac66ffe8c..0585902c3b 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -8,6 +8,7 @@ use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; @@ -17,6 +18,7 @@ use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; use proxy::config::ProjectInfoCacheOptions; +use proxy::config::ProxyProtocolV2; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; @@ -102,6 +104,9 @@ struct ProxyCliArgs { default_value = "http://localhost:3000/authenticate_proxy_request/" )] auth_endpoint: String, + /// if this is not local proxy, this toggles whether we accept jwt or passwords for http + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_auth_broker: bool, /// path to TLS key for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir @@ -144,9 +149,6 @@ struct ProxyCliArgs { /// size of the threadpool for password hashing #[clap(long, default_value_t = 4)] scram_thread_pool_size: u8, - /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. 
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, @@ -229,6 +231,15 @@ struct ProxyCliArgs { /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] is_private_access_proxy: bool, + + /// Configure whether all incoming requests have a Proxy Protocol V2 packet. + // TODO(conradludgate): switch default to rejected or required once we've updated all deployments + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + proxy_protocol_v2: ProxyProtocolV2, + + /// Time the proxy waits for the webauth session to be confirmed by the control plane. + #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] + webauth_confirmation_timeout: std::time::Duration, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -382,9 +393,27 @@ async fn main() -> anyhow::Result<()> { info!("Starting mgmt on {mgmt_address}"); let mgmt_listener = TcpListener::bind(mgmt_address).await?; - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); - let proxy_listener = TcpListener::bind(proxy_address).await?; + let proxy_listener = if !args.is_auth_broker { + let proxy_address: SocketAddr = args.proxy.parse()?; + info!("Starting proxy on {proxy_address}"); + + Some(TcpListener::bind(proxy_address).await?) + } else { + None + }; + + // TODO: rename the argument to something like serverless. + // It now covers more than just websockets, it also covers SQL over HTTP. + let serverless_listener = if let Some(serverless_address) = args.wss { + let serverless_address: SocketAddr = serverless_address.parse()?; + info!("Starting wss on {serverless_address}"); + Some(TcpListener::bind(serverless_address).await?) 
+ } else if args.is_auth_broker { + bail!("wss arg must be present for auth-broker") + } else { + None + }; + let cancellation_token = CancellationToken::new(); let cancel_map = CancelMap::default(); @@ -430,21 +459,17 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - client_tasks.spawn(proxy::proxy::task_main( - config, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - - // TODO: rename the argument to something like serverless. - // It now covers more than just websockets, it also covers SQL over HTTP. - if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; - info!("Starting wss on {serverless_address}"); - let serverless_listener = TcpListener::bind(serverless_address).await?; + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + if let Some(serverless_listener) = serverless_listener { client_tasks.spawn(serverless::task_main( config, serverless_listener, @@ -461,10 +486,7 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. 
these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals( - cancellation_token.clone(), - || async { Ok(()) }, - )); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {})); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { @@ -677,7 +699,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - accept_websockets: true, + accept_websockets: !args.is_auth_broker, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -692,12 +714,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_auth_broker: args.is_auth_broker, + accept_jwts: args.is_auth_broker, + webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; let config = Box::leak(Box::new(ProxyConfig { @@ -707,7 +733,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { allow_self_signed_compute: args.allow_self_signed_compute, http_config, authentication_config, - require_client_ip: args.require_client_ip, + proxy_protocol_v2: args.proxy_protocol_v2, handshake_timeout: args.handshake_timeout, region: args.region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, diff --git a/proxy/src/config.rs 
b/proxy/src/config.rs index 373e4cf650..e0d666adf7 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,5 +1,8 @@ use crate::{ - auth::{self, backend::AuthRateLimiter}, + auth::{ + self, + backend::{jwt::JwkCache, AuthRateLimiter}, + }, console::locks::ApiLocks, rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, scram::threadpool::ThreadPool, @@ -7,6 +10,7 @@ use crate::{ Host, }; use anyhow::{bail, ensure, Context, Ok}; +use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; use rustls::{ @@ -30,7 +34,7 @@ pub struct ProxyConfig { pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, - pub require_client_ip: bool, + pub proxy_protocol_v2: ProxyProtocolV2, pub region: String, pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, @@ -38,6 +42,16 @@ pub struct ProxyConfig { pub connect_to_compute_retry_config: RetryConfig, } +#[derive(Copy, Clone, Debug, ValueEnum, PartialEq)] +pub enum ProxyProtocolV2 { + /// Connection will error if PROXY protocol v2 header is missing + Required, + /// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing. + Supported, + /// Connection will error if PROXY protocol v2 header is provided + Rejected, +} + #[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, @@ -67,6 +81,10 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub jwks_cache: JwkCache, + pub is_auth_broker: bool, + pub accept_jwts: bool, + pub webauth_confirmation_timeout: tokio::time::Duration, } impl TlsConfig { @@ -250,18 +268,26 @@ impl CertResolver { let common_name = pem.subject().to_string(); - // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as - // wildcard ones as we don't use SNI there. 
That treatment only affects certificate selection, so - // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names - // and passed None instead, which blows up number of cases downstream code should handle. Proper coding - // here should better avoid Option for common_names, and do wildcard-based certificate selection instead - // of cutting off '*.' parts. - let common_name = if common_name.starts_with("CN=*.") { - common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. 
+ // + // Console Web proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() } else { - common_name.strip_prefix("CN=").map(|s| s.to_string()) - } - .context("Failed to parse common name from certificate")?; + bail!("Failed to parse common name from certificate") + }; let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 85683acb82..1696e229ce 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,13 +1,11 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; use std::fmt::{self, Display}; use crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; -use crate::RoleName; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. 
@@ -348,11 +346,6 @@ impl ColdStartInfo { } } -#[derive(Debug, Deserialize, Clone)] -pub struct JwksRoleMapping { - pub roles: HashMap, -} - #[derive(Debug, Deserialize, Clone)] pub struct EndpointJwksResponse { pub jwks: Vec, @@ -361,11 +354,10 @@ pub struct EndpointJwksResponse { #[derive(Debug, Deserialize, Clone)] pub struct JwksSettings { pub id: String, - pub project_id: ProjectIdInt, - pub branch_id: BranchIdInt, pub jwks_url: url::Url, pub provider_name: String, pub jwt_audience: Option, + pub role_names: Vec, } #[cfg(test)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 16e8da605b..95097f2de9 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -5,7 +5,10 @@ pub mod neon; use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ - backend::{ComputeCredentialKeys, ComputeUserInfo}, + backend::{ + jwt::{AuthRule, FetchAuthRules}, + ComputeCredentialKeys, ComputeUserInfo, + }, IpPattern, }, cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, @@ -16,7 +19,7 @@ use crate::{ intern::ProjectIdInt, metrics::ApiLockMetrics, rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, + scram, EndpointCacheKey, EndpointId, }; use dashmap::DashMap; use std::{hash::Hash, sync::Arc, time::Duration}; @@ -334,6 +337,12 @@ pub(crate) trait Api { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result>; + /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, @@ -343,6 +352,7 @@ pub(crate) trait Api { } #[non_exhaustive] +#[derive(Clone)] pub enum ConsoleBackend { /// Current Cloud API (V2). 
Console(neon::Api), @@ -386,6 +396,20 @@ impl Api for ConsoleBackend { } } + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + match self { + Self::Console(api) => api.get_endpoint_jwks(ctx, endpoint).await, + #[cfg(any(test, feature = "testing"))] + Self::Postgres(api) => api.get_endpoint_jwks(ctx, endpoint).await, + #[cfg(test)] + Self::Test(_api) => Ok(vec![]), + } + } + async fn wake_compute( &self, ctx: &RequestMonitoring, @@ -552,3 +576,13 @@ impl WakeComputePermit { res } } + +impl FetchAuthRules for ConsoleBackend { + async fn fetch_auth_rules( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + self.get_endpoint_jwks(ctx, endpoint).await + } +} diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 1b77418de6..b548a0203a 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,7 +4,9 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::context::RequestMonitoring; +use crate::{ + auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, +}; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; use crate::{ @@ -118,6 +120,39 @@ impl Api { }) } + async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; + + let connection = tokio::spawn(connection); + + let res = client.query( + "select id, jwks_url, audience, role_names from neon_control_plane.endpoint_jwks where endpoint_id = $1", + &[&endpoint.as_str()], + ) + .await?; + + let mut rows = vec![]; + for row in res { + rows.push(AuthRule { + id: row.get("id"), + jwks_url: url::Url::parse(row.get("jwks_url"))?, + 
audience: row.get("audience"), + role_names: row + .get::<_, Vec>("role_names") + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }); + } + + drop(client); + connection.await??; + + Ok(rows) + } + async fn do_wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config @@ -185,6 +220,14 @@ impl super::Api for Api { )) } + async fn get_endpoint_jwks( + &self, + _ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + self.do_get_endpoint_jwks(endpoint).await + } + #[tracing::instrument(skip_all)] async fn wake_compute( &self, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index b004bf4ecf..2d527f378c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,27 +7,33 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, + auth::backend::{jwt::AuthRule, ComputeUserInfo}, compute, - console::messages::{ColdStartInfo, Reason}, + console::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, + scram, EndpointCacheKey, EndpointId, }; use crate::{cache::Cached, context::RequestMonitoring}; +use ::http::{header::AUTHORIZATION, HeaderName}; +use anyhow::bail; use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, error, info, info_span, warn, Instrument}; +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub(crate) locks: &'static ApiLocks, pub(crate) wake_compute_endpoint_rate_limiter: Arc, - jwt: String, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, } impl Api { @@ -38,7 +44,9 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: 
Arc, ) -> Self { - let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") + .unwrap_or_default() + .into(); Self { endpoint, caches, @@ -71,9 +79,9 @@ impl Api { async { let request = self .endpoint - .get("proxy_get_role_secret") - .header("X-Request-ID", &request_id) - .header("Authorization", format!("Bearer {}", &self.jwt)) + .get_path("proxy_get_role_secret") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), @@ -125,6 +133,61 @@ impl Api { .await } + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + .await + { + bail!("endpoint not found"); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build()?; + + info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .map_err(crate::error::log_error) + .instrument(info_span!("http", id = request_id)) + .await + } + async fn do_wake_compute( &self, 
ctx: &RequestMonitoring, @@ -135,7 +198,7 @@ impl Api { async { let mut request_builder = self .endpoint - .get("proxy_wake_compute") + .get_path("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) @@ -262,6 +325,15 @@ impl super::Api for Api { )) } + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + #[tracing::instrument(skip_all)] async fn wake_compute( &self, diff --git a/proxy/src/http.rs b/proxy/src/http.rs index c77d95f47d..14720b5c6b 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -86,9 +86,17 @@ impl Endpoint { /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub(crate) fn get(&self, path: &str) -> RequestBuilder { + pub(crate) fn get_path(&self, path: &str) -> RequestBuilder { + self.get_with_url(|u| { + u.path_segments_mut().push(path); + }) + } + + /// Return a [builder](RequestBuilder) for a `GET` request, + /// accepting a closure to modify the url path segments for more complex paths queries. + pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder { let mut url = self.endpoint.clone(); - url.path_segments_mut().push(path); + f(&mut url); self.client.get(url.into_inner()) } @@ -144,7 +152,7 @@ mod tests { // Validate that this pattern makes sense. 
let req = endpoint - .get("frobnicate") + .get_path("frobnicate") .query(&[ ("foo", Some("10")), // should be just `foo=10` ("bar", None), // shouldn't be passed at all @@ -162,7 +170,7 @@ mod tests { let endpoint = Endpoint::new(url, Client::new()); let req = endpoint - .get("frobnicate") + .get_path("frobnicate") .query(&[("session_id", uuid::Uuid::nil())]) .build()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e5144cfe2e..108420d7d7 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -130,14 +130,14 @@ impl Default for StringInterner { } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub(crate) struct RoleNameTag; +pub struct RoleNameTag; impl InternId for RoleNameTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } -pub(crate) type RoleNameInt = InternedString; +pub type RoleNameInt = InternedString; impl From<&RoleName> for RoleNameInt { fn from(value: &RoleName) -> Self { RoleNameTag::get_interner().get_or_intern(value) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 0070839aa8..ea0a9beced 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -82,7 +82,7 @@ impl_trait_overcaptures, )] -use std::{convert::Infallible, future::Future}; +use std::convert::Infallible; use anyhow::{bail, Context}; use intern::{EndpointIdInt, EndpointIdTag, InternId}; @@ -117,13 +117,12 @@ pub mod usage_metrics; pub mod waiters; /// Handle unix signals appropriately. -pub async fn handle_signals( +pub async fn handle_signals( token: CancellationToken, mut refresh_config: F, ) -> anyhow::Result where - F: FnMut() -> Fut, - Fut: Future>, + F: FnMut(), { use tokio::signal::unix::{signal, SignalKind}; @@ -136,7 +135,7 @@ where // Hangup is commonly used for config reload. _ = hangup.recv() => { warn!("received SIGHUP"); - refresh_config().await?; + refresh_config(); } // Shut down the whole application. 
_ = interrupt.recv() => { diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3b30ad8b46..2e773fabb3 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,4 +1,3 @@ -use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::{ filter::{EnvFilter, LevelFilter}, prelude::*, @@ -23,9 +22,7 @@ pub async fn init() -> anyhow::Result { .with_writer(std::io::stderr) .with_target(false); - let otlp_layer = tracing_utils::init_tracing("proxy") - .await - .map(OpenTelemetryLayer::new); + let otlp_layer = tracing_utils::init_tracing("proxy").await; tracing_subscriber::registry() .with(env_filter) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ff199ac701..7003af2aba 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,6 +10,7 @@ pub(crate) mod wake_compute; pub use copy_bidirectional::copy_bidirectional_client_compute; pub use copy_bidirectional::ErrorSource; +use crate::config::ProxyProtocolV2; use crate::{ auth, cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, @@ -93,15 +94,19 @@ pub async fn task_main( connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await { - Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok((_socket, None)) if config.require_client_ip => { - error!("missing required client IP"); + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); return; } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), Ok((socket, None)) => (socket, peer_addr.ip()), }; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 752d982726..058ec06e02 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -525,6 
+525,10 @@ impl TestBackend for TestConnectMechanism { { unimplemented!("not used in tests") } + + fn dyn_clone(&self) -> Box { + Box::new(self.clone()) + } } fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 2702aeebfe..c027a0cd20 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -43,6 +43,13 @@ impl ThreadPool { pub fn new(n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. + if n_workers == 0 { + return Arc::new(Self { + runtime: None, + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + }); + } + Arc::new_cyclic(|pool| { let pool = pool.clone(); let worker_id = AtomicUsize::new(0); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 84f98cb8ad..a7e3fa709b 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod http_conn_pool; mod http_util; mod json; mod sql_over_http; @@ -19,7 +20,8 @@ use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; -use http_body_util::Full; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Empty}; use hyper1::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; @@ -81,7 +83,28 @@ pub async fn task_main( } }); + let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config); + { + let http_conn_pool = Arc::clone(&http_conn_pool); + tokio::spawn(async move { + http_conn_pool.gc_worker(StdRng::from_entropy()).await; + }); + } + + // shutdown the connection pool + tokio::spawn({ + let cancellation_token = cancellation_token.clone(); + let http_conn_pool = http_conn_pool.clone(); + async move { + cancellation_token.cancelled().await; + tokio::task::spawn_blocking(move || 
http_conn_pool.shutdown()) + .await + .unwrap(); + } + }); + let backend = Arc::new(PoolingBackend { + http_conn_pool: Arc::clone(&http_conn_pool), pool: Arc::clone(&conn_pool), config, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), @@ -342,7 +365,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -386,7 +409,7 @@ async fn request_handler( ); // Return the response so the spawned future can continue. - Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) + Ok(response.map(|b| b.map_err(|x| match x {}).boxed())) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, @@ -409,7 +432,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Full::new(Bytes::new())) + .body(Empty::new().map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index aa236907db..89eeec3e6f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,6 +1,8 @@ -use std::{sync::Arc, time::Duration}; +use std::{io, sync::Arc, time::Duration}; use async_trait::async_trait; +use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use tokio::net::{lookup_host, TcpStream}; use tracing::{field::display, info}; use crate::{ @@ -27,9 +29,13 @@ use crate::{ Host, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::{ + conn_pool::{poll_client, Client, ConnInfo, 
GlobalConnPool}, + http_conn_pool::{self, poll_http2_client}, +}; pub(crate) struct PoolingBackend { + pub(crate) http_conn_pool: Arc, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, pub(crate) endpoint_rate_limiter: Arc, @@ -103,32 +109,44 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_jwt( &self, ctx: &RequestMonitoring, + config: &AuthenticationConfig, user_info: &ComputeUserInfo, - jwt: &str, - ) -> Result { + jwt: String, + ) -> Result<(), AuthError> { match &self.config.auth_backend { - crate::auth::Backend::Console(_, ()) => { - Err(AuthError::auth_failed("JWT login is not yet supported")) - } - crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed( - "JWT login over web auth proxy is not supported", - )), - crate::auth::Backend::Local(cache) => { - cache + crate::auth::Backend::Console(console, ()) => { + config .jwks_cache .check_jwt( ctx, user_info.endpoint.clone(), - user_info.user.clone(), - &StaticAuthRules, - jwt, + &user_info.user, + &**console, + &jwt, ) .await .map_err(|e| AuthError::auth_failed(e.to_string()))?; - Ok(ComputeCredentials { - info: user_info.clone(), - keys: crate::auth::backend::ComputeCredentialKeys::None, - }) + + Ok(()) + } + crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed( + "JWT login over web auth proxy is not supported", + )), + crate::auth::Backend::Local(_) => { + config + .jwks_cache + .check_jwt( + ctx, + user_info.endpoint.clone(), + &user_info.user, + &StaticAuthRules, + &jwt, + ) + .await + .map_err(|e| AuthError::auth_failed(e.to_string()))?; + + // todo: rewrite JWT signature with key shared somehow between local proxy and postgres + Ok(()) } } } @@ -174,14 +192,55 @@ impl PoolingBackend { ) .await } + + // Wake up the destination if needed + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + pub(crate) async fn connect_to_local_proxy( + &self, + ctx: &RequestMonitoring, + conn_info: ConnInfo, + ) -> Result { + info!("pool: looking for an 
existing connection"); + if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { + return Ok(client); + } + + let conn_id = uuid::Uuid::new_v4(); + tracing::Span::current().record("conn_id", display(conn_id)); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self + .config + .auth_backend + .as_ref() + .map(|()| ComputeCredentials { + info: conn_info.user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + }); + crate::proxy::connect_compute::connect_to_compute( + ctx, + &HyperMechanism { + conn_id, + conn_info, + pool: self.http_conn_pool.clone(), + locks: &self.config.connect_compute_locks, + }, + &backend, + false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, + ) + .await + } } #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), - #[error("could not connection to compute")] - ConnectionError(#[from] tokio_postgres::Error), + #[error("could not connection to postgres in compute")] + PostgresConnectionError(#[from] tokio_postgres::Error), + #[error("could not connection to local-proxy in compute")] + LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), @@ -193,11 +252,20 @@ pub(crate) enum HttpConnError { TooManyConnectionAttempts(#[from] ApiLockError), } +#[derive(Debug, thiserror::Error)] +pub(crate) enum LocalProxyConnError { + #[error("error with connection to local-proxy")] + Io(#[source] std::io::Error), + #[error("could not establish h2 connection")] + H2(#[from] hyper1::Error), +} + impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, - HttpConnError::ConnectionError(p) => 
p.get_error_kind(), + HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), + HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), @@ -210,7 +278,8 @@ impl UserFacingError for HttpConnError { fn to_string_client(&self) -> String { match self { HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), - HttpConnError::ConnectionError(p) => p.to_string(), + HttpConnError::PostgresConnectionError(p) => p.to_string(), + HttpConnError::LocalProxyConnectionError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), @@ -224,7 +293,8 @@ impl UserFacingError for HttpConnError { impl CouldRetry for HttpConnError { fn could_retry(&self) -> bool { match self { - HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::PostgresConnectionError(e) => e.could_retry(), + HttpConnError::LocalProxyConnectionError(e) => e.could_retry(), HttpConnError::ConnectionClosedAbruptly(_) => false, HttpConnError::GetAuthInfo(_) => false, HttpConnError::AuthError(_) => false, @@ -236,7 +306,7 @@ impl CouldRetry for HttpConnError { impl ShouldRetryWakeCompute for HttpConnError { fn should_retry_wake_compute(&self) -> bool { match self { - HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(), + HttpConnError::PostgresConnectionError(e) => e.should_retry_wake_compute(), // we never checked cache validity HttpConnError::TooManyConnectionAttempts(_) => false, _ => true, @@ -244,6 +314,38 @@ impl ShouldRetryWakeCompute for HttpConnError { } } +impl ReportableError for LocalProxyConnError { + fn get_error_kind(&self) -> ErrorKind { + match self { + LocalProxyConnError::Io(_) => ErrorKind::Compute, + LocalProxyConnError::H2(_) => ErrorKind::Compute, + } + } +} + 
+impl UserFacingError for LocalProxyConnError { + fn to_string_client(&self) -> String { + "Could not establish HTTP connection to the database".to_string() + } +} + +impl CouldRetry for LocalProxyConnError { + fn could_retry(&self) -> bool { + match self { + LocalProxyConnError::Io(_) => false, + LocalProxyConnError::H2(_) => false, + } + } +} +impl ShouldRetryWakeCompute for LocalProxyConnError { + fn should_retry_wake_compute(&self) -> bool { + match self { + LocalProxyConnError::Io(_) => false, + LocalProxyConnError::H2(_) => false, + } + } +} + struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, @@ -293,3 +395,99 @@ impl ConnectMechanism for TokioMechanism { fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} } + +struct HyperMechanism { + pool: Arc, + conn_info: ConnInfo, + conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, +} + +#[async_trait] +impl ConnectMechanism for HyperMechanism { + type Connection = http_conn_pool::Client; + type ConnectError = HttpConnError; + type Error = HttpConnError; + + async fn connect_once( + &self, + ctx: &RequestMonitoring, + node_info: &CachedNodeInfo, + timeout: Duration, + ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + + // let port = node_info.config.get_ports().first().unwrap_or_else(10432); + let res = connect_http2(&host, 10432, timeout).await; + drop(pause); + let (client, connection) = permit.release_result(res)?; + + Ok(poll_http2_client( + self.pool.clone(), + ctx, + &self.conn_info, + client, + connection, + self.conn_id, + node_info.aux.clone(), + )) + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} + +async fn connect_http2( + host: &str, + port: u16, + timeout: Duration, +) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> { + // assumption: host is an ip 
address so this should not actually perform any requests. + // todo: add that assumption as a guarantee in the control-plane API. + let mut addrs = lookup_host((host, port)) + .await + .map_err(LocalProxyConnError::Io)?; + + let mut last_err = None; + + let stream = loop { + let Some(addr) = addrs.next() else { + return Err(last_err.unwrap_or_else(|| { + LocalProxyConnError::Io(io::Error::new( + io::ErrorKind::InvalidInput, + "could not resolve any addresses", + )) + })); + }; + + match tokio::time::timeout(timeout, TcpStream::connect(addr)).await { + Ok(Ok(stream)) => { + stream.set_nodelay(true).map_err(LocalProxyConnError::Io)?; + break stream; + } + Ok(Err(e)) => { + last_err = Some(LocalProxyConnError::Io(e)); + } + Err(e) => { + last_err = Some(LocalProxyConnError::Io(io::Error::new( + io::ErrorKind::TimedOut, + e, + ))); + } + }; + }; + + let (client, connection) = hyper1::client::conn::http2::Builder::new(TokioExecutor::new()) + .timer(TokioTimer::new()) + .keep_alive_interval(Duration::from_secs(20)) + .keep_alive_while_idle(true) + .keep_alive_timeout(Duration::from_secs(5)) + .handshake(TokioIo::new(stream)) + .await?; + + Ok((client, connection)) +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs new file mode 100644 index 0000000000..b31ed22a7c --- /dev/null +++ b/proxy/src/serverless/http_conn_pool.rs @@ -0,0 +1,342 @@ +use dashmap::DashMap; +use hyper1::client::conn::http2; +use hyper_util::rt::{TokioExecutor, TokioIo}; +use parking_lot::RwLock; +use rand::Rng; +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::{sync::Arc, sync::Weak}; +use tokio::net::TcpStream; + +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{context::RequestMonitoring, EndpointCacheKey}; + +use tracing::{debug, error}; +use tracing::{info, info_span, 
Instrument}; + +use super::conn_pool::ConnInfo; + +pub(crate) type Send = http2::SendRequest; +pub(crate) type Connect = + http2::Connection, hyper1::body::Incoming, TokioExecutor>; + +#[derive(Clone)] +struct ConnPoolEntry { + conn: Send, + conn_id: uuid::Uuid, + aux: MetricsAuxInfo, +} + +// Per-endpoint connection pool +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub(crate) struct EndpointConnPool { + // TODO(conrad): + // either we should open more connections depending on stream count + // (not exposed by hyper, need our own counter) + // or we can change this to an Option rather than a VecDeque. + // + // Opening more connections to the same db because we run out of streams + // seems somewhat redundant though. + // + // Probably we should run a semaphore and just the single conn. TBD. + conns: VecDeque, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option { + let Self { conns, .. } = self; + + loop { + let conn = conns.pop_front()?; + if !conn.conn.is_closed() { + conns.push_back(conn.clone()); + return Some(conn); + } + } + } + + fn remove_conn(&mut self, conn_id: uuid::Uuid) -> bool { + let Self { + conns, + global_connections_count, + .. 
+ } = self; + + let old_len = conns.len(); + conns.retain(|conn| conn.conn_id != conn_id); + let new_len = conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + removed > 0 + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if !self.conns.is_empty() { + self.global_connections_count + .fetch_sub(self.conns.len(), atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.conns.len() as i64); + } + } +} + +pub(crate) struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. 
+ global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { conns, .. 
} = pool.get_mut(); + + let old_len = conns.len(); + + conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = conns.len(); + let removed = old_len - new_len; + clients_removed += removed; + + // we only remove this pool if it has no active connections + if conns.is_empty() { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Option { + let endpoint = conn_info.endpoint_cache_key()?; + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + let client = endpoint_pool.write().get_conn_entry()?; + + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + Some(Client::new(client.conn, client.aux)) + } + + fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + conns: VecDeque::new(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } +} + +pub(crate) fn poll_http2_client( + global_pool: Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + client: Send, + connection: Connect, + conn_id: uuid::Uuid, + aux: MetricsAuxInfo, +) -> Client { + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let session_id = ctx.session_id(); + + let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info(); + span.in_scope(|| { + 
info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); + }); + + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => { + let pool = global_pool.get_or_create_endpoint_pool(&endpoint); + + pool.write().conns.push_back(ConnPoolEntry { + conn: client.clone(), + conn_id, + aux: aux.clone(), + }); + + Arc::downgrade(&pool) + } + None => Weak::new(), + }; + + tokio::spawn( + async move { + let _conn_gauge = conn_gauge; + let res = connection.await; + match res { + Ok(()) => info!("connection closed"), + Err(e) => error!(%session_id, "connection error: {}", e), + } + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_conn(conn_id) { + info!("closed connection removed"); + } + } + } + .instrument(span), + ); + + Client::new(client, aux) +} + +pub(crate) struct Client { + pub(crate) inner: Send, + aux: MetricsAuxInfo, +} + +impl Client { + pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { + Self { inner, aux } + } + + pub(crate) fn metrics(&self) -> Arc { + USAGE_METRICS.register(Ids { + endpoint_id: self.aux.endpoint_id, + branch_id: self.aux.branch_id, + }) + } +} diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index abf0ffe290..d766a46577 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -5,13 +5,13 @@ use bytes::Bytes; use anyhow::Context; use http::{Response, StatusCode}; -use http_body_util::Full; +use http_body_util::{combinators::BoxBody, BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; /// Like [`ApiError::into_response`] -pub(crate) fn api_error_into_response(this: ApiError) -> Response> { +pub(crate) fn api_error_into_response(this: ApiError) -> Response> { match this { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause @@ -64,17 +64,24 @@ struct 
HttpErrorBody { impl HttpErrorBody { /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] - fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + fn response_from_msg_and_status( + msg: String, + status: StatusCode, + ) -> Response> { HttpErrorBody { msg }.to_response(status) } /// Same as [`utils::http::error::HttpErrorBody::to_response`] - fn to_response(&self, status: StatusCode) -> Response> { + fn to_response(&self, status: StatusCode) -> Response> { Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail - .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .body( + Full::new(Bytes::from(serde_json::to_string(self).unwrap())) + .map_err(|x| match x {}) + .boxed(), + ) .unwrap() } } @@ -83,14 +90,14 @@ impl HttpErrorBody { pub(crate) fn json_response( status: StatusCode, data: T, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let json = serde_json::to_string(&data) .context("Failed to serialize JSON response") .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") - .body(Full::new(Bytes::from(json))) + .body(Full::new(Bytes::from(json)).map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7c78439a0a..f3a7ed9329 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,6 +8,8 @@ use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; use http::header::AUTHORIZATION; +use http::Method; +use http_body_util::combinators::BoxBody; use http_body_util::BodyExt; use http_body_util::Full; use hyper1::body::Body; @@ -38,9 +40,11 @@ use url::Url; use urlencoding; use 
utils::http::error::ApiError; +use crate::auth::backend::ComputeCredentials; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; use crate::auth::ComputeUserInfoParseError; +use crate::config::AuthenticationConfig; use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; @@ -56,6 +60,7 @@ use crate::usage_metrics::MetricCounterRecorder; use crate::DbName; use crate::RoleName; +use super::backend::LocalProxyConnError; use super::backend::PoolingBackend; use super::conn_pool::AuthData; use super::conn_pool::Client; @@ -123,8 +128,8 @@ pub(crate) enum ConnInfoError { MissingUsername, #[error("invalid username: {0}")] InvalidUsername(#[from] std::string::FromUtf8Error), - #[error("missing password")] - MissingPassword, + #[error("missing authentication credentials: {0}")] + MissingCredentials(Credentials), #[error("missing hostname")] MissingHostname, #[error("invalid hostname: {0}")] @@ -133,6 +138,14 @@ pub(crate) enum ConnInfoError { MalformedEndpoint, } +#[derive(Debug, thiserror::Error)] +pub(crate) enum Credentials { + #[error("required password")] + Password, + #[error("required authorization bearer token in JWT format")] + BearerJwt, +} + impl ReportableError for ConnInfoError { fn get_error_kind(&self) -> ErrorKind { ErrorKind::User @@ -146,6 +159,7 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( + config: &'static AuthenticationConfig, ctx: &RequestMonitoring, headers: &HeaderMap, tls: Option<&TlsConfig>, @@ -181,21 +195,32 @@ fn get_conn_info( ctx.set_user(username.clone()); let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + if !config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); + } + let auth = auth .to_str() .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; AuthData::Jwt( auth.strip_prefix("Bearer ") - .ok_or(ConnInfoError::MissingPassword)? + .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? 
.into(), ) } else if let Some(pass) = connection_url.password() { + // wrong credentials provided + if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); + } + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { std::borrow::Cow::Borrowed(b) => b.into(), std::borrow::Cow::Owned(b) => b.into(), }) + } else if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); } else { - return Err(ConnInfoError::MissingPassword); + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); }; let endpoint = match connection_url.host() { @@ -247,7 +272,7 @@ pub(crate) async fn handle( request: Request, backend: Arc, cancel: CancellationToken, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { @@ -279,7 +304,7 @@ pub(crate) async fn handle( let mut message = e.to_string_client(); let db_error = match &e { - SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + SqlOverHttpError::ConnectCompute(HttpConnError::PostgresConnectionError(e)) | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; @@ -504,7 +529,7 @@ async fn handle_inner( ctx: &RequestMonitoring, request: Request, backend: Arc, -) -> Result>, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { let _requeset_gauge = Metrics::get() .proxy .connection_requests @@ -514,18 +539,50 @@ async fn handle_inner( "handling interactive connection from client" ); - // - // Determine the destination and connection params - // - let headers = request.headers(); - - // TLS config should be there. 
- let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?; + let conn_info = get_conn_info( + &config.authentication_config, + ctx, + request.headers(), + config.tls_config.as_ref(), + )?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" ); + match conn_info.auth { + AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => { + handle_auth_broker_inner(config, ctx, request, conn_info.conn_info, jwt, backend).await + } + auth => { + handle_db_inner( + cancel, + config, + ctx, + request, + conn_info.conn_info, + auth, + backend, + ) + .await + } + } +} + +async fn handle_db_inner( + cancel: CancellationToken, + config: &'static ProxyConfig, + ctx: &RequestMonitoring, + request: Request, + conn_info: ConnInfo, + auth: AuthData, + backend: Arc, +) -> Result>, SqlOverHttpError> { + // + // Determine the destination and connection params + // + let headers = request.headers(); + // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in @@ -563,26 +620,36 @@ async fn handle_inner( let authenticate_and_connect = Box::pin( async { - let keys = match &conn_info.auth { + let keys = match auth { AuthData::Password(pw) => { backend .authenticate_with_password( ctx, &config.authentication_config, - &conn_info.conn_info.user_info, - pw, + &conn_info.user_info, + &pw, ) .await? } AuthData::Jwt(jwt) => { backend - .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt) - .await? 
+ .authenticate_with_jwt( + ctx, + &config.authentication_config, + &conn_info.user_info, + jwt, + ) + .await?; + + ComputeCredentials { + info: conn_info.user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + } } }; let client = backend - .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool) + .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else @@ -640,7 +707,11 @@ async fn handle_inner( let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(json_output))) + .body( + Full::new(Bytes::from(json_output)) + .map_err(|x| match x {}) + .boxed(), + ) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -656,6 +727,65 @@ async fn handle_inner( Ok(response) } +static HEADERS_TO_FORWARD: &[&HeaderName] = &[ + &AUTHORIZATION, + &CONN_STRING, + &RAW_TEXT_OUTPUT, + &ARRAY_MODE, + &TXN_ISOLATION_LEVEL, + &TXN_READ_ONLY, + &TXN_DEFERRABLE, +]; + +async fn handle_auth_broker_inner( + config: &'static ProxyConfig, + ctx: &RequestMonitoring, + request: Request, + conn_info: ConnInfo, + jwt: String, + backend: Arc, +) -> Result>, SqlOverHttpError> { + backend + .authenticate_with_jwt( + ctx, + &config.authentication_config, + &conn_info.user_info, + jwt, + ) + .await + .map_err(HttpConnError::from)?; + + let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; + + let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql"); + + let (mut parts, body) = request.into_parts(); + let mut req = Request::builder().method(Method::POST).uri(local_proxy_uri); + + // todo(conradludgate): maybe auth-broker should parse these and re-serialize + // these instead just to ensure they remain normalised. 
+ for &h in HEADERS_TO_FORWARD { + if let Some(hv) = parts.headers.remove(h) { + req = req.header(h, hv); + } + } + + let req = req + .body(body) + .expect("all headers and params received via hyper should be valid for request"); + + // todo: map body to count egress + let _metrics = client.metrics(); + + Ok(client + .inner + .send_request(req) + .await + .map_err(LocalProxyConnError::from) + .map_err(HttpConnError::from)? + .map(|b| b.boxed())) +} + impl QueryData { async fn process( self, @@ -705,7 +835,9 @@ impl QueryData { // query failed or was cancelled. Ok(Err(error)) => { let db_error = match &error { - SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + SqlOverHttpError::ConnectCompute( + HttpConnError::PostgresConnectionError(e), + ) | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; diff --git a/pyproject.toml b/pyproject.toml index ad3961ef55..556edf5589 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,10 +6,10 @@ package-mode = false [tool.poetry.dependencies] python = "^3.9" pytest = "^7.4.4" -psycopg2-binary = "^2.9.6" +psycopg2-binary = "^2.9.9" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.32.0" +requests = "^2.32.3" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 67f32b3cc0..f27413a08f 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,6 +23,7 @@ crc32c.workspace = true fail.workspace = true hex.workspace = true humantime.workspace = true +http.workspace = true hyper.workspace = true futures.workspace = true once_cell.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 5270934f5e..1e5f963a4f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -374,14 +374,16 @@ type JoinTaskRes = Result, JoinError>; async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // fsync the 
datadir to make sure we have a consistent state on disk. - let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; - let started = Instant::now(); - utils::crashsafe::syncfs(dfd)?; - let elapsed = started.elapsed(); - info!( - elapsed_ms = elapsed.as_millis(), - "syncfs data directory done" - ); + if !conf.no_sync { + let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; + let started = Instant::now(); + utils::crashsafe::syncfs(dfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "syncfs data directory done" + ); + } info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 2e11a279ca..3116d88dff 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -161,7 +161,7 @@ pub static HTTP_RUNTIME: Lazy = Lazy::new(|| { .thread_name("HTTP worker") .enable_all() .build() - .expect("Failed to create WAL service runtime") + .expect("Failed to create HTTP runtime") }); pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 7bdee35cd7..1a932ef699 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -9,7 +9,7 @@ use crate::walproposer_sim::{ pub mod walproposer_sim; -// Generates 2000 random seeds and runs a schedule for each of them. +// Generates 500 random seeds and runs a schedule for each of them. // If you see this test fail, please report the last seed to the // @safekeeper team. 
#[test] @@ -17,7 +17,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let clock = init_logger(); let mut config = TestConfig::new(Some(clock)); - for _ in 0..2000 { + for _ in 0..500 { let seed: u64 = rand::thread_rng().gen(); config.network = generate_network_opts(seed); diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 771d905c90..047b4be8fa 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -13,7 +13,7 @@ use desim::{ node_os::NodeOs, proto::{AnyMessage, NetEvent, NodeEvent}, }; -use hyper::Uri; +use http::Uri; use safekeeper::{ safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, state::{TimelinePersistentState, TimelineState}, diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 5359f586e4..877805f22e 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -10,13 +10,16 @@ bench = [] [dependencies] anyhow.workspace = true async-stream.workspace = true +bytes.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true futures.workspace = true futures-core.workspace = true futures-util.workspace = true humantime.workspace = true -hyper = { workspace = true, features = ["full"] } +hyper_1 = { workspace = true, features = ["full"] } +http-body-util.workspace = true +hyper-util = "0.1" once_cell.workspace = true parking_lot.workspace = true prost.workspace = true diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 15acd0e49c..f01e6adf5a 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -13,10 +13,13 @@ use clap::{command, Parser}; use futures_core::Stream; use futures_util::StreamExt; +use http_body_util::Full; use hyper::header::CONTENT_TYPE; -use hyper::server::conn::AddrStream; -use hyper::service::{make_service_fn, 
service_fn}; -use hyper::{Body, Method, StatusCode}; +use hyper::service::service_fn; +use hyper::{Method, StatusCode}; +use hyper_1 as hyper; +use hyper_1::body::Incoming; +use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use parking_lot::RwLock; use std::collections::HashMap; use std::convert::Infallible; @@ -24,9 +27,11 @@ use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; +use tokio::net::TcpListener; use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; use tokio::time; +use tonic::body::{self, empty_body, BoxBody}; use tonic::codegen::Service; use tonic::transport::server::Connected; use tonic::Code; @@ -45,9 +50,7 @@ use storage_broker::proto::{ FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, }; -use storage_broker::{ - parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, -}; +use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR}; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::sentry_init::init_sentry; @@ -599,8 +602,8 @@ impl BrokerService for Broker { // We serve only metrics and healthcheck through http1. 
async fn http1_handler( - req: hyper::Request, -) -> Result, Infallible> { + req: hyper::Request, +) -> Result, Infallible> { let resp = match (req.method(), req.uri().path()) { (&Method::GET, "/metrics") => { let mut buffer = vec![]; @@ -611,16 +614,16 @@ async fn http1_handler( hyper::Response::builder() .status(StatusCode::OK) .header(CONTENT_TYPE, encoder.format_type()) - .body(Body::from(buffer)) + .body(body::boxed(Full::new(bytes::Bytes::from(buffer)))) .unwrap() } (&Method::GET, "/status") => hyper::Response::builder() .status(StatusCode::OK) - .body(Body::empty()) + .body(empty_body()) .unwrap(), _ => hyper::Response::builder() .status(StatusCode::NOT_FOUND) - .body(Body::empty()) + .body(empty_body()) .unwrap(), }; Ok(resp) @@ -662,52 +665,74 @@ async fn main() -> Result<(), Box> { }; let storage_broker_server = BrokerServiceServer::new(storage_broker_impl); - info!("listening on {}", &args.listen_addr); - // grpc is served along with http1 for metrics on a single port, hence we // don't use tonic's Server. - hyper::Server::bind(&args.listen_addr) - .http2_keep_alive_interval(Some(args.http2_keepalive_interval)) - .serve(make_service_fn(move |conn: &AddrStream| { - let storage_broker_server_cloned = storage_broker_server.clone(); - let connect_info = conn.connect_info(); - async move { - Ok::<_, Infallible>(service_fn(move |mut req| { - // That's what tonic's MakeSvc.call does to pass conninfo to - // the request handler (and where its request.remote_addr() - // expects it to find). - req.extensions_mut().insert(connect_info.clone()); - - // Technically this second clone is not needed, but consume - // by async block is apparently unavoidable. BTW, error - // message is enigmatic, see - // https://github.com/rust-lang/rust/issues/68119 - // - // We could get away without async block at all, but then we - // need to resort to futures::Either to merge the result, - // which doesn't caress an eye as well. 
- let mut storage_broker_server_svc = storage_broker_server_cloned.clone(); - async move { - if req.headers().get("content-type").map(|x| x.as_bytes()) - == Some(b"application/grpc") - { - let res_resp = storage_broker_server_svc.call(req).await; - // Grpc and http1 handlers have slightly different - // Response types: it is UnsyncBoxBody for the - // former one (not sure why) and plain hyper::Body - // for the latter. Both implement HttpBody though, - // and EitherBody is used to merge them. - res_resp.map(|resp| resp.map(EitherBody::Left)) - } else { - let res_resp = http1_handler(req).await; - res_resp.map(|resp| resp.map(EitherBody::Right)) - } - } - })) + let tcp_listener = TcpListener::bind(&args.listen_addr).await?; + info!("listening on {}", &args.listen_addr); + loop { + let (stream, addr) = match tcp_listener.accept().await { + Ok(v) => v, + Err(e) => { + info!("couldn't accept connection: {e}"); + continue; } - })) - .await?; - Ok(()) + }; + + let mut builder = hyper_util::server::conn::auto::Builder::new(TokioExecutor::new()); + builder.http1().timer(TokioTimer::new()); + builder + .http2() + .timer(TokioTimer::new()) + .keep_alive_interval(Some(args.http2_keepalive_interval)); + + let storage_broker_server_cloned = storage_broker_server.clone(); + let connect_info = stream.connect_info(); + let service_fn_ = async move { + service_fn(move |mut req| { + // That's what tonic's MakeSvc.call does to pass conninfo to + // the request handler (and where its request.remote_addr() + // expects it to find). + req.extensions_mut().insert(connect_info.clone()); + + // Technically this second clone is not needed, but consume + // by async block is apparently unavoidable. BTW, error + // message is enigmatic, see + // https://github.com/rust-lang/rust/issues/68119 + // + // We could get away without async block at all, but then we + // need to resort to futures::Either to merge the result, + // which doesn't caress an eye as well. 
+ let mut storage_broker_server_svc = storage_broker_server_cloned.clone(); + async move { + if req.headers().get("content-type").map(|x| x.as_bytes()) + == Some(b"application/grpc") + { + let res_resp = storage_broker_server_svc.call(req).await; + // Grpc and http1 handlers have slightly different + // Response types: it is UnsyncBoxBody for the + // former one (not sure why) and plain hyper::Body + // for the latter. Both implement HttpBody though, + // and `Either` is used to merge them. + res_resp.map(|resp| resp.map(http_body_util::Either::Left)) + } else { + let res_resp = http1_handler(req).await; + res_resp.map(|resp| resp.map(http_body_util::Either::Right)) + } + } + }) + } + .await; + + tokio::task::spawn(async move { + let res = builder + .serve_connection(TokioIo::new(stream), service_fn_) + .await; + + if let Err(e) = res { + info!("error serving connection from {addr}: {e}"); + } + }); + } } #[cfg(test)] diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index aa5d0bad5f..f2ea0f0b2f 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,6 +1,4 @@ -use hyper::body::HttpBody; -use std::pin::Pin; -use std::task::{Context, Poll}; +use hyper_1 as hyper; use std::time::Duration; use tonic::codegen::StdError; use tonic::transport::{ClientTlsConfig, Endpoint}; @@ -94,56 +92,3 @@ pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result; - -// Provides impl HttpBody for two different types implementing it. 
Inspired by -// https://github.com/hyperium/tonic/blob/master/examples/src/hyper_warp/server.rs -pub enum EitherBody { - Left(A), - Right(B), -} - -impl HttpBody for EitherBody -where - A: HttpBody + Send + Unpin, - B: HttpBody + Send + Unpin, - A::Error: Into, - B::Error: Into, -{ - type Data = A::Data; - type Error = Box; - - fn is_end_stream(&self) -> bool { - match self { - EitherBody::Left(b) => b.is_end_stream(), - EitherBody::Right(b) => b.is_end_stream(), - } - } - - fn poll_data( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - match self.get_mut() { - EitherBody::Left(b) => Pin::new(b).poll_data(cx).map(map_option_err), - EitherBody::Right(b) => Pin::new(b).poll_data(cx).map(map_option_err), - } - } - - fn poll_trailers( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll, Self::Error>> { - match self.get_mut() { - EitherBody::Left(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into), - EitherBody::Right(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into), - } - } -} - -fn map_option_err>(err: Option>) -> Option> { - err.map(|e| e.map_err(Into::into)) -} diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 00e90f4467..62cb0e9e5d 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, - MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -108,6 +108,9 @@ struct Cli { // Period with which to send heartbeats 
to registered nodes #[arg(long)] heartbeat_interval: Option, + + #[arg(long)] + long_reconcile_threshold: Option, } enum StrictMode { @@ -293,6 +296,10 @@ async fn async_main() -> anyhow::Result<()> { .heartbeat_interval .map(humantime::Duration::into) .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), + long_reconcile_threshold: args + .long_reconcile_threshold + .map(humantime::Duration::into) + .unwrap_or(LONG_RECONCILE_THRESHOLD_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 5cfcfb4b1f..5989aeba91 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -87,6 +87,10 @@ pub(crate) struct StorageControllerMetricGroup { measured::HistogramVec, pub(crate) storage_controller_leadership_status: measured::GaugeVec, + + /// HTTP request status counters for handled requests + pub(crate) storage_controller_reconcile_long_running: + measured::CounterVec, } impl StorageControllerMetrics { @@ -168,6 +172,17 @@ pub(crate) struct LeadershipStatusGroup { pub(crate) status: LeadershipStatus, } +#[derive(measured::LabelGroup, Clone)] +#[label(set = ReconcileLongRunningLabelGroupSet)] +pub(crate) struct ReconcileLongRunningLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) tenant_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) shard_number: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) sequence: &'a str, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 2c42da4043..4864a021fe 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -454,7 +454,7 @@ impl Reconciler { Ok(l) => l, 
Err(e) => { tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); - std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(500)).await; continue; } }; @@ -469,10 +469,7 @@ impl Reconciler { } } None => { - // Expected timeline isn't yet visible on migration destination. - // (IRL we would have to account for timeline deletion, but this - // is just test helper) - any_behind = true; + // Timeline was deleted in the meantime - ignore it } } } @@ -481,7 +478,7 @@ impl Reconciler { tracing::info!("✅ LSN caught up. Proceeding..."); break; } else { - std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(500)).await; } } @@ -562,6 +559,8 @@ impl Reconciler { self.location_config(&dest_ps, dest_conf, None, false) .await?; + pausable_failpoint!("reconciler-live-migrate-pre-await-lsn"); + if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) @@ -572,30 +571,7 @@ impl Reconciler { // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. - let mut notify_attempts = 0; - while let Err(e) = self.compute_notify().await { - match e { - NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), - NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), - _ => { - tracing::warn!( - "Live migration blocked by compute notification error, retrying: {e}" - ); - } - } - - exponential_backoff( - notify_attempts, - // Generous waits: control plane operations which might be blocking us usually complete on the order - // of hundreds to thousands of milliseconds, so no point busy polling. 
- 1.0, - 10.0, - &self.cancel, - ) - .await; - notify_attempts += 1; - } - + self.compute_notify_blocking(&origin_ps).await?; pausable_failpoint!("reconciler-live-migrate-post-notify"); // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then @@ -869,6 +845,117 @@ impl Reconciler { Ok(()) } } + + /// Keep trying to notify the compute indefinitely, only dropping out if: + /// - the node `origin` becomes unavailable -> Ok(()) + /// - the node `origin` no longer has our tenant shard attached -> Ok(()) + /// - our cancellation token fires -> Err(ReconcileError::Cancelled) + /// + /// This is used during live migration, where we do not wish to detach + /// an origin location until the compute definitely knows about the new + /// location. + /// + /// In cases where the origin node becomes unavailable, we return success, indicating + /// to the caller that they should continue irrespective of whether the compute was notified, + /// because the origin node is unusable anyway. Notification will be retried later via the + /// [`Self::compute_notify_failure`] flag. + async fn compute_notify_blocking(&mut self, origin: &Node) -> Result<(), ReconcileError> { + let mut notify_attempts = 0; + while let Err(e) = self.compute_notify().await { + match e { + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), + _ => { + tracing::warn!( + "Live migration blocked by compute notification error, retrying: {e}" + ); + } + } + + // Did the origin pageserver become unavailable? + if !origin.is_available() { + tracing::info!("Giving up on compute notification because {origin} is unavailable"); + break; + } + + // Does the origin pageserver still host the shard we are interested in? We should only + // continue waiting for compute notification to be acked if the old location is still usable. 
+ let tenant_shard_id = self.tenant_shard_id; + match origin + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(Some(location_conf))) => { + if matches!( + location_conf.mode, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale + ) { + tracing::debug!( + "Still attached to {origin}, will wait & retry compute notification" + ); + } else { + tracing::info!( + "Giving up on compute notification because {origin} is in state {:?}", + location_conf.mode + ); + return Ok(()); + } + // Fall through + } + Some(Ok(None)) => { + tracing::info!( + "No longer attached to {origin}, giving up on compute notification" + ); + return Ok(()); + } + Some(Err(e)) => { + match e { + mgmt_api::Error::Cancelled => { + tracing::info!( + "Giving up on compute notification because {origin} is unavailable" + ); + return Ok(()); + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _) => { + tracing::info!( + "No longer attached to {origin}, giving up on compute notification" + ); + return Ok(()); + } + e => { + // Other API errors are unexpected here. + tracing::warn!("Unexpected error checking location on {origin}: {e}"); + + // Fall through, we will retry compute notification. + } + } + } + None => return Err(ReconcileError::Cancel), + }; + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. 
+ 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; + } + + Ok(()) + } } /// We tweak the externally-set TenantConfig while configuring diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a5e0129684..bd5759422c 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -128,6 +128,9 @@ pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); /// How often to send heartbeats to registered nodes? pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); +/// How long is too long for a reconciliation? +pub const LONG_RECONCILE_THRESHOLD_DEFAULT: Duration = Duration::from_secs(120); + #[derive(Clone, strum_macros::Display)] enum TenantOperations { Create, @@ -348,6 +351,8 @@ pub struct Config { pub start_as_candidate: bool, pub http_service_port: i32, + + pub long_reconcile_threshold: Duration, } impl From for ApiError { @@ -4974,7 +4979,12 @@ impl Service { { let mut nodes_mut = (**nodes).clone(); - nodes_mut.remove(&node_id); + if let Some(mut removed_node) = nodes_mut.remove(&node_id) { + // Ensure that any reconciler holding an Arc<> to this node will + // drop out when trying to RPC to it (setting Offline state sets the + // cancellation token on the Node object). 
+ removed_node.set_availability(NodeAvailability::Offline); + } *nodes = Arc::new(nodes_mut); } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index afc89eae00..953c73119b 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -5,7 +5,9 @@ use std::{ }; use crate::{ - metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, + metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, + }, persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{ @@ -14,6 +16,7 @@ use crate::{ }, service::ReconcileResultRequest, }; +use futures::future::{self, Either}; use pageserver_api::controller_api::{ AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, }; @@ -1083,6 +1086,47 @@ impl TenantShard { } } + async fn reconcile( + sequence: Sequence, + mut reconciler: Reconciler, + must_notify: bool, + ) -> ReconcileResult { + // Attempt to make observed state match intent state + let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + + // Update result counter + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); + + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our 
result. + ReconcileResult { + sequence, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + } + } + #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( @@ -1122,7 +1166,7 @@ impl TenantShard { let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); - let mut reconciler = Reconciler { + let reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, placement_policy: self.policy.clone(), @@ -1142,6 +1186,7 @@ impl TenantShard { }; let reconcile_seq = self.sequence; + let long_reconcile_threshold = service_config.long_reconcile_threshold; tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); let must_notify = self.pending_compute_notification; @@ -1178,41 +1223,55 @@ impl TenantShard { return; } - // Attempt to make observed state match intent state - let result = reconciler.reconcile().await; + let (tenant_id_label, shard_number_label, sequence_label) = { + ( + reconciler.tenant_shard_id.tenant_id.to_string(), + reconciler.tenant_shard_id.shard_number.0.to_string(), + reconcile_seq.to_string(), + ) + }; - // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work - if result.is_ok() && must_notify { - // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] - reconciler.compute_notify().await.ok(); + let label_group = ReconcileLongRunningLabelGroup { + tenant_id: &tenant_id_label, + shard_number: &shard_number_label, + sequence: &sequence_label, + }; + + let reconcile_fut = Self::reconcile(reconcile_seq, reconciler, 
must_notify); + let long_reconcile_fut = { + let label_group = label_group.clone(); + async move { + tokio::time::sleep(long_reconcile_threshold).await; + + tracing::warn!("Reconcile passed the long running threshold of {long_reconcile_threshold:?}"); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_long_running + .inc(label_group); + } + }; + + let reconcile_fut = std::pin::pin!(reconcile_fut); + let long_reconcile_fut = std::pin::pin!(long_reconcile_fut); + + let (was_long, result) = + match future::select(reconcile_fut, long_reconcile_fut).await { + Either::Left((reconcile_result, _)) => (false, reconcile_result), + Either::Right((_, reconcile_fut)) => (true, reconcile_fut.await), + }; + + if was_long { + let id = metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_long_running + .with_labels(label_group); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_long_running + .remove_metric(id); } - // Update result counter - let outcome_label = match &result { - Ok(_) => ReconcileOutcome::Success, - Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, - Err(_) => ReconcileOutcome::Error, - }; - - metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_reconcile_complete - .inc(ReconcileCompleteLabelGroup { - status: outcome_label, - }); - - // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might - // try and schedule more work in response to our result. 
- let result = ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }; - result_tx .send(ReconcileResultRequest::ReconcileResult(result)) .ok(); diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index c96d9cad3b..1e69ddbf15 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -4,7 +4,7 @@ use std::time::Duration; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; @@ -18,6 +18,7 @@ use serde::Serialize; use storage_controller_client::control_api; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; +use utils::backoff; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; @@ -326,15 +327,25 @@ async fn maybe_delete_index( } // All validations passed: erase the object - match remote_client - .delete(&obj.key, &CancellationToken::new()) - .await + let cancel = CancellationToken::new(); + match backoff::retry( + || remote_client.delete(&obj.key, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "maybe_delete_index", + &cancel, + ) + .await { - Ok(_) => { + None => { + unreachable!("Using a dummy cancellation token"); + } + Some(Ok(_)) => { tracing::info!("Successfully deleted index"); summary.indices_deleted += 1; } - Err(e) => { + 
Some(Err(e)) => { tracing::warn!("Failed to delete index: {e}"); summary.remote_storage_errors += 1; } diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 770b32b11e..fb9c2d2b86 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -340,23 +340,27 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare: @pytest.fixture(scope="function", autouse=True) -def sync_after_each_test(): - # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true` +def sync_between_tests(): + # The fixture calls `sync(2)` after each test if `SYNC_BETWEEN_TESTS` env var is `true` # - # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`) + # In CI, `SYNC_BETWEEN_TESTS` is set to `true` only for benchmarks (`test_runner/performance`) # that are run on self-hosted runners because some of these tests are pretty write-heavy # and create issues to start the processes within 10s - key = "SYNC_AFTER_EACH_TEST" + key = "SYNC_BETWEEN_TESTS" enabled = os.environ.get(key) == "true" + if enabled: + start = time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync before test {elapsed=}") + yield - if not enabled: - # regress test, or running locally - return - - start = time.time() - # we only run benches on unices, the method might not exist on windows - os.sync() - elapsed = time.time() - start - log.info(f"called sync after test {elapsed=}") + if enabled: + start = time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync after test {elapsed=}") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 70fe632f49..6a53a34bc9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ 
b/test_runner/fixtures/neon_fixtures.py @@ -20,7 +20,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum from fcntl import LOCK_EX, LOCK_UN, flock -from functools import cached_property, partial +from functools import cached_property from itertools import chain, product from pathlib import Path from types import TracebackType @@ -86,7 +86,7 @@ from fixtures.remote_storage import ( remote_storage_to_toml_dict, ) from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.safekeeper.utils import are_walreceivers_absent +from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, @@ -950,9 +950,6 @@ class NeonEnv: safekeepers - An array containing objects representing the safekeepers - pg_bin - pg_bin.run() can be used to execute Postgres client binaries, - like psql or pg_dump - initial_tenant - tenant ID of the initial tenant created in the repository neon_cli - can be used to run the 'neon' CLI tool @@ -3300,6 +3297,8 @@ class PgBin: @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: + """pg_bin.run() can be used to execute Postgres client binaries, like psql or pg_dump""" + return PgBin(test_output_dir, pg_distrib_dir, pg_version) @@ -3311,7 +3310,7 @@ class VanillaPostgres(PgProtocol): self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) + self.pg_bin.run_capture(["initdb", "--pgdata", str(pgdatadir)]) self.configure([f"port = {port}\n"]) def enable_tls(self): @@ -4101,12 +4100,26 @@ class Endpoint(PgProtocol, LogUtils): with open(remote_extensions_spec_path, "w") as file: json.dump(spec, file, indent=4) - def stop(self, mode: str = "fast") -> "Endpoint": + def stop( + self, + mode: str = "fast", + sks_wait_walreceiver_gone: Optional[tuple[List[Safekeeper], TimelineId]] = None, + ) -> "Endpoint": """ Stop 
the Postgres instance if it's running. - Because test teardown might try and stop an endpoint concurrently with test code - stopping the endpoint, this method is thread safe + Because test teardown might try and stop an endpoint concurrently with + test code stopping the endpoint, this method is thread safe + + If sks_wait_walreceiever_gone is not None, wait for the safekeepers in + this list to have no walreceivers, i.e. compute endpoint connection be + gone. When endpoint is stopped in immediate mode and started again this + avoids race of old connection delivering some data after + sync-safekeepers check, which makes basebackup unusable. TimelineId is + needed because endpoint doesn't know it. + + A better solution would be bump term when sync-safekeepers is skipped on + start, see #9079. Returns self. """ @@ -4118,6 +4131,11 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, check_return_code=self.check_stop_result, mode=mode ) + if sks_wait_walreceiver_gone is not None: + for sk in sks_wait_walreceiver_gone[0]: + cli = sk.http_client() + wait_walreceivers_absent(cli, self.tenant_id, sks_wait_walreceiver_gone[1]) + return self def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint": @@ -5210,7 +5228,7 @@ def flush_ep_to_pageserver( for sk in env.safekeepers: cli = sk.http_client() # wait until compute connections are gone - wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline)) + wait_walreceivers_absent(cli, tenant, timeline) commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn) # Note: depending on WAL filtering implementation, probably most shards diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0dd557c59f..49ad54d456 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -586,6 +586,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, 
force_image_layer_creation=False, + force_l0_compaction=False, wait_until_uploaded=False, enhanced_gc_bottom_most_compaction=False, ): @@ -595,6 +596,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if force_l0_compaction: + query["force_l0_compaction"] = "true" if wait_until_uploaded: query["wait_until_uploaded"] = "true" if enhanced_gc_bottom_most_compaction: @@ -701,6 +704,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + force_l0_compaction=False, wait_until_uploaded=False, compact: Optional[bool] = None, **kwargs, @@ -711,6 +715,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if force_l0_compaction: + query["force_l0_compaction"] = "true" if wait_until_uploaded: query["wait_until_uploaded"] = "true" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 96c84d1616..7f170eeea3 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -8,6 +8,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics +from fixtures.utils import wait_until # Walreceiver as returned by sk's timeline status endpoint. @@ -161,6 +162,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): walreceivers=walreceivers, ) + # Get timeline_start_lsn, waiting until it's nonzero. It is a way to ensure + # that the timeline is fully initialized at the safekeeper. 
+ def get_non_zero_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + def timeline_start_lsn_non_zero() -> Lsn: + s = self.timeline_status(tenant_id, timeline_id).timeline_start_lsn + assert s > Lsn(0) + return s + + return wait_until(30, 1, timeline_start_lsn_non_zero) + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 0e4b5d7883..2a081c6ccb 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -1,11 +1,20 @@ from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.utils import wait_until -def are_walreceivers_absent( +def wait_walreceivers_absent( sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): - status = sk_http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") - return len(status.walreceivers) == 0 + """ + Wait until there is no walreceiver connections from the compute(s) on the + safekeeper. 
+ """ + + def walreceivers_absent(): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + assert len(status.walreceivers) == 0 + + wait_until(30, 0.5, walreceivers_absent) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 065a78bf9b..1ea0267e87 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -175,7 +175,9 @@ class Workload: if upload: # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) ps_http.timeline_checkpoint( - tenant_shard_id, self.timeline_id, wait_until_uploaded=True + tenant_shard_id, + self.timeline_id, + wait_until_uploaded=True, ) log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") else: diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index d7c4cf059a..43140c05ff 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -53,7 +53,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - tenant, _ = env.neon_cli.create_tenant( + tenant, timeline_main = env.neon_cli.create_tenant( conf={ # disable background GC "gc_period": "0s", @@ -70,8 +70,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): } ) - timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant) - endpoint_main = env.endpoints.create_start("test_main", tenant_id=tenant) + endpoint_main = env.endpoints.create_start("main", tenant_id=tenant) main_cur = endpoint_main.connect().cursor() @@ -92,7 +91,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( - "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 + "test_branch", 
ancestor_branch_name="main", ancestor_start_lsn=lsn1, tenant_id=tenant ) endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index cb34551b53..98bd3a6a5f 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -63,7 +63,10 @@ page_cache_size=10 log.info(f"Running churn round {i}/{churn_rounds} ...") workload.churn_rows(row_count, env.pageserver.id) - ps_http.timeline_compact(tenant_id, timeline_id) + # Force L0 compaction to ensure the number of layers is within bounds; we don't want to count L0 layers + # in this benchmark. In other words, this smoke test ensures number of L1 layers are bound. + ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) + assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1 log.info("Validating at workload end ...") workload.validate(env.pageserver.id) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index fb5c1d3115..0669105625 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -156,9 +156,6 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_ @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") -@skip_on_postgres( - PgVersion.V17, "There are no snapshots yet" -) # TODO: revert this once we have snapshots def test_backward_compatibility( neon_env_builder: 
NeonEnvBuilder, test_output_dir: Path, @@ -206,9 +203,6 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") -@skip_on_postgres( - PgVersion.V17, "There are no snapshots yet" -) # TODO: revert this once we have snapshots def test_forward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -258,7 +252,7 @@ def test_forward_compatibility( # not using env.pageserver.version because it was initialized before prev_pageserver_version_str = env.get_binary_version("pageserver") prev_pageserver_version_match = re.search( - "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + "Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)", prev_pageserver_version_str, ) if prev_pageserver_version_match is not None: @@ -269,12 +263,12 @@ def test_forward_compatibility( ) # does not include logs from previous runs - assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") env.start() # ensure the specified pageserver is running - assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") check_neon_works( env, diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index b65430ff49..96543f1ef5 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -31,9 +31,7 @@ def helper_compare_timeline_list( ) ) - timelines_cli = env.neon_cli.list_timelines() - assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) - + timelines_cli = env.neon_cli.list_timelines(initial_tenant) cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) assert timelines_api == cli_timeline_ids diff --git a/test_runner/regress/test_neon_extension.py 
b/test_runner/regress/test_neon_extension.py index 22a6013225..619fd83c9b 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.4",) + assert cur.fetchone() == ("1.5",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -48,7 +48,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.4",) + assert cur.fetchone() == ("1.5",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.5" diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 51e847135e..cac74492d7 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -435,7 +435,9 @@ $$; # Wait until pageserver has received all the data, and restart the endpoint wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) - endpoint.stop(mode="immediate") # 'immediate' to avoid writing shutdown checkpoint + endpoint.stop( + mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id) + ) # 'immediate' to avoid writing shutdown checkpoint endpoint.start() # Check that the next-multixid value wrapped around correctly diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 519994f774..96521b5684 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -549,6 +549,14 @@ def 
test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # Instruct the storage controller to not interfere with our low level configuration + # of the pageserver's attachment states. Otherwise when it sees nodes go offline+return, + # it would send its own requests that would conflict with the test's. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.extend( + [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"] + ) + # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index bbf82fea4c..bd47a30428 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -174,8 +174,7 @@ def test_pageserver_chaos( "checkpoint_distance": "5000000", } ) - env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_pageserver_chaos", tenant_id=tenant) + endpoint = env.endpoints.create_start("main", tenant_id=tenant) # Create table, and insert some rows. 
Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 5e8b8d38f7..b08fcc0da1 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -27,7 +27,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.extend( [ ".*basebackup .* failed: invalid basebackup lsn.*", - ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected", + ".*/lsn_lease.*invalid lsn lease request.*", ] ) @@ -108,7 +108,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): assert cur.fetchone() == (1,) # Create node at pre-initdb lsn - with pytest.raises(Exception, match="invalid basebackup lsn"): + with pytest.raises(Exception, match="invalid lsn lease request"): # compute node startup with invalid LSN should fail env.endpoints.create_start( branch_name="main", @@ -167,6 +167,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) return last_flush_lsn + def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint): + """ + Trigger GC manually on all pageservers. Then run an `SELECT` query. + """ + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert ( + gc_result["layers_removed"] == 0 + ), "No layers should be removed, old layers are guarded by leases." 
+ + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + # Insert some records on main branch with env.endpoints.create_start("main") as ep_main: with ep_main.cursor() as cur: @@ -193,25 +210,31 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): generate_updates_on_main(env, ep_main, i, end=100) - # Trigger GC - for shard, ps in tenant_get_shards(env, env.initial_tenant): - client = ps.http_client() - gc_result = client.timeline_gc(shard, env.initial_timeline, 0) - log.info(f"{gc_result=}") + trigger_gc_and_select(env, ep_static) - assert ( - gc_result["layers_removed"] == 0 - ), "No layers should be removed, old layers are guarded by leases." + # Trigger Pageserver restarts + for ps in env.pageservers: + ps.stop() + # Static compute should have at least one lease request failure due to connection. + time.sleep(LSN_LEASE_LENGTH / 2) + ps.start() - with ep_static.cursor() as cur: - cur.execute("SELECT count(*) FROM t0") - assert cur.fetchone() == (ROW_COUNT,) + trigger_gc_and_select(env, ep_static) + + # Reconfigure pageservers + env.pageservers[0].stop() + env.storage_controller.node_configure( + env.pageservers[0].id, {"availability": "Offline"} + ) + env.storage_controller.reconcile_until_idle() + + trigger_gc_and_select(env, ep_static) # Do some update so we can increment latest_gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) # Wait for the existing lease to expire. - time.sleep(LSN_LEASE_LENGTH) + time.sleep(LSN_LEASE_LENGTH + 1) # Now trigger GC again, layers should be removed. 
for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 0d95109d6b..d5e92b92d1 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -103,6 +103,7 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): # Initialize the primary, a test table, and a helper function to create lots # of subtransactions. env = neon_simple_env + timeline_id = env.initial_timeline primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") primary_conn = primary.connect() primary_cur = primary_conn.cursor() @@ -114,7 +115,7 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): # chance to write abort records for them. primary_cur.execute("begin") primary_cur.execute("select create_subxacts(100000)") - primary.stop(mode="immediate") + primary.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) # Restart the primary. 
Do some light work, and shut it down cleanly primary.start() @@ -659,6 +660,7 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): # Initialize the primary and a test table env = neon_simple_env + timeline_id = env.initial_timeline primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") with primary.cursor() as primary_cur: primary_cur.execute("create table t(pk serial primary key, payload integer)") @@ -667,7 +669,7 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): with primary.cursor() as primary_cur: primary_cur.execute("insert into t (payload) values (0)") # restart primary - primary.stop("immediate") + primary.stop("immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) primary.start() # Wait for the WAL to be flushed diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 3861f0b822..11c743e8a0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -567,6 +567,149 @@ def test_storage_controller_compute_hook( env.storage_controller.consistency_check() +def test_storage_controller_stuck_compute_hook( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + Test the migration process's behavior when the compute hook does not enable it to proceed + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = 
neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + # Do a migration while the compute hook is returning 423 status + tenant_id = env.initial_tenant + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + dest_pageserver = env.get_pageserver(dest_ps_id) + shard_0_id = TenantShardId(tenant_id, 0, 0) + + NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" + env.storage_controller.allowed_errors.extend( + [ + NOTIFY_BLOCKED_LOG, + ".*Failed to notify compute.*", + ".*Reconcile error.*Cancelled", + ".*Reconcile error.*Control plane tenant busy", + ] + ) + + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + # We expect the controller to hit the 423 (locked) and retry. Migration shouldn't complete until that + # status is cleared. 
+ handle_params["status"] = 423 + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, shard_0_id, dest_ps_id + ) + + def logged_stuck(): + env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) + + wait_until(10, 0.25, logged_stuck) + contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) + assert contains_r is not None # Appease mypy + (_, log_cursor) = contains_r + assert migrate_fut.running() + + # Permit the compute hook to proceed + handle_params["status"] = 200 + migrate_fut.result(timeout=10) + + # Advance log cursor past the last 'stuck' message (we already waited for one, but + # there could be more than one) + while True: + contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) + if contains_r is None: + break + else: + (_, log_cursor) = contains_r + + # Now, do a migration in the opposite direction + handle_params["status"] = 423 + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, shard_0_id, origin_pageserver.id + ) + + def logged_stuck_again(): + env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) + + wait_until(10, 0.25, logged_stuck_again) + assert migrate_fut.running() + + # This time, the compute hook remains stuck, but we mark the origin node offline: this should + # also allow the migration to complete -- we only wait for the compute hook as long as we think + # the old location is still usable for computes. + # This is a regression test for issue https://github.com/neondatabase/neon/issues/8901 + dest_pageserver.stop() + env.storage_controller.node_configure(dest_ps_id, {"availability": "Offline"}) + + try: + migrate_fut.result(timeout=10) + except StorageControllerApiException as e: + # The reconciler will fail because it can't detach from the origin: the important + # thing is that it finishes, rather than getting stuck in the compute notify loop. 
+ assert "Reconcile error" in str(e) + + # A later background reconciliation will clean up and leave things in a neat state, even + # while the compute hook is still blocked + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # We expect that the reconciler will do its work, but be unable to fully succeed + # because it can't send a compute notification. It will complete, but leave + # the internal flag set for "retry compute notification later" + assert "Control plane tenant busy" in str(e) + + # Confirm that we are AttachedSingle on the node we last called the migrate API for + loc = origin_pageserver.http_client().tenant_get_location(shard_0_id) + assert loc["mode"] == "AttachedSingle" + + # When the origin node comes back, it should get cleaned up + dest_pageserver.start() + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # Compute hook is still blocked: reconciler will configure PS but not fully succeed + assert "Control plane tenant busy" in str(e) + + with pytest.raises(PageserverApiException, match="Tenant shard not found"): + dest_pageserver.http_client().tenant_get_location(shard_0_id) + + # Once the compute hook is unblocked, we should be able to get into a totally + # quiescent state again + handle_params["status"] = 200 + env.storage_controller.reconcile_until_idle() + + env.storage_controller.consistency_check() + + def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. 
This is a lightweight test @@ -2470,6 +2613,9 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB class MigrationFailpoints(Enum): # While only the origin is attached PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc" + # While only the origin is attached and the db was updated to + # point to the new location + PRE_AWAIT_LSN = "reconciler-live-migrate-pre-await-lsn" # While both locations are attached POST_NOTIFY = "reconciler-live-migrate-post-notify" # While only the destination is attached @@ -2495,6 +2641,12 @@ def test_storage_controller_proxy_during_migration( """ neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.storage_controller_config = { + # Publish long reconcile metric early + "long_reconcile_threshold": "5s", + } + env = neon_env_builder.init_configs() env.start() @@ -2502,6 +2654,12 @@ def test_storage_controller_proxy_during_migration( timeline_id = env.initial_timeline env.neon_cli.create_tenant(tenant_id, timeline_id) + # The test stalls a reconcile on purpose to check if the long running + # reconcile alert fires. + env.storage_controller.allowed_errors.extend( + [".*Reconcile passed the long running threshold.*"] + ) + # Activate a failpoint that will cause live migration to get stuck _after_ the generation has been issued # to the new pageserver: this should result in requests routed to the new pageserver. 
env.storage_controller.configure_failpoints((migration_failpoint.value, "pause")) @@ -2509,6 +2667,24 @@ def test_storage_controller_proxy_during_migration( origin_pageserver = env.get_tenant_pageserver(tenant_id) dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + def long_migration_metric_published(): + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_long_running_total", + filter={"tenant_id": str(tenant_id), "shard_number": "0"}, + ) + == 1 + ) + + def assert_long_migration_metric_not_published(): + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_long_running_total", + filter={"tenant_id": str(tenant_id), "shard_number": "0"}, + ) + is None + ) + try: with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: migrate_fut = executor.submit( @@ -2539,9 +2715,14 @@ def test_storage_controller_proxy_during_migration( # We expect request to land on the origin assert tenant_info["generation"] == 1 + wait_until(10, 1, long_migration_metric_published) + # Eventually migration completes env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) migrate_fut.result() + + assert_long_migration_metric_not_published() + except: # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) @@ -2664,3 +2845,77 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): attached_to = shard["node_attached"] expected_az = env.get_pageserver(attached_to).az_id assert shard["preferred_az_id"] == expected_az + + +@run_only_on_default_postgres("Postgres version makes no difference here") +@pytest.mark.parametrize( + "migration_failpoint", + [ + MigrationFailpoints.PRE_GENERATION_INC, + MigrationFailpoints.PRE_AWAIT_LSN, + MigrationFailpoints.POST_NOTIFY, + MigrationFailpoints.POST_DETACH, + ], +) +def 
test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, migration_failpoint): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1}) + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + + shard_zero = TenantShardId(tenant_id, 0, 0) + locations = env.storage_controller.get_tenants_placement()[str(shard_zero)] + + assert locations["observed"] == locations["intent"] + assert locations["observed"]["attached"] is not None + assert len(locations["observed"]["secondary"]) > 0 + + attached_location = locations["observed"]["attached"] + secondary_location = locations["observed"]["secondary"][0] + + env.storage_controller.configure_failpoints((migration_failpoint.value, "pause")) + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + shard_zero, + secondary_location, + ) + + def has_hit_migration_failpoint(): + expr = f"at failpoint {migration_failpoint.value}" + log.info(expr) + assert env.storage_controller.log_contains(expr) + + wait_until(10, 1, has_hit_migration_failpoint) + + env.storage_controller.pageserver_api().timeline_delete( + tenant_id=tenant_id, timeline_id=timeline_id + ) + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + migrate_fut.result() + + # Ensure that we detached from the old attached location + with pytest.raises(PageserverApiException) as exc: + env.get_pageserver(attached_location).http_client().timeline_list(tenant_id) + assert exc.value.status_code == 404 + + # Ensure the timeline is not present on the new attached location + client = 
env.get_pageserver(secondary_location).http_client() + assert timeline_id not in { + TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id) + }, f"deleted timeline found on {secondary_location}" + + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + raise diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 91caad7220..647a2e6b14 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -13,7 +13,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pub = env.endpoints.create("publisher") pub.start() - env.neon_cli.create_branch("subscriber") + sub_timeline_id = env.neon_cli.create_branch("subscriber") sub = env.endpoints.create("subscriber") sub.start() @@ -47,7 +47,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): for _ in range(n_restarts): # restart subscriber # time.sleep(2) - sub.stop("immediate") + sub.stop("immediate", sks_wait_walreceiver_gone=(env.safekeepers, sub_timeline_id)) sub.start() thread.join() diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index f872116a1c..609987ab0c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -27,20 +27,15 @@ def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, timeline_id) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) # we should never have zero, because there should be the initdb "changes" assert initial_size > 0, "initial implementation returns ~initdb tenant_size" - main_branch_name = "main" - - branch_name, main_timeline_id = 
env.neon_cli.list_timelines(tenant_id)[0] - assert branch_name == main_branch_name - endpoint = env.endpoints.create_start( - main_branch_name, + "main", tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], ) @@ -54,7 +49,7 @@ def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): # The transaction above will make the compute generate a checkpoint. # In turn, the pageserver persists the checkpoint. This should only be # one key with a size of a couple hundred bytes. - wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) size = http_client.tenant_size(tenant_id) assert size >= initial_size and size - initial_size < 1024 @@ -306,7 +301,8 @@ def test_single_branch_get_tenant_size_grows( env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant - branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + timeline_id = env.initial_timeline + branch_name = "main" http_client = env.pageserver.http_client() @@ -516,7 +512,8 @@ def test_get_tenant_size_with_multiple_branches( env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*") tenant_id = env.initial_tenant - main_branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + main_timeline_id = env.initial_timeline + main_branch_name = "main" http_client = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 168876b711..6ecc903192 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -71,10 +71,9 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": "5000000", } ) - env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant) endpoint = env.endpoints.create_start( - 
"test_tenants_many", + "main", tenant_id=tenant, ) tenants_endpoints.append((tenant, endpoint)) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 711fcd5016..edb32cd2b4 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -638,7 +638,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): wait_until(50, 0.1, first_request_finished) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) def test_timeline_delete_works_for_remote_smoke( diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 765c72cf2a..ddfe9b911f 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -45,10 +45,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool tenant_after = http.tenant_status(env.initial_tenant) assert tenant_before != tenant_after gc_blocking = tenant_after["gc_blocking"] - assert ( - gc_blocking - == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }" - ) + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" wait_for_another_gc_round() pss.assert_log_contains(gc_skipped_line) diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index 52f125ce0b..bfa9ce5db7 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -26,8 +26,7 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): } ) - env.neon_cli.create_timeline("test_truncate", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_truncate", tenant_id=tenant) + endpoint = 
env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() cur.execute("create table t1(x integer)") cur.execute(f"insert into t1 values (generate_series(1,{n_records}))") diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 3075211ada..ae1b6fdab3 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -247,7 +247,7 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): # in a "clean" way. Our neon extension will write a full-page image of the VM # page, and we want to avoid that. A clean shutdown will also not do, for the # same reason. - endpoint.stop(mode="immediate") + endpoint.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) endpoint.start() pg_conn = endpoint.connect() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8ee548bdb0..25c66c3cae 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -47,7 +47,7 @@ from fixtures.remote_storage import ( s3_storage, ) from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.safekeeper.utils import are_walreceivers_absent +from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, get_dir_size, @@ -772,7 +772,7 @@ class ProposerPostgres(PgProtocol): def initdb(self): """Run initdb""" - args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] + args = ["initdb", "--username", "cloud_admin", "--pgdata", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): @@ -1061,6 +1061,7 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): # https://github.com/neondatabase/neon/issues/8911 def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + timeline_id = env.initial_timeline endpoint = 
env.endpoints.create_start("main") @@ -1070,7 +1071,7 @@ def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): # we want immediate shutdown to have endpoint restart on xlog switch record, # so prevent shutdown checkpoint. - endpoint.stop(mode="immediate") + endpoint.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) endpoint = env.endpoints.create_start("main") endpoint.safe_psql("SELECT 'works'") @@ -1222,10 +1223,7 @@ def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks): # Even if there is no compute, there might be some in flight data; ensure # all walreceivers die before rechecking. for sk_http_cli in sk_http_clis: - wait( - partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id), - "walreceivers to be gone", - ) + wait_walreceivers_absent(sk_http_cli, tenant_id, timeline_id) # Now recheck again flush_lsn and exit if it is good if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): return @@ -2084,8 +2082,13 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): endpoint.safe_psql("create table t(key int, value text)") - timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id) - timeline_start_lsn = timeline_status.timeline_start_lsn + # Note: currently timelines on sks are created by compute and commit of + # transaction above is finished when 2/3 sks received it, so there is a + # small chance that timeline on this sk is not created/initialized yet, + # hence the usage of waiting function to prevent flakiness. 
+ timeline_start_lsn = ( + env.safekeepers[0].http_client().get_non_zero_timeline_start_lsn(tenant_id, timeline_id) + ) log.info(f"Timeline start LSN: {timeline_start_lsn}") current_percent = 0.0 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e6d21e9434..ac4857bc50 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -23,7 +23,6 @@ aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "si aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } -axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } @@ -31,7 +30,6 @@ camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } -crossbeam-utils = { version = "0.8" } crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } @@ -49,10 +47,11 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } -hyper = { 
version = "0.14", features = ["full"] } +hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } +hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } +hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -63,13 +62,12 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -prost = { version = "0.11" } +prost = { version = "0.13", features = ["prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } -reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "rustls-tls", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = 
["alloc", "derive"] } @@ -83,15 +81,13 @@ sync_wrapper = { version = "0.1", default-features = false, features = ["futures tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } -tokio-rustls = { version = "0.24" } +tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.9", features = ["tls-roots"] } -tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } +tonic = { version = "0.12", features = ["tls-roots"] } +tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] } -tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } @@ -110,9 +106,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } -lazy_static = { version = "1", default-features = false, features = 
["spin_no_std"] } +itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -122,8 +116,9 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } +prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } -prost = { version = "0.11" } +prost = { version = "0.13", features = ["prost-derive"] } quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }