diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index a52e43b4da..d60f97320b 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -3,19 +3,23 @@ name: Prepare benchmarking databases by restoring dumps on: workflow_call: # no inputs needed - + defaults: run: shell: bash -euxo pipefail {0} jobs: setup-databases: + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] database: [ clickbench, tpch, userexample ] - + env: LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib PLATFORM: ${{ matrix.platform }} @@ -23,7 +27,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -32,13 +39,13 @@ jobs: run: | case "${PLATFORM}" in neon) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; aws-rds-postgres) - CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; aws-aurora-serverless-v2-postgres) - CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}" @@ -46,10 +53,17 @@ jobs: ;; esac - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -57,23 +71,23 @@ jobs: path: /tmp/neon/ prefix: latest - # we create a table that has one row for each database that we want to restore with the status whether the restore is done + # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist env: BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} DATABASE_NAME: ${{ matrix.database }} - # to avoid a race condition of multiple jobs trying to create the table at the same time, + # to avoid a race condition of multiple jobs trying to create the table at the same time, # we use an advisory lock run: | ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " - SELECT pg_advisory_lock(4711); + SELECT pg_advisory_lock(4711); CREATE TABLE IF NOT EXISTS benchmark_restore_status ( databasename text primary key, restore_done boolean ); SELECT pg_advisory_unlock(4711); " - + - name: Check if restore is already done id: check-restore-done env: @@ -107,7 +121,7 @@ jobs: DATABASE_NAME: ${{ matrix.database }} run: | mkdir -p /tmp/dumps - aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ + aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ - name: Replace database name in connection string if: steps.check-restore-done.outputs.skip != 'true' @@ -126,17 +140,17 @@ jobs: else new_connstr="${base_connstr}/${DATABASE_NAME}" fi - echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT + echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT - name: Restore dump if: steps.check-restore-done.outputs.skip != 'true' env: DATABASE_NAME: ${{ matrix.database }} DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} - # the following works only with larger computes: + # the following works only with larger computes: # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" # we add the || true because: - # the dumps were created with Neon and contain neon extensions that are not + # the dumps were created with Neon and contain neon extensions that are not # available in RDS, so we will always report an error, but we can ignore it run: | ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 67152b6991..5fc6aa247a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -236,9 +236,7 @@ jobs: # run pageserver tests with different settings for io_engine in std-fs tokio-epoll-uring ; do - for io_buffer_alignment in 0 1 512 ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' done # Run separate tests for real S3 @@ -257,7 +255,15 @@ jobs: ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install postgres binaries - run: cp -a pg_install /tmp/neon/pg_install + run: | + # Use tar to copy files matching the pattern, preserving the paths in the destionation + tar c \ + pg_install/v* \ + pg_install/build/*/src/test/regress/*.so \ + pg_install/build/*/src/test/regress/pg_regress \ + pg_install/build/*/src/test/isolation/isolationtester \ + pg_install/build/*/src/test/isolation/pg_isolation_regress \ + | tar x -C /tmp/neon - name: Upload Neon artifact uses: ./.github/actions/upload diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a4a597acde..32806b89ab 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -12,7 +12,6 @@ on: # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 3 * * *' # run once a day, timezone is utc - workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -59,7 +58,7 @@ jobs: permissions: contents: write statuses: write - id-token: write # Required for OIDC authentication in azure runners + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: @@ -68,12 +67,10 @@ jobs: PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] - IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -86,7 +83,10 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: ${{ matrix.IMAGE }} + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -164,6 +164,10 @@ jobs: replication-tests: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 16 @@ -174,12 +178,21 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download @@ -267,7 +280,7 @@ jobs: region_id_default=${{ env.DEFAULT_REGION_ID }} runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' - image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" + image_default="neondatabase/build-tools:pinned" matrix='{ "pg_version" : [ 16 @@ -344,7 +357,7 @@ jobs: permissions: contents: write statuses: write - id-token: write # Required for OIDC authentication in azure runners + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false @@ -371,7 +384,7 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Configure AWS credentials # necessary on Azure runners + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -492,17 +505,15 @@ jobs: permissions: contents: write statuses: write - id-token: write # Required for OIDC authentication in azure runners + id-token: write # aws-actions/configure-aws-credentials strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] - IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" @@ -511,13 +522,16 @@ jobs: DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} runs-on: ${{ matrix.RUNNER }} container: - image: ${{ matrix.IMAGE }} + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -527,17 +541,26 @@ jobs: # instead of using Neon artifacts containing pgbench - name: Install postgresql-16 where pytest expects it run: | + # Just to make it easier to test things locally on macOS (with arm64) + arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') + cd /home/nonroot - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb - dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb" + dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg + mkdir -p /tmp/neon/pg_install/v16/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql + ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib + + LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" + export LD_LIBRARY_PATH + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} + /tmp/neon/pg_install/v16/bin/pgbench --version /tmp/neon/pg_install/v16/bin/psql --version @@ -559,7 +582,7 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -620,6 +643,10 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: @@ -638,12 +665,22 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -714,6 +751,10 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: @@ -731,12 +772,22 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -806,6 +857,10 @@ jobs: user-examples-compare: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: @@ -822,12 +877,22 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 81a9fd99ae..ba5d139553 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1190,10 +1190,9 @@ jobs: files_to_promote+=("s3://${BUCKET}/${s3_key}") - # TODO Add v17 - for pg_version in v14 v15 v16; do + for pg_version in v14 v15 v16 v17; do # We run less tests for debug builds, so we don't need to promote them - if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then + if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then continue fi diff --git a/Cargo.lock b/Cargo.lock index d0702e09d4..2bd828367c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1321,6 +1321,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "futures", "humantime", "humantime-serde", "hyper 0.14.30", @@ -4296,6 +4297,7 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "compute_api", "consumption_metrics", "dashmap", "ecdsa 0.16.9", diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index c4209c7a12..d8bcacf228 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -13,6 +13,9 @@ RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] # System deps +# +# 'gdb' is included so that we get backtraces of core dumps produced in +# regression tests RUN set -e \ && apt update \ && apt install -y \ @@ -24,6 +27,7 @@ RUN set -e \ cmake \ curl \ flex \ + gdb \ git \ gnupg \ gzip \ diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml index 29be0958dd..acb17d3cc0 100644 --- a/compute/etc/neon_collector.yml +++ b/compute/etc/neon_collector.yml @@ -195,7 +195,7 @@ metrics: -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These -- temporary snapshot files are renamed to the actual snapshot files after they are -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. @@ -244,4 +244,3 @@ metrics: SELECT slot_name, CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost FROM pg_replication_slots; - diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec.yaml index 0af44745e5..50fcd62e4f 100644 --- a/compute/vm-image-spec.yaml +++ b/compute/vm-image-spec.yaml @@ -11,6 +11,10 @@ commands: user: root sysvInitAction: sysinit shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -30,11 +34,12 @@ commands: shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: - - filename: compute_ctl-resize-swap + - filename: compute_ctl-sudoers content: | # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap - # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -100,7 +105,7 @@ merge: | && apt install --no-install-recommends -y \ sudo \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers COPY cgconfig.conf /etc/cgconfig.conf diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9499a7186e..b10638c454 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,6 +44,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use compute_tools::disk_quota::set_disk_quota; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; @@ -151,6 +152,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); + let set_disk_quota_for_fs = matches.get_one::("set-disk-quota-for-fs"); Ok(ProcessCliResult { connstr, @@ -161,6 +163,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result { spec_json, spec_path, resize_swap_on_bind, + set_disk_quota_for_fs, }) } @@ -173,6 +176,7 @@ struct ProcessCliResult<'clap> { spec_json: Option<&'clap String>, spec_path: Option<&'clap String>, resize_swap_on_bind: bool, + set_disk_quota_for_fs: Option<&'clap String>, } fn startup_context_from_env() -> Option { @@ -293,6 +297,7 @@ fn wait_spec( pgbin, ext_remote_storage, resize_swap_on_bind, + set_disk_quota_for_fs, http_port, .. }: ProcessCliResult, @@ -373,6 +378,7 @@ fn wait_spec( compute, http_port, resize_swap_on_bind, + set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), }) } @@ -381,6 +387,7 @@ struct WaitSpecResult { // passed through from ProcessCliResult http_port: u16, resize_swap_on_bind: bool, + set_disk_quota_for_fs: Option, } fn start_postgres( @@ -390,6 +397,7 @@ fn start_postgres( compute, http_port, resize_swap_on_bind, + set_disk_quota_for_fs, }: WaitSpecResult, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. @@ -403,6 +411,7 @@ fn start_postgres( ); // before we release the mutex, fetch the swap size (if any) for later. let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; + let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes; drop(state); // Launch remaining service threads @@ -422,8 +431,8 @@ fn start_postgres( // OOM-killed during startup because swap wasn't available yet. match resize_swap(size_bytes) { Ok(()) => { - let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. - info!(%size_bytes, %size_gib, "resized swap"); + let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_mib, "resized swap"); } Err(err) => { let err = err.context("failed to resize swap"); @@ -432,10 +441,29 @@ fn start_postgres( // Mark compute startup as failed; don't try to start postgres, and report this // error to the control plane when it next asks. prestartup_failed = true; - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{err:?}")); - state.status = ComputeStatus::Failed; - compute.state_changed.notify_all(); + compute.set_failed_status(err); + delay_exit = true; + } + } + } + + // Set disk quota if the compute spec says so + if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = + (disk_quota_bytes, set_disk_quota_for_fs) + { + match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + Ok(()) => { + let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%disk_quota_bytes, %size_mib, "set disk quota"); + } + Err(err) => { + let err = err.context("failed to set disk quota"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. + prestartup_failed = true; + compute.set_failed_status(err); delay_exit = true; } } @@ -450,16 +478,7 @@ fn start_postgres( Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock + compute.set_failed_status(err); delay_exit = true; None } @@ -750,6 +769,11 @@ fn cli() -> clap::Command { .long("resize-swap-on-bind") .action(clap::ArgAction::SetTrue), ) + .arg( + Arg::new("set-disk-quota-for-fs") + .long("set-disk-quota-for-fs") + .value_name("SET_DISK_QUOTA_FOR_FS") + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1f47bb58a3..147eb2a161 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; use std::sync::{Condvar, Mutex, RwLock}; use std::thread; +use std::time::Duration; use std::time::Instant; use anyhow::{Context, Result}; @@ -305,6 +306,13 @@ impl ComputeNode { self.state_changed.notify_all(); } + pub fn set_failed_status(&self, err: anyhow::Error) { + let mut state = self.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + self.state_changed.notify_all(); + } + pub fn get_status(&self) -> ComputeStatus { self.state.lock().unwrap().status } @@ -710,7 +718,7 @@ impl ComputeNode { info!("running initdb"); let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); Command::new(initdb_bin) - .args(["-D", pgdata]) + .args(["--pgdata", pgdata]) .output() .expect("cannot start initdb process"); @@ -1123,6 +1131,9 @@ impl ComputeNode { // // Use that as a default location and pattern, except macos where core dumps are written // to /cores/ directory by default. + // + // With default Linux settings, the core dump file is called just "core", so check for + // that too. pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), @@ -1134,8 +1145,17 @@ impl ComputeNode { let files = fs::read_dir(core_dump_dir)?; let cores = files.filter_map(|entry| { let entry = entry.ok()?; - let _ = entry.file_name().to_str()?.strip_prefix("core.")?; - Some(entry.path()) + + let is_core_dump = match entry.file_name().to_str()? { + n if n.starts_with("core.") => true, + "core" => true, + _ => false, + }; + if is_core_dump { + Some(entry.path()) + } else { + None + } }); // Print backtrace for each core dump @@ -1386,6 +1406,36 @@ LIMIT 100", } Ok(remote_ext_metrics) } + + /// Waits until current thread receives a state changed notification and + /// the pageserver connection strings has changed. + /// + /// The operation will time out after a specified duration. + pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) { + let state = self.state.lock().unwrap(); + let old_pageserver_connstr = state + .pspec + .as_ref() + .expect("spec must be set") + .pageserver_connstr + .clone(); + let mut unchanged = true; + let _ = self + .state_changed + .wait_timeout_while(state, duration, |s| { + let pageserver_connstr = &s + .pspec + .as_ref() + .expect("spec must be set") + .pageserver_connstr; + unchanged = pageserver_connstr == &old_pageserver_connstr; + unchanged + }) + .unwrap(); + if !unchanged { + info!("Pageserver config changed"); + } + } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index 274a221ac7..7bd0e4938d 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -11,9 +11,17 @@ use crate::compute::ComputeNode; fn configurator_main_loop(compute: &Arc) { info!("waiting for reconfiguration requests"); loop { - let state = compute.state.lock().unwrap(); - let mut state = compute.state_changed.wait(state).unwrap(); + let mut state = compute.state.lock().unwrap(); + // We have to re-check the status after re-acquiring the lock because it could be that + // the status has changed while we were waiting for the lock, and we might not need to + // wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e. + // we are waiting for a condition variable that will never be signaled. + if state.status != ComputeStatus::ConfigurationPending { + state = compute.state_changed.wait(state).unwrap(); + } + + // Re-check the status after waking up if state.status == ComputeStatus::ConfigurationPending { info!("got configuration request"); state.status = ComputeStatus::Configuration; diff --git a/compute_tools/src/disk_quota.rs b/compute_tools/src/disk_quota.rs new file mode 100644 index 0000000000..e838c5b9fd --- /dev/null +++ b/compute_tools/src/disk_quota.rs @@ -0,0 +1,25 @@ +use anyhow::Context; + +pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota"; + +/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes. +/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set. +pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> { + let size_kb = size_bytes / 1024; + // run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}` + let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(DISK_QUOTA_BIN) + .arg(size_kb.to_string()) + .arg(fs_mountpoint) + .spawn(); + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow::anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`")) +} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index c402d63305..c5b4ca632c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -10,6 +10,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod disk_quota; pub mod extension_server; pub mod lsn_lease; mod migration; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 7e5917c55f..3061d387a5 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -57,10 +57,10 @@ fn lsn_lease_bg_task( .max(valid_duration / 2); info!( - "Succeeded, sleeping for {} seconds", + "Request succeeded, sleeping for {} seconds", sleep_duration.as_secs() ); - thread::sleep(sleep_duration); + compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration); } } @@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry( .map(|connstr| { let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); if let Some(storage_auth_token) = &spec.storage_auth_token { - info!("Got storage auth token from spec file"); config.password(storage_auth_token.clone()); - } else { - info!("Storage auth token not set"); } config }) @@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry( bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff"); } Err(e) => { - warn!("Failed to acquire lsn lease: {e} (attempt {attempts}"); + warn!("Failed to acquire lsn lease: {e} (attempt {attempts})"); - thread::sleep(Duration::from_millis(retry_period_ms as u64)); + compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis( + retry_period_ms as u64, + )); retry_period_ms *= 1.5; retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS); } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index df87c181bf..355eca0fe5 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,6 +9,7 @@ anyhow.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +futures.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 92f609761a..624936620d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -6,7 +6,7 @@ //! rely on `neon_local` to set up the environment for each test. //! use anyhow::{anyhow, bail, Context, Result}; -use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; +use clap::Parser; use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{ @@ -56,10 +56,627 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "16"; +const DEFAULT_PG_VERSION: u32 = 16; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; +#[derive(clap::Parser)] +#[command(version = GIT_VERSION, about, name = "Neon CLI")] +struct Cli { + #[command(subcommand)] + command: NeonLocalCmd, +} + +#[derive(clap::Subcommand)] +enum NeonLocalCmd { + Init(InitCmdArgs), + + #[command(subcommand)] + Tenant(TenantCmd), + #[command(subcommand)] + Timeline(TimelineCmd), + #[command(subcommand)] + Pageserver(PageserverCmd), + #[command(subcommand)] + #[clap(alias = "storage_controller")] + StorageController(StorageControllerCmd), + #[command(subcommand)] + #[clap(alias = "storage_broker")] + StorageBroker(StorageBrokerCmd), + #[command(subcommand)] + Safekeeper(SafekeeperCmd), + #[command(subcommand)] + Endpoint(EndpointCmd), + #[command(subcommand)] + Mappings(MappingsCmd), + + Start(StartCmdArgs), + Stop(StopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")] +struct InitCmdArgs { + #[clap(long, help("How many pageservers to create (default 1)"))] + num_pageservers: Option, + + #[clap(long)] + config: Option, + + #[clap(long, help("Force initialization even if the repository is not empty"))] + #[arg(value_parser)] + #[clap(default_value = "must-not-exist")] + force: InitForceMode, +} + +#[derive(clap::Args)] +#[clap(about = "Start pageserver and safekeepers")] +struct StartCmdArgs { + #[clap(long = "start-timeout", default_value = "10s")] + timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop pageserver and safekeepers")] +struct StopCmdArgs { + #[arg(value_enum)] + #[clap(long, default_value_t = StopMode::Fast)] + mode: StopMode, +} + +#[derive(Clone, Copy, clap::ValueEnum)] +enum StopMode { + Fast, + Immediate, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage tenants")] +enum TenantCmd { + List, + Create(TenantCreateCmdArgs), + SetDefault(TenantSetDefaultCmdArgs), + Config(TenantConfigCmdArgs), + Import(TenantImportCmdArgs), +} + +#[derive(clap::Args)] +struct TenantCreateCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap( + long, + help = "Use a specific timeline id when creating a tenant and its initial timeline" + )] + timeline_id: Option, + + #[clap(short = 'c')] + config: Vec, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version to use for the initial timeline")] + pg_version: u32, + + #[clap( + long, + help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified" + )] + set_default: bool, + + #[clap(long, help = "Number of shards in the new tenant")] + #[arg(default_value_t = 0)] + shard_count: u8, + #[clap(long, help = "Sharding stripe size in pages")] + shard_stripe_size: Option, + + #[clap(long, help = "Placement policy shards in this tenant")] + #[arg(value_parser = parse_placement_policy)] + placement_policy: Option, +} + +fn parse_placement_policy(s: &str) -> anyhow::Result { + Ok(serde_json::from_str::(s)?) +} + +#[derive(clap::Args)] +#[clap( + about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified" +)] +struct TenantSetDefaultCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: TenantId, +} + +#[derive(clap::Args)] +struct TenantConfigCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(short = 'c')] + config: Vec, +} + +#[derive(clap::Args)] +#[clap( + about = "Import a tenant that is present in remote storage, and create branches for its timelines" +)] +struct TenantImportCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: TenantId, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage timelines")] +enum TimelineCmd { + List(TimelineListCmdArgs), + Branch(TimelineBranchCmdArgs), + Create(TimelineCreateCmdArgs), + Import(TimelineImportCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "List all timelines available to this pageserver")] +struct TimelineListCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_shard_id: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Create a new timeline, branching off from another timeline")] +struct TimelineBranchCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(long, help = "New timeline's ID")] + timeline_id: Option, + + #[clap(long, help = "Human-readable alias for the new timeline")] + branch_name: String, + + #[clap( + long, + help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name." + )] + ancestor_branch_name: Option, + + #[clap( + long, + help = "When using another timeline as base, use a specific Lsn in it instead of the latest one" + )] + ancestor_start_lsn: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Create a new blank timeline")] +struct TimelineCreateCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(long, help = "New timeline's ID")] + timeline_id: Option, + + #[clap(long, help = "Human-readable alias for the new timeline")] + branch_name: String, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version")] + pg_version: u32, +} + +#[derive(clap::Args)] +#[clap(about = "Import timeline from a basebackup directory")] +struct TimelineImportCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(long, help = "New timeline's ID")] + timeline_id: TimelineId, + + #[clap(long, help = "Human-readable alias for the new timeline")] + branch_name: String, + + #[clap(long, help = "Basebackup tarfile to import")] + base_tarfile: PathBuf, + + #[clap(long, help = "Lsn the basebackup starts at")] + base_lsn: Lsn, + + #[clap(long, help = "Wal to add after base")] + wal_tarfile: Option, + + #[clap(long, help = "Lsn the basebackup ends at")] + end_lsn: Option, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version of the backup being imported")] + pg_version: u32, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage pageservers")] +enum PageserverCmd { + Status(PageserverStatusCmdArgs), + Start(PageserverStartCmdArgs), + Stop(PageserverStopCmdArgs), + Restart(PageserverRestartCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Show status of a local pageserver")] +struct PageserverStatusCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Start local pageserver")] +struct PageserverStartCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop local pageserver")] +struct PageserverStopCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, + + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + #[arg(value_enum, default_value = "fast")] + stop_mode: StopMode, +} + +#[derive(clap::Args)] +#[clap(about = "Restart local pageserver")] +struct PageserverRestartCmdArgs { + #[clap(long = "id", help = "pageserver id")] + pageserver_id: Option, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage storage controller")] +enum StorageControllerCmd { + Start(StorageControllerStartCmdArgs), + Stop(StorageControllerStopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Start storage controller")] +struct StorageControllerStartCmdArgs { + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, + + #[clap( + long, + help = "Identifier used to distinguish storage controller instances" + )] + #[arg(default_value_t = 1)] + instance_id: u8, + + #[clap( + long, + help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)" + )] + base_port: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Stop storage controller")] +struct StorageControllerStopCmdArgs { + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + #[arg(value_enum, default_value = "fast")] + stop_mode: StopMode, + + #[clap( + long, + help = "Identifier used to distinguish storage controller instances" + )] + #[arg(default_value_t = 1)] + instance_id: u8, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage storage broker")] +enum StorageBrokerCmd { + Start(StorageBrokerStartCmdArgs), + Stop(StorageBrokerStopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Start broker")] +struct StorageBrokerStartCmdArgs { + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "stop broker")] +struct StorageBrokerStopCmdArgs { + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + #[arg(value_enum, default_value = "fast")] + stop_mode: StopMode, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage safekeepers")] +enum SafekeeperCmd { + Start(SafekeeperStartCmdArgs), + Stop(SafekeeperStopCmdArgs), + Restart(SafekeeperRestartCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Start local safekeeper")] +struct SafekeeperStartCmdArgs { + #[clap(help = "safekeeper id")] + #[arg(default_value_t = NodeId(1))] + id: NodeId, + + #[clap( + short = 'e', + long = "safekeeper-extra-opt", + help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" + )] + extra_opt: Vec, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop local safekeeper")] +struct SafekeeperStopCmdArgs { + #[clap(help = "safekeeper id")] + #[arg(default_value_t = NodeId(1))] + id: NodeId, + + #[arg(value_enum, default_value = "fast")] + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + stop_mode: StopMode, +} + +#[derive(clap::Args)] +#[clap(about = "Restart local safekeeper")] +struct SafekeeperRestartCmdArgs { + #[clap(help = "safekeeper id")] + #[arg(default_value_t = NodeId(1))] + id: NodeId, + + #[arg(value_enum, default_value = "fast")] + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + stop_mode: StopMode, + + #[clap( + short = 'e', + long = "safekeeper-extra-opt", + help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo" + )] + extra_opt: Vec, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage Postgres instances")] +enum EndpointCmd { + List(EndpointListCmdArgs), + Create(EndpointCreateCmdArgs), + Start(EndpointStartCmdArgs), + Reconfigure(EndpointReconfigureCmdArgs), + Stop(EndpointStopCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "List endpoints")] +struct EndpointListCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_shard_id: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Create a compute endpoint")] +struct EndpointCreateCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(help = "Postgres endpoint id")] + endpoint_id: Option, + #[clap(long, help = "Name of the branch the endpoint will run on")] + branch_name: Option, + #[clap( + long, + help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used" + )] + lsn: Option, + #[clap(long)] + pg_port: Option, + #[clap(long)] + http_port: Option, + #[clap(long = "pageserver-id")] + endpoint_pageserver_id: Option, + + #[clap( + long, + help = "Don't do basebackup, create endpoint directory with only config files", + action = clap::ArgAction::Set, + default_value_t = false + )] + config_only: bool, + + #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[clap(long, help = "Postgres version")] + pg_version: u32, + + #[clap( + long, + help = "If set, the node will be a hot replica on the specified timeline", + action = clap::ArgAction::Set, + default_value_t = false + )] + hot_standby: bool, + + #[clap(long, help = "If set, will set up the catalog for neon_superuser")] + update_catalog: bool, + + #[clap( + long, + help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." + )] + allow_multiple: bool, +} + +#[derive(clap::Args)] +#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")] +struct EndpointStartCmdArgs { + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, + #[clap(long = "pageserver-id")] + endpoint_pageserver_id: Option, + + #[clap(long)] + safekeepers: Option, + + #[clap( + long, + help = "Configure the remote extensions storage proxy gateway to request for extensions." + )] + remote_ext_config: Option, + + #[clap( + long, + help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`" + )] + create_test_user: bool, + + #[clap( + long, + help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests." + )] + allow_multiple: bool, + + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Reconfigure an endpoint")] +struct EndpointReconfigureCmdArgs { + #[clap( + long = "tenant-id", + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: Option, + + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, + #[clap(long = "pageserver-id")] + endpoint_pageserver_id: Option, + + #[clap(long)] + safekeepers: Option, +} + +#[derive(clap::Args)] +#[clap(about = "Stop an endpoint")] +struct EndpointStopCmdArgs { + #[clap(help = "Postgres endpoint id")] + endpoint_id: String, + + #[clap( + long, + help = "Also delete data directory (now optional, should be default in future)" + )] + destroy: bool, + + #[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m \"")] + #[arg(value_parser(["smart", "fast", "immediate"]))] + #[arg(default_value = "fast")] + mode: String, +} + +#[derive(clap::Subcommand)] +#[clap(about = "Manage neon_local branch name mappings")] +enum MappingsCmd { + Map(MappingsMapCmdArgs), +} + +#[derive(clap::Args)] +#[clap(about = "Create new mapping which cannot exist already")] +struct MappingsMapCmdArgs { + #[clap( + long, + help = "Tenant id. Represented as a hexadecimal string 32 symbols length" + )] + tenant_id: TenantId, + #[clap( + long, + help = "Timeline id. Represented as a hexadecimal string 32 symbols length" + )] + timeline_id: TimelineId, + #[clap(long, help = "Branch name to give to the timeline")] + branch_name: String, +} + /// /// Timelines tree element used as a value in the HashMap. /// @@ -80,19 +697,13 @@ struct TimelineTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - let matches = cli().get_matches(); - - let (sub_name, sub_args) = match matches.subcommand() { - Some(subcommand_data) => subcommand_data, - None => bail!("no subcommand provided"), - }; + let cli = Cli::parse(); // Check for 'neon init' command first. - let subcommand_result = if sub_name == "init" { - handle_init(sub_args).map(|env| Some(Cow::Owned(env))) + let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command { + handle_init(&args).map(|env| Some(Cow::Owned(env))) } else { // all other commands need an existing config - let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); let env = Box::leak(Box::new(env)); @@ -101,19 +712,20 @@ fn main() -> Result<()> { .build() .unwrap(); - let subcommand_result = match sub_name { - "tenant" => rt.block_on(handle_tenant(sub_args, env)), - "timeline" => rt.block_on(handle_timeline(sub_args, env)), - "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))), - "stop" => rt.block_on(handle_stop_all(sub_args, env)), - "pageserver" => rt.block_on(handle_pageserver(sub_args, env)), - "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)), - "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)), - "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)), - "endpoint" => rt.block_on(handle_endpoint(sub_args, env)), - "mappings" => handle_mappings(sub_args, env), - "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), - _ => bail!("unexpected subcommand {sub_name}"), + let subcommand_result = match cli.command { + NeonLocalCmd::Init(_) => unreachable!("init was handled earlier already"), + NeonLocalCmd::Start(args) => rt.block_on(handle_start_all(&args, env)), + NeonLocalCmd::Stop(args) => rt.block_on(handle_stop_all(&args, env)), + NeonLocalCmd::Tenant(subcmd) => rt.block_on(handle_tenant(&subcmd, env)), + NeonLocalCmd::Timeline(subcmd) => rt.block_on(handle_timeline(&subcmd, env)), + NeonLocalCmd::Pageserver(subcmd) => rt.block_on(handle_pageserver(&subcmd, env)), + NeonLocalCmd::StorageController(subcmd) => { + rt.block_on(handle_storage_controller(&subcmd, env)) + } + NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)), + NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)), + NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)), + NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; if &original_env != env { @@ -263,10 +875,13 @@ async fn get_timeline_infos( .collect()) } -// Helper function to parse --tenant_id option, or get the default from config file -fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { - if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { - tenant_id_from_arguments +/// Helper function to get tenant id from an optional --tenant_id option or from the config file +fn get_tenant_id( + tenant_id_arg: Option, + env: &local_env::LocalEnv, +) -> anyhow::Result { + if let Some(tenant_id_from_arguments) = tenant_id_arg { + Ok(tenant_id_from_arguments) } else if let Some(default_id) = env.default_tenant_id { Ok(default_id) } else { @@ -274,13 +889,14 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } -// Helper function to parse --tenant_id option, for commands that accept a shard suffix +/// Helper function to get tenant-shard ID from an optional --tenant_id option or from the config file, +/// for commands that accept a shard suffix fn get_tenant_shard_id( - sub_match: &ArgMatches, + tenant_shard_id_arg: Option, env: &local_env::LocalEnv, ) -> anyhow::Result { - if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() { - tenant_id_from_arguments + if let Some(tenant_id_from_arguments) = tenant_shard_id_arg { + Ok(tenant_id_from_arguments) } else if let Some(default_id) = env.default_tenant_id { Ok(TenantShardId::unsharded(default_id)) } else { @@ -288,41 +904,11 @@ fn get_tenant_shard_id( } } -fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("tenant-id") - .map(|tenant_id| TenantId::from_str(tenant_id)) - .transpose() - .context("Failed to parse tenant id from the argument string") -} - -fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("tenant-id") - .map(|id_str| TenantShardId::from_str(id_str)) - .transpose() - .context("Failed to parse tenant shard id from the argument string") -} - -fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("timeline-id") - .map(|timeline_id| TimelineId::from_str(timeline_id)) - .transpose() - .context("Failed to parse timeline id from the argument string") -} - -fn handle_init(init_match: &ArgMatches) -> anyhow::Result { - let num_pageservers = init_match.get_one::("num-pageservers"); - - let force = init_match.get_one("force").expect("we set a default value"); - +fn handle_init(args: &InitCmdArgs) -> anyhow::Result { // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. - let init_conf: NeonLocalInitConf = if let Some(config_path) = - init_match.get_one::("config") - { + let init_conf: NeonLocalInitConf = if let Some(config_path) = &args.config { // User (likely the Python test suite) provided a description of the environment. - if num_pageservers.is_some() { + if args.num_pageservers.is_some() { bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); } // load and parse the file @@ -346,7 +932,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, ..Default::default() }], - pageservers: (0..num_pageservers.copied().unwrap_or(1)) + pageservers: (0..args.num_pageservers.unwrap_or(1)) .map(|i| { let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; @@ -369,7 +955,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { } }; - LocalEnv::init(init_conf, force) + LocalEnv::init(init_conf, &args.force) .context("materialize initial neon_local environment on disk")?; Ok(LocalEnv::load_config(&local_env::base_path()) .expect("freshly written config should be loadable")) @@ -387,19 +973,16 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -async fn handle_tenant( - tenant_match: &ArgMatches, - env: &mut local_env::LocalEnv, -) -> anyhow::Result<()> { +async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { let pageserver = get_default_pageserver(env); - match tenant_match.subcommand() { - Some(("list", _)) => { + match subcmd { + TenantCmd::List => { for t in pageserver.tenant_list().await? { println!("{} {:?}", t.id, t.state); } } - Some(("import", import_match)) => { - let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + TenantCmd::Import(args) => { + let tenant_id = args.tenant_id; let storage_controller = StorageController::from_env(env); let create_response = storage_controller.tenant_import(tenant_id).await?; @@ -446,31 +1029,14 @@ async fn handle_tenant( env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; } } - Some(("create", create_match)) => { - let tenant_conf: HashMap<_, _> = create_match - .get_many::("config") - .map(|vals: clap::parser::ValuesRef<'_, String>| { - vals.flat_map(|c| c.split_once(':')).collect() - }) - .unwrap_or_default(); - - let shard_count: u8 = create_match - .get_one::("shard-count") - .cloned() - .unwrap_or(0); - - let shard_stripe_size: Option = - create_match.get_one::("shard-stripe-size").cloned(); - - let placement_policy = match create_match.get_one::("placement-policy") { - Some(s) if !s.is_empty() => serde_json::from_str::(s)?, - _ => PlacementPolicy::Attached(0), - }; + TenantCmd::Create(args) => { + let tenant_conf: HashMap<_, _> = + args.config.iter().flat_map(|c| c.split_once(':')).collect(); let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one - let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); + let tenant_id = args.tenant_id.unwrap_or_else(TenantId::generate); // We must register the tenant with the storage controller, so // that when the pageserver restarts, it will be re-attached. @@ -478,29 +1044,26 @@ async fn handle_tenant( storage_controller .tenant_create(TenantCreateRequest { // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the - // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest - // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards) + // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest + // type is used both in the storage controller (for creating tenants) and in the pageserver (for + // creating shards) new_tenant_id: TenantShardId::unsharded(tenant_id), generation: None, shard_parameters: ShardParameters { - count: ShardCount::new(shard_count), - stripe_size: shard_stripe_size + count: ShardCount::new(args.shard_count), + stripe_size: args + .shard_stripe_size .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, - placement_policy: Some(placement_policy), + placement_policy: args.placement_policy.clone(), config: tenant_conf, }) .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant - let new_timeline_id = - parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate()); - let pg_version = create_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; + let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate()); // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have // different shards picking different start lsns. Maybe we have to teach storage controller @@ -513,7 +1076,7 @@ async fn handle_tenant( ancestor_timeline_id: None, ancestor_start_lsn: None, existing_initdb_timeline_id: None, - pg_version: Some(pg_version), + pg_version: Some(args.pg_version), }, ) .await?; @@ -526,23 +1089,19 @@ async fn handle_tenant( println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",); - if create_match.get_flag("set-default") { + if args.set_default { println!("Setting tenant {tenant_id} as a default one"); env.default_tenant_id = Some(tenant_id); } } - Some(("set-default", set_default_match)) => { - let tenant_id = - parse_tenant_id(set_default_match)?.context("No tenant id specified")?; - println!("Setting tenant {tenant_id} as a default one"); - env.default_tenant_id = Some(tenant_id); + TenantCmd::SetDefault(args) => { + println!("Setting tenant {} as a default one", args.tenant_id); + env.default_tenant_id = Some(args.tenant_id); } - Some(("config", create_match)) => { - let tenant_id = get_tenant_id(create_match, env)?; - let tenant_conf: HashMap<_, _> = create_match - .get_many::("config") - .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) - .unwrap_or_default(); + TenantCmd::Config(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let tenant_conf: HashMap<_, _> = + args.config.iter().flat_map(|c| c.split_once(':')).collect(); pageserver .tenant_config(tenant_id, tenant_conf) @@ -550,36 +1109,25 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), - None => bail!("no tenant subcommand provided"), } Ok(()) } -async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = get_default_pageserver(env); - match timeline_match.subcommand() { - Some(("list", list_match)) => { + match cmd { + TimelineCmd::List(args) => { // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. - let tenant_shard_id = get_tenant_shard_id(list_match, env)?; + let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; let timelines = pageserver.timeline_list(&tenant_shard_id).await?; print_timelines_tree(timelines, env.timeline_name_mappings())?; } - Some(("create", create_match)) => { - let tenant_id = get_tenant_id(create_match, env)?; - let new_branch_name = create_match - .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; - - let pg_version = create_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let new_timeline_id_opt = parse_timeline_id(create_match)?; + TimelineCmd::Create(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let new_branch_name = &args.branch_name; + let new_timeline_id_opt = args.timeline_id; let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); let storage_controller = StorageController::from_env(env); @@ -588,7 +1136,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local ancestor_timeline_id: None, existing_initdb_timeline_id: None, ancestor_start_lsn: None, - pg_version: Some(pg_version), + pg_version: Some(args.pg_version), }; let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) @@ -602,67 +1150,42 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local timeline_info.timeline_id ); } - Some(("import", import_match)) => { - let tenant_id = get_tenant_id(import_match, env)?; - let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); - let branch_name = import_match - .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; + TimelineCmd::Import(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let timeline_id = args.timeline_id; + let branch_name = &args.branch_name; // Parse base inputs - let base_tarfile = import_match - .get_one::("base-tarfile") - .ok_or_else(|| anyhow!("No base-tarfile provided"))? - .to_owned(); - let base_lsn = Lsn::from_str( - import_match - .get_one::("base-lsn") - .ok_or_else(|| anyhow!("No base-lsn provided"))?, - )?; - let base = (base_lsn, base_tarfile); + let base = (args.base_lsn, args.base_tarfile.clone()); // Parse pg_wal inputs - let wal_tarfile = import_match.get_one::("wal-tarfile").cloned(); - let end_lsn = import_match - .get_one::("end-lsn") - .map(|s| Lsn::from_str(s).unwrap()); + let wal_tarfile = args.wal_tarfile.clone(); + let end_lsn = args.end_lsn; // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); - let pg_version = import_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - println!("Importing timeline into pageserver ..."); pageserver - .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) + .timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version) .await?; env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } - Some(("branch", branch_match)) => { - let tenant_id = get_tenant_id(branch_match, env)?; - let new_timeline_id = - parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate()); - let new_branch_name = branch_match - .get_one::("branch-name") - .ok_or_else(|| anyhow!("No branch name provided"))?; - let ancestor_branch_name = branch_match - .get_one::("ancestor-branch-name") - .map(|s| s.as_str()) - .unwrap_or(DEFAULT_BRANCH_NAME); + TimelineCmd::Branch(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let new_timeline_id = args.timeline_id.unwrap_or(TimelineId::generate()); + let new_branch_name = &args.branch_name; + let ancestor_branch_name = args + .ancestor_branch_name + .clone() + .unwrap_or(DEFAULT_BRANCH_NAME.to_owned()); let ancestor_timeline_id = env - .get_branch_timeline_id(ancestor_branch_name, tenant_id) + .get_branch_timeline_id(&ancestor_branch_name, tenant_id) .ok_or_else(|| { anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; - let start_lsn = branch_match - .get_one::("ancestor-start-lsn") - .map(|lsn_str| Lsn::from_str(lsn_str)) - .transpose() - .context("Failed to parse ancestor start Lsn from the request")?; + let start_lsn = args.ancestor_start_lsn; let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, @@ -684,25 +1207,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local timeline_info.timeline_id ); } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), - None => bail!("no tenant subcommand provided"), } Ok(()) } -async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match ep_match.subcommand() { - Some(ep_subcommand_data) => ep_subcommand_data, - None => bail!("no endpoint subcommand provided"), - }; +async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; - match sub_name { - "list" => { + match subcmd { + EndpointCmd::List(args) => { // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. - let tenant_shard_id = get_tenant_shard_id(sub_args, env)?; + let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; let timeline_infos = get_timeline_infos(env, &tenant_shard_id) .await .unwrap_or_else(|e| { @@ -766,52 +1283,29 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re println!("{table}"); } - "create" => { - let tenant_id = get_tenant_id(sub_args, env)?; - let branch_name = sub_args - .get_one::("branch-name") - .map(|s| s.as_str()) - .unwrap_or(DEFAULT_BRANCH_NAME); - let endpoint_id = sub_args - .get_one::("endpoint_id") - .map(String::to_string) + EndpointCmd::Create(args) => { + let tenant_id = get_tenant_id(args.tenant_id, env)?; + let branch_name = args + .branch_name + .clone() + .unwrap_or(DEFAULT_BRANCH_NAME.to_owned()); + let endpoint_id = args + .endpoint_id + .clone() .unwrap_or_else(|| format!("ep-{branch_name}")); - let update_catalog = sub_args - .get_one::("update-catalog") - .cloned() - .unwrap_or_default(); - let lsn = sub_args - .get_one::("lsn") - .map(|lsn_str| Lsn::from_str(lsn_str)) - .transpose() - .context("Failed to parse Lsn from the request")?; let timeline_id = env - .get_branch_timeline_id(branch_name, tenant_id) + .get_branch_timeline_id(&branch_name, tenant_id) .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; - let pg_port: Option = sub_args.get_one::("pg-port").copied(); - let http_port: Option = sub_args.get_one::("http-port").copied(); - let pg_version = sub_args - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let hot_standby = sub_args - .get_one::("hot-standby") - .copied() - .unwrap_or(false); - - let allow_multiple = sub_args.get_flag("allow-multiple"); - - let mode = match (lsn, hot_standby) { + let mode = match (args.lsn, args.hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, (None, false) => ComputeMode::Primary, (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - match (mode, hot_standby) { + match (mode, args.hot_standby) { (ComputeMode::Static(_), true) => { bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") } @@ -821,7 +1315,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - if !allow_multiple { + if !args.allow_multiple { cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; } @@ -829,34 +1323,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &endpoint_id, tenant_id, timeline_id, - pg_port, - http_port, - pg_version, + args.pg_port, + args.http_port, + args.pg_version, mode, - !update_catalog, + !args.update_catalog, )?; } - "start" => { - let endpoint_id = sub_args - .get_one::("endpoint_id") - .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; - - let pageserver_id = - if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - Some(NodeId( - id_str.parse().context("while parsing pageserver id")?, - )) - } else { - None - }; - - let remote_ext_config = sub_args.get_one::("remote-ext-config"); - - let allow_multiple = sub_args.get_flag("allow-multiple"); + EndpointCmd::Start(args) => { + let endpoint_id = &args.endpoint_id; + let pageserver_id = args.endpoint_pageserver_id; + let remote_ext_config = &args.remote_ext_config; // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. - let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? { + let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { safekeepers } else { env.safekeepers.iter().map(|sk| sk.id).collect() @@ -867,12 +1348,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get(endpoint_id.as_str()) .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; - let create_test_user = sub_args - .get_one::("create-test-user") - .cloned() - .unwrap_or_default(); - - if !allow_multiple { + if !args.allow_multiple { cplane.check_conflicting_endpoints( endpoint.mode, endpoint.tenant_id, @@ -894,17 +1370,27 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re // to pass these on to postgres. let storage_controller = StorageController::from_env(env); let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; - let pageservers = locate_result - .shards - .into_iter() - .map(|shard| { - ( + let pageservers = futures::future::try_join_all( + locate_result.shards.into_iter().map(|shard| async move { + if let ComputeMode::Static(lsn) = endpoint.mode { + // Initialize LSN leases for static computes. + let conf = env.get_pageserver_conf(shard.node_id).unwrap(); + let pageserver = PageServerNode::from_env(env, conf); + + pageserver + .http_client + .timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn) + .await?; + } + + anyhow::Ok(( Host::parse(&shard.listen_pg_addr) .expect("Storage controller reported bad hostname"), shard.listen_pg_port, - ) - }) - .collect::>(); + )) + }), + ) + .await?; let stripe_size = locate_result.shard_params.stripe_size; (pageservers, stripe_size) @@ -926,72 +1412,61 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &auth_token, safekeepers, pageservers, - remote_ext_config, + remote_ext_config.as_ref(), stripe_size.0 as usize, - create_test_user, + args.create_test_user, ) .await?; } - "reconfigure" => { - let endpoint_id = sub_args - .get_one::("endpoint_id") - .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?; + EndpointCmd::Reconfigure(args) => { + let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let pageservers = - if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?); - let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?); - vec![( - pageserver.pg_connection_config.host().clone(), - pageserver.pg_connection_config.port(), - )] - } else { - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_locate(endpoint.tenant_id) - .await? - .shards - .into_iter() - .map(|shard| { - ( - Host::parse(&shard.listen_pg_addr) - .expect("Storage controller reported malformed host"), - shard.listen_pg_port, - ) - }) - .collect::>() - }; + let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id { + let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?); + vec![( + pageserver.pg_connection_config.host().clone(), + pageserver.pg_connection_config.port(), + )] + } else { + let storage_controller = StorageController::from_env(env); + storage_controller + .tenant_locate(endpoint.tenant_id) + .await? + .shards + .into_iter() + .map(|shard| { + ( + Host::parse(&shard.listen_pg_addr) + .expect("Storage controller reported malformed host"), + shard.listen_pg_port, + ) + }) + .collect::>() + }; // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. - let safekeepers = parse_safekeepers(sub_args)?; + let safekeepers = parse_safekeepers(&args.safekeepers)?; endpoint.reconfigure(pageservers, None, safekeepers).await?; } - "stop" => { - let endpoint_id = sub_args - .get_one::("endpoint_id") - .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; - let destroy = sub_args.get_flag("destroy"); - let mode = sub_args.get_one::("mode").expect("has a default"); - + EndpointCmd::Stop(args) => { + let endpoint_id = &args.endpoint_id; let endpoint = cplane .endpoints - .get(endpoint_id.as_str()) + .get(endpoint_id) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(mode, destroy)?; + endpoint.stop(&args.mode, args.destroy)?; } - - _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), } Ok(()) } /// Parse --safekeepers as list of safekeeper ids. -fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { +fn parse_safekeepers(safekeepers_str: &Option) -> Result>> { + if let Some(safekeepers_str) = safekeepers_str { let mut safekeepers: Vec = Vec::new(); for sk_id in safekeepers_str.split(',').map(str::trim) { let sk_id = NodeId( @@ -1006,44 +1481,25 @@ fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { } } -fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(ep_subcommand_data) => ep_subcommand_data, - None => bail!("no mappings subcommand provided"), - }; - - match sub_name { - "map" => { - let branch_name = sub_args - .get_one::("branch-name") - .expect("branch-name argument missing"); - - let tenant_id = sub_args - .get_one::("tenant-id") - .map(|x| TenantId::from_str(x)) - .expect("tenant-id argument missing") - .expect("malformed tenant-id arg"); - - let timeline_id = sub_args - .get_one::("timeline-id") - .map(|x| TimelineId::from_str(x)) - .expect("timeline-id argument missing") - .expect("malformed timeline-id arg"); - - env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?; +fn handle_mappings(subcmd: &MappingsCmd, env: &mut local_env::LocalEnv) -> Result<()> { + match subcmd { + MappingsCmd::Map(args) => { + env.register_branch_mapping( + args.branch_name.to_owned(), + args.tenant_id, + args.timeline_id, + )?; Ok(()) } - other => unimplemented!("mappings subcommand {other}"), } } -fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result { - let node_id = if let Some(id_str) = args.get_one::("pageserver-id") { - NodeId(id_str.parse().context("while parsing pageserver id")?) - } else { - DEFAULT_PAGESERVER_ID - }; +fn get_pageserver( + env: &local_env::LocalEnv, + pageserver_id_arg: Option, +) -> Result { + let node_id = pageserver_id_arg.unwrap_or(DEFAULT_PAGESERVER_ID); Ok(PageServerNode::from_env( env, @@ -1051,48 +1507,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result &Duration { - let humantime_duration = args - .get_one::("start-timeout") - .expect("invalid value for start-timeout"); - humantime_duration.as_ref() -} - -fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs { - let maybe_instance_id = args.get_one::("instance-id"); - - let base_port = args.get_one::("base-port"); - - if maybe_instance_id.is_some() && base_port.is_none() { - panic!("storage-controller start specificied instance-id but did not provide base-port"); - } - - let start_timeout = args - .get_one::("start-timeout") - .expect("invalid value for start-timeout"); - - NeonStorageControllerStartArgs { - instance_id: maybe_instance_id.copied().unwrap_or(1), - base_port: base_port.copied(), - start_timeout: *start_timeout, - } -} - -fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs { - let maybe_instance_id = args.get_one::("instance-id"); - let immediate = args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - - NeonStorageControllerStopArgs { - instance_id: maybe_instance_id.copied().unwrap_or(1), - immediate, - } -} - -async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - match sub_match.subcommand() { - Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)? - .start(get_start_timeout(subcommand_args)) +async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> Result<()> { + match subcmd { + PageserverCmd::Start(args) => { + if let Err(e) = get_pageserver(env, args.pageserver_id)? + .start(&args.start_timeout) .await { eprintln!("pageserver start failed: {e}"); @@ -1100,34 +1519,36 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } - Some(("stop", subcommand_args)) => { - let immediate = subcommand_args - .get_one::("stop-mode") - .map(|s| s.as_str()) - == Some("immediate"); - - if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) { + PageserverCmd::Stop(args) => { + let immediate = match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; + if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) { eprintln!("pageserver stop failed: {}", e); exit(1); } } - Some(("restart", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; + PageserverCmd::Restart(args) => { + let pageserver = get_pageserver(env, args.pageserver_id)?; //TODO what shutdown strategy should we use here? if let Err(e) = pageserver.stop(false) { eprintln!("pageserver stop failed: {}", e); exit(1); } - if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await { + if let Err(e) = pageserver.start(&args.start_timeout).await { eprintln!("pageserver start failed: {e}"); exit(1); } } - Some(("status", subcommand_args)) => { - match get_pageserver(env, subcommand_args)?.check_status().await { + PageserverCmd::Status(args) => { + match get_pageserver(env, args.pageserver_id)? + .check_status() + .await + { Ok(_) => println!("Page server is up and running"), Err(err) => { eprintln!("Page server is not available: {}", err); @@ -1135,34 +1556,42 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } } - - Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), - None => bail!("no pageserver subcommand provided"), } Ok(()) } async fn handle_storage_controller( - sub_match: &ArgMatches, + subcmd: &StorageControllerCmd, env: &local_env::LocalEnv, ) -> Result<()> { let svc = StorageController::from_env(env); - match sub_match.subcommand() { - Some(("start", start_match)) => { - if let Err(e) = svc.start(storage_controller_start_args(start_match)).await { + match subcmd { + StorageControllerCmd::Start(args) => { + let start_args = NeonStorageControllerStartArgs { + instance_id: args.instance_id, + base_port: args.base_port, + start_timeout: args.start_timeout, + }; + + if let Err(e) = svc.start(start_args).await { eprintln!("start failed: {e}"); exit(1); } } - Some(("stop", stop_match)) => { - if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await { + StorageControllerCmd::Stop(args) => { + let stop_args = NeonStorageControllerStopArgs { + instance_id: args.instance_id, + immediate: match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }, + }; + if let Err(e) = svc.stop(stop_args).await { eprintln!("stop failed: {}", e); exit(1); } } - Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name), - None => bail!("no storage_controller subcommand provided"), } Ok(()) } @@ -1175,111 +1604,77 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result Vec { - init_match - .get_many::("safekeeper-extra-opt") - .into_iter() - .flatten() - .map(|s| s.to_owned()) - .collect() -} +async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> Result<()> { + match subcmd { + SafekeeperCmd::Start(args) => { + let safekeeper = get_safekeeper(env, args.id)?; -async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(safekeeper_command_data) => safekeeper_command_data, - None => bail!("no safekeeper subcommand provided"), - }; - - // All the commands take an optional safekeeper name argument - let sk_id = if let Some(id_str) = sub_args.get_one::("id") { - NodeId(id_str.parse().context("while parsing safekeeper id")?) - } else { - DEFAULT_SAFEKEEPER_ID - }; - let safekeeper = get_safekeeper(env, sk_id)?; - - match sub_name { - "start" => { - let extra_opts = safekeeper_extra_opts(sub_args); - - if let Err(e) = safekeeper - .start(extra_opts, get_start_timeout(sub_args)) - .await - { + if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { eprintln!("safekeeper start failed: {}", e); exit(1); } } - "stop" => { - let immediate = - sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - + SafekeeperCmd::Stop(args) => { + let safekeeper = get_safekeeper(env, args.id)?; + let immediate = match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {}", e); exit(1); } } - "restart" => { - let immediate = - sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + SafekeeperCmd::Restart(args) => { + let safekeeper = get_safekeeper(env, args.id)?; + let immediate = match args.stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; if let Err(e) = safekeeper.stop(immediate) { eprintln!("safekeeper stop failed: {}", e); exit(1); } - let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper - .start(extra_opts, get_start_timeout(sub_args)) - .await - { + if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { eprintln!("safekeeper start failed: {}", e); exit(1); } } - - _ => { - bail!("Unexpected safekeeper subcommand '{}'", sub_name) - } } Ok(()) } -async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(broker_command_data) => broker_command_data, - None => bail!("no broker subcommand provided"), - }; - - match sub_name { - "start" => { - if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await { +async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> { + match subcmd { + StorageBrokerCmd::Start(args) => { + if let Err(e) = broker::start_broker_process(env, &args.start_timeout).await { eprintln!("broker start failed: {e}"); exit(1); } } - "stop" => { + StorageBrokerCmd::Stop(_args) => { + // FIXME: stop_mode unused if let Err(e) = broker::stop_broker_process(env) { eprintln!("broker stop failed: {e}"); exit(1); } } - - _ => bail!("Unexpected broker subcommand '{}'", sub_name), } Ok(()) } async fn handle_start_all( + args: &StartCmdArgs, env: &'static local_env::LocalEnv, - retry_timeout: &Duration, ) -> anyhow::Result<()> { - let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else { - neon_start_status_check(env, retry_timeout) + // FIXME: this was called "retry_timeout", is it right? + let Err(errors) = handle_start_all_impl(env, args.timeout).await else { + neon_start_status_check(env, args.timeout.as_ref()) .await .context("status check after successful startup of all services")?; return Ok(()); @@ -1304,7 +1699,7 @@ async fn handle_start_all( /// Otherwise, returns the list of errors that occurred during startup. async fn handle_start_all_impl( env: &'static local_env::LocalEnv, - retry_timeout: Duration, + retry_timeout: humantime::Duration, ) -> Result<(), Vec> { // Endpoints are not started automatically @@ -1324,7 +1719,7 @@ async fn handle_start_all_impl( let storage_controller = StorageController::from_env(env); storage_controller .start(NeonStorageControllerStartArgs::with_default_instance_id( - retry_timeout.into(), + retry_timeout, )) .await .map_err(|e| e.context("start storage_controller")) @@ -1345,7 +1740,7 @@ async fn handle_start_all_impl( js.spawn(async move { let safekeeper = SafekeeperNode::from_env(env, node); safekeeper - .start(vec![], &retry_timeout) + .start(&[], &retry_timeout) .await .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) }); @@ -1425,9 +1820,11 @@ async fn neon_start_status_check( anyhow::bail!("\nNeon passed status check") } -async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let immediate = - sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); +async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Result<()> { + let immediate = match args.mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; try_stop_all(env, immediate).await; @@ -1485,400 +1882,3 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } } } - -fn cli() -> Command { - let timeout_arg = Arg::new("start-timeout") - .long("start-timeout") - .short('t') - .global(true) - .help("timeout until we fail the command, e.g. 30s") - .value_parser(value_parser!(humantime::Duration)) - .default_value("10s") - .required(false); - - let branch_name_arg = Arg::new("branch-name") - .long("branch-name") - .help("Name of the branch to be created or used as an alias for other services") - .required(false); - - let endpoint_id_arg = Arg::new("endpoint_id") - .help("Postgres endpoint id") - .required(false); - - let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); - - // --id, when using a pageserver command - let pageserver_id_arg = Arg::new("pageserver-id") - .long("id") - .global(true) - .help("pageserver id") - .required(false); - // --pageserver-id when using a non-pageserver command - let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id") - .long("pageserver-id") - .required(false); - - let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt") - .short('e') - .long("safekeeper-extra-opt") - .num_args(1) - .action(ArgAction::Append) - .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo") - .required(false); - - let tenant_id_arg = Arg::new("tenant-id") - .long("tenant-id") - .help("Tenant id. Represented as a hexadecimal string 32 symbols length") - .required(false); - - let timeline_id_arg = Arg::new("timeline-id") - .long("timeline-id") - .help("Timeline id. Represented as a hexadecimal string 32 symbols length") - .required(false); - - let pg_version_arg = Arg::new("pg-version") - .long("pg-version") - .help("Postgres version to use for the initial tenant") - .required(false) - .value_parser(value_parser!(u32)) - .default_value(DEFAULT_PG_VERSION); - - let pg_port_arg = Arg::new("pg-port") - .long("pg-port") - .required(false) - .value_parser(value_parser!(u16)) - .value_name("pg-port"); - - let http_port_arg = Arg::new("http-port") - .long("http-port") - .required(false) - .value_parser(value_parser!(u16)) - .value_name("http-port"); - - let safekeepers_arg = Arg::new("safekeepers") - .long("safekeepers") - .required(false) - .value_name("safekeepers"); - - let stop_mode_arg = Arg::new("stop-mode") - .short('m') - .value_parser(["fast", "immediate"]) - .default_value("fast") - .help("If 'immediate', don't flush repository data at shutdown") - .required(false) - .value_name("stop-mode"); - - let remote_ext_config_args = Arg::new("remote-ext-config") - .long("remote-ext-config") - .num_args(1) - .help("Configure the remote extensions storage proxy gateway to request for extensions.") - .required(false); - - let lsn_arg = Arg::new("lsn") - .long("lsn") - .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") - .required(false); - - let hot_standby_arg = Arg::new("hot-standby") - .value_parser(value_parser!(bool)) - .long("hot-standby") - .help("If set, the node will be a hot replica on the specified timeline") - .required(false); - - let force_arg = Arg::new("force") - .value_parser(value_parser!(InitForceMode)) - .long("force") - .default_value( - InitForceMode::MustNotExist - .to_possible_value() - .unwrap() - .get_name() - .to_owned(), - ) - .help("Force initialization even if the repository is not empty") - .required(false); - - let num_pageservers_arg = Arg::new("num-pageservers") - .value_parser(value_parser!(u16)) - .long("num-pageservers") - .help("How many pageservers to create (default 1)"); - - let update_catalog = Arg::new("update-catalog") - .value_parser(value_parser!(bool)) - .long("update-catalog") - .help("If set, will set up the catalog for neon_superuser") - .required(false); - - let create_test_user = Arg::new("create-test-user") - .value_parser(value_parser!(bool)) - .long("create-test-user") - .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") - .required(false); - - let allow_multiple = Arg::new("allow-multiple") - .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.") - .long("allow-multiple") - .action(ArgAction::SetTrue) - .required(false); - - let instance_id = Arg::new("instance-id") - .long("instance-id") - .help("Identifier used to distinguish storage controller instances (default 1)") - .value_parser(value_parser!(u8)) - .required(false); - - let base_port = Arg::new("base-port") - .long("base-port") - .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)") - .value_parser(value_parser!(u16)) - .required(false); - - Command::new("Neon CLI") - .arg_required_else_help(true) - .version(GIT_VERSION) - .subcommand( - Command::new("init") - .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(num_pageservers_arg.clone()) - .arg( - Arg::new("config") - .long("config") - .required(false) - .value_parser(value_parser!(PathBuf)) - .value_name("config") - ) - .arg(force_arg) - ) - .subcommand( - Command::new("timeline") - .about("Manage timelines") - .arg_required_else_help(true) - .subcommand(Command::new("list") - .about("List all timelines, available to this pageserver") - .arg(tenant_id_arg.clone())) - .subcommand(Command::new("branch") - .about("Create a new timeline, using another timeline as a base, copying its data") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") - .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) - .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn") - .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) - .subcommand(Command::new("create") - .about("Create a new blank timeline") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(pg_version_arg.clone()) - ) - .subcommand(Command::new("import") - .about("Import timeline from basebackup directory") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(Arg::new("base-tarfile") - .long("base-tarfile") - .value_parser(value_parser!(PathBuf)) - .help("Basebackup tarfile to import") - ) - .arg(Arg::new("base-lsn").long("base-lsn") - .help("Lsn the basebackup starts at")) - .arg(Arg::new("wal-tarfile") - .long("wal-tarfile") - .value_parser(value_parser!(PathBuf)) - .help("Wal to add after base") - ) - .arg(Arg::new("end-lsn").long("end-lsn") - .help("Lsn the basebackup ends at")) - .arg(pg_version_arg.clone()) - ) - ).subcommand( - Command::new("tenant") - .arg_required_else_help(true) - .about("Manage tenants") - .subcommand(Command::new("list")) - .subcommand(Command::new("create") - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) - .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) - .arg(pg_version_arg.clone()) - .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false) - .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) - .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) - .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) - .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant")) - ) - .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) - .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) - .subcommand(Command::new("config") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) - .about("Import a tenant that is present in remote storage, and create branches for its timelines")) - ) - .subcommand( - Command::new("pageserver") - .arg_required_else_help(true) - .about("Manage pageserver") - .arg(pageserver_id_arg) - .subcommand(Command::new("status")) - .subcommand(Command::new("start") - .about("Start local pageserver") - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("stop") - .about("Stop local pageserver") - .arg(stop_mode_arg.clone()) - ) - .subcommand(Command::new("restart") - .about("Restart local pageserver") - .arg(timeout_arg.clone()) - ) - ) - .subcommand( - Command::new("storage_controller") - .arg_required_else_help(true) - .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start storage controller") - .arg(timeout_arg.clone()) - .arg(instance_id.clone()) - .arg(base_port)) - .subcommand(Command::new("stop").about("Stop storage controller") - .arg(stop_mode_arg.clone()) - .arg(instance_id)) - ) - .subcommand( - Command::new("storage_broker") - .arg_required_else_help(true) - .about("Manage broker") - .subcommand(Command::new("start") - .about("Start broker") - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("stop") - .about("Stop broker") - .arg(stop_mode_arg.clone()) - ) - ) - .subcommand( - Command::new("safekeeper") - .arg_required_else_help(true) - .about("Manage safekeepers") - .subcommand(Command::new("start") - .about("Start local safekeeper") - .arg(safekeeper_id_arg.clone()) - .arg(safekeeper_extra_opt_arg.clone()) - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("stop") - .about("Stop local safekeeper") - .arg(safekeeper_id_arg.clone()) - .arg(stop_mode_arg.clone()) - ) - .subcommand(Command::new("restart") - .about("Restart local safekeeper") - .arg(safekeeper_id_arg) - .arg(stop_mode_arg.clone()) - .arg(safekeeper_extra_opt_arg) - .arg(timeout_arg.clone()) - ) - ) - .subcommand( - Command::new("endpoint") - .arg_required_else_help(true) - .about("Manage postgres instances") - .subcommand(Command::new("list").arg(tenant_id_arg.clone())) - .subcommand(Command::new("create") - .about("Create a compute endpoint") - .arg(endpoint_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(lsn_arg.clone()) - .arg(pg_port_arg.clone()) - .arg(http_port_arg.clone()) - .arg(endpoint_pageserver_id_arg.clone()) - .arg( - Arg::new("config-only") - .help("Don't do basebackup, create endpoint directory with only config files") - .long("config-only") - .required(false)) - .arg(pg_version_arg.clone()) - .arg(hot_standby_arg.clone()) - .arg(update_catalog) - .arg(allow_multiple.clone()) - ) - .subcommand(Command::new("start") - .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") - .arg(endpoint_id_arg.clone()) - .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg.clone()) - .arg(remote_ext_config_args) - .arg(create_test_user) - .arg(allow_multiple.clone()) - .arg(timeout_arg.clone()) - ) - .subcommand(Command::new("reconfigure") - .about("Reconfigure the endpoint") - .arg(endpoint_pageserver_id_arg) - .arg(safekeepers_arg) - .arg(endpoint_id_arg.clone()) - .arg(tenant_id_arg.clone()) - ) - .subcommand( - Command::new("stop") - .arg(endpoint_id_arg) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - .action(ArgAction::SetTrue) - .required(false) - ) - .arg( - Arg::new("mode") - .help("Postgres shutdown mode, passed to \"pg_ctl -m \"") - .long("mode") - .action(ArgAction::Set) - .required(false) - .value_parser(["smart", "fast", "immediate"]) - .default_value("fast") - ) - ) - - ) - .subcommand( - Command::new("mappings") - .arg_required_else_help(true) - .about("Manage neon_local branch name mappings") - .subcommand( - Command::new("map") - .about("Create new mapping which cannot exist already") - .arg(branch_name_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(timeline_id_arg.clone()) - ) - ) - // Obsolete old name for 'endpoint'. We now just print an error if it's used. - .subcommand( - Command::new("pg") - .hide(true) - .arg(Arg::new("ignore-rest").allow_hyphen_values(true).num_args(0..).required(false)) - .trailing_var_arg(true) - ) - .subcommand( - Command::new("start") - .about("Start page server and safekeepers") - .arg(timeout_arg.clone()) - ) - .subcommand( - Command::new("stop") - .about("Stop page server and safekeepers") - .arg(stop_mode_arg) - ) -} - -#[test] -fn verify_cli() { - cli().debug_assert(); -} diff --git a/control_plane/src/branch_mappings.rs b/control_plane/src/branch_mappings.rs new file mode 100644 index 0000000000..e89313df39 --- /dev/null +++ b/control_plane/src/branch_mappings.rs @@ -0,0 +1,94 @@ +//! Branch mappings for convenience + +use std::collections::HashMap; +use std::fs; +use std::path::Path; + +use anyhow::{bail, Context}; +use serde::{Deserialize, Serialize}; + +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user. +#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct BranchMappings { + /// Default tenant ID to use with the 'neon_local' command line utility, when + /// --tenant_id is not explicitly specified. This comes from the branches. + pub default_tenant_id: Option, + + // A `HashMap>` would be more appropriate here, + // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. + // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub mappings: HashMap>, +} + +impl BranchMappings { + pub fn register_branch_mapping( + &mut self, + branch_name: String, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let existing_values = self.mappings.entry(branch_name.clone()).or_default(); + + let existing_ids = existing_values + .iter() + .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); + + if let Some((_, old_timeline_id)) = existing_ids { + if old_timeline_id == &timeline_id { + Ok(()) + } else { + bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); + } + } else { + existing_values.push((tenant_id, timeline_id)); + Ok(()) + } + } + + pub fn get_branch_timeline_id( + &self, + branch_name: &str, + tenant_id: TenantId, + ) -> Option { + // If it looks like a timeline ID, return it as it is + if let Ok(timeline_id) = branch_name.parse::() { + return Some(timeline_id); + } + + self.mappings + .get(branch_name)? + .iter() + .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) + .map(|&(_, timeline_id)| timeline_id) + .map(TimelineId::from) + } + + pub fn timeline_name_mappings(&self) -> HashMap { + self.mappings + .iter() + .flat_map(|(name, tenant_timelines)| { + tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { + (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) + }) + }) + .collect() + } + + pub fn persist(&self, path: &Path) -> anyhow::Result<()> { + let content = &toml::to_string_pretty(self)?; + fs::write(path, content).with_context(|| { + format!( + "Failed to write branch information into path '{}'", + path.display() + ) + }) + } + + pub fn load(path: &Path) -> anyhow::Result { + let branches_file_contents = fs::read_to_string(path)?; + Ok(toml::from_str(branches_file_contents.as_str())?) + } +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7554a03a68..18f396b886 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -561,6 +561,7 @@ impl Endpoint { operation_uuid: None, features: self.features.clone(), swap_size_bytes: None, + disk_quota_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 573f1688d5..7a019bce88 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -113,7 +113,7 @@ impl SafekeeperNode { pub async fn start( &self, - extra_opts: Vec, + extra_opts: &[String], retry_timeout: &Duration, ) -> anyhow::Result<()> { print!( @@ -196,7 +196,7 @@ impl SafekeeperNode { ]); } - args.extend(extra_opts); + args.extend_from_slice(extra_opts); background_process::start_process( &format!("safekeeper-{id}"), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 0c0e67dff0..36e5e04c86 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -347,7 +347,7 @@ impl StorageController { if !tokio::fs::try_exists(&pg_data_path).await? { let initdb_args = [ - "-D", + "--pgdata", pg_data_path.as_ref(), "--username", &username(), diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 883c624f71..83515a00a0 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -50,6 +50,16 @@ pub struct ComputeSpec { #[serde(default)] pub swap_size_bytes: Option, + /// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs + /// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the + /// spec is first received. + /// + /// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of disk quota. + #[serde(default)] + pub disk_quota_bytes: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, @@ -268,6 +278,22 @@ pub struct GenericOption { /// declare a `trait` on it. pub type GenericOptions = Option>; +/// Configured the local-proxy application with the relevant JWKS and roles it should +/// use for authorizing connect requests using JWT. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct LocalProxySpec { + pub jwks: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct JwksSettings { + pub id: String, + pub role_names: Vec, + pub jwks_url: String, + pub provider_name: String, + pub jwt_audience: Option, +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index e274d24585..085540e7b9 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String { } fn log_query_error(query: &str, e: &QueryError) { + // If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`. match e { QueryError::Disconnected(ConnectionError::Io(io_error)) => { if is_expected_io_error(io_error) { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 5c0abda522..9524a5149b 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -93,9 +93,9 @@ impl Conf { ); let output = self .new_pg_command("initdb")? - .arg("-D") + .arg("--pgdata") .arg(&self.datadir) - .args(["-U", "postgres", "--no-instructions", "--no-sync"]) + .args(["--username", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); ensure!( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 2d95ac42e6..592f1ded0d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -736,4 +736,22 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn timeline_init_lsn_lease( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease", + self.mgmt_api_endpoint, + ); + + self.request(Method::POST, &uri, LsnLeaseRequest { lsn }) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d15a0e47a4..e9e52acee6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::config::PageserverIdentity; -use pageserver::control_plane_client::ControlPlaneClient; +use pageserver::controller_upcall_client::ControllerUpcallClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; @@ -396,7 +396,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - ControlPlaneClient::new(conf, &shutdown_pageserver), + ControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); if let Some(deletion_workers) = deletion_workers { diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/controller_upcall_client.rs similarity index 80% rename from pageserver/src/control_plane_client.rs rename to pageserver/src/controller_upcall_client.rs index d0a967b920..73fc6dc3ab 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; use pageserver_api::config::NodeMetadata; -/// The Pageserver's client for using the control plane API: this is a small subset -/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) -pub struct ControlPlaneClient { +/// The Pageserver's client for using the storage controller upcall API: this is a small API +/// for dealing with generations (see docs/rfcs/025-generation-numbers.md). +/// +/// The server presenting this API may either be the storage controller or some other +/// service (such as the Neon control plane) providing a store of generation numbers. +pub struct ControllerUpcallClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, @@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi { ) -> impl Future, RetryForeverError>> + Send; } -impl ControlPlaneClient { +impl ControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { @@ -114,7 +117,7 @@ impl ControlPlaneClient { } } -impl ControlPlaneGenerationsApi for ControlPlaneClient { +impl ControlPlaneGenerationsApi for ControllerUpcallClient { /// Block until we get a successful response, or error out if we are shut down async fn re_attach( &self, @@ -216,29 +219,38 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .join("validate") .expect("Failed to build validate path"); - let request = ValidateRequest { - tenants: tenants - .into_iter() - .map(|(id, gen)| ValidateRequestTenant { - id, - gen: gen - .into() - .expect("Generation should always be valid for a Tenant doing deletions"), - }) - .collect(), - }; + // When sending validate requests, break them up into chunks so that we + // avoid possible edge cases of generating any HTTP requests that + // require database I/O across many thousands of tenants. + let mut result: HashMap = HashMap::with_capacity(tenants.len()); + for tenant_chunk in (tenants).chunks(128) { + let request = ValidateRequest { + tenants: tenant_chunk + .iter() + .map(|(id, generation)| ValidateRequestTenant { + id: *id, + gen: (*generation).into().expect( + "Generation should always be valid for a Tenant doing deletions", + ), + }) + .collect(), + }; - failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); - if self.cancel.is_cancelled() { - return Err(RetryForeverError::ShuttingDown); + failpoint_support::sleep_millis_async!( + "control-plane-client-validate-sleep", + &self.cancel + ); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } + + let response: ValidateResponse = + self.retry_http_forever(&re_attach_path, request).await?; + for rt in response.tenants { + result.insert(rt.id, rt.valid); + } } - let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; - - Ok(response - .tenants - .into_iter() - .map(|rt| (rt.id, rt.valid)) - .collect()) + Ok(result.into_iter().collect()) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 22f7d5b824..73bdc90213 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; @@ -622,7 +622,7 @@ impl DeletionQueue { /// If remote_storage is None, then the returned workers will also be None. pub fn new( remote_storage: GenericRemoteStorage, - control_plane_client: Option, + controller_upcall_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) where @@ -662,7 +662,7 @@ impl DeletionQueue { conf, backend_rx, executor_tx, - control_plane_client, + controller_upcall_client, lsn_table.clone(), cancel.clone(), ), @@ -704,7 +704,7 @@ mod test { use tokio::task::JoinHandle; use crate::{ - control_plane_client::RetryForeverError, + controller_upcall_client::RetryForeverError, repository::Key, tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index d215fd2b7d..1d55581ebd 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -25,8 +25,8 @@ use tracing::info; use tracing::warn; use crate::config::PageServerConf; -use crate::control_plane_client::ControlPlaneGenerationsApi; -use crate::control_plane_client::RetryForeverError; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::controller_upcall_client::RetryForeverError; use crate::metrics; use crate::virtual_file::MaybeFatalIo; @@ -61,7 +61,7 @@ where tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes - control_plane_client: Option, + controller_upcall_client: Option, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. @@ -94,7 +94,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - control_plane_client: Option, + controller_upcall_client: Option, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -102,7 +102,7 @@ where conf, rx, tx, - control_plane_client, + controller_upcall_client, lsn_table, pending_lists: Vec::new(), validated_lists: Vec::new(), @@ -145,8 +145,8 @@ where return Ok(()); } - let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client { - match control_plane_client + let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { + match controller_upcall_client .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) .await { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6a10d4fb1c..6f0402e7b0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -589,6 +589,10 @@ async fn timeline_create_handler( StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()), ), + Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response( + StatusCode::NOT_ACCEPTABLE, + HttpErrorBody::from_msg(e.to_string()), + ), Err(tenant::CreateTimelineError::ShuttingDown) => json_response( StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("tenant shutting down".to_string()), @@ -820,7 +824,7 @@ async fn get_lsn_by_timestamp_handler( let lease = if with_lease { timeline - .make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) + .init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx) .inspect_err(|_| { warn!("fail to grant a lease to {}", lsn); }) @@ -1688,9 +1692,18 @@ async fn lsn_lease_handler( let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; - let result = timeline - .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) - .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + let result = async { + timeline + .init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) + .map_err(|e| { + ApiError::InternalServerError( + e.context(format!("invalid lsn lease request at {lsn}")), + ) + }) + } + .instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await?; json_response(StatusCode::OK, result) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 7a9cf495c7..08abfbd647 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -6,7 +6,7 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; -pub mod control_plane_client; +pub mod controller_upcall_client; pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 366bd82903..b76efa5b48 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,6 +8,8 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; +use postgres_backend::{is_expected_io_error, QueryError}; +use pq_proto::framed::ConnectionError; use strum::{EnumCount, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; @@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { pub(crate) struct BasebackupQueryTime { ok: Histogram, error: Histogram, + client_error: Histogram, } pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { @@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| BasebackupQueryTime { ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), error: vec.get_metric_with_label_values(&["error"]).unwrap(), + client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(), } }); @@ -1557,7 +1561,7 @@ impl BasebackupQueryTime { } impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { - pub(crate) fn observe(self, res: &Result) { + pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self .ctx @@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { elapsed } }; - let metric = if res.is_ok() { - &self.parent.ok - } else { - &self.parent.error + // If you want to change categorize of a specific error, also change it in `log_query_error`. + let metric = match res { + Ok(_) => &self.parent.ok, + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) + if is_expected_io_error(io_error) => + { + &self.parent.client_error + } + Err(_) => &self.parent.error, }; metric.observe(ex_throttled.as_secs_f64()); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 9261b7481d..8fa6b9a7f0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -273,10 +273,20 @@ async fn page_service_conn_main( info!("Postgres client disconnected ({io_error})"); Ok(()) } else { - Err(io_error).context("Postgres connection error") + let tenant_id = conn_handler.timeline_handles.tenant_id(); + Err(io_error).context(format!( + "Postgres connection error for tenant_id={:?} client at peer_addr={}", + tenant_id, peer_addr + )) } } - other => other.context("Postgres query error"), + other => { + let tenant_id = conn_handler.timeline_handles.tenant_id(); + other.context(format!( + "Postgres query error for tenant_id={:?} client peer_addr={}", + tenant_id, peer_addr + )) + } } } @@ -340,6 +350,10 @@ impl TimelineHandles { } }) } + + fn tenant_id(&self) -> Option { + self.wrapper.tenant_id.get().copied() + } } pub(crate) struct TenantManagerWrapper { @@ -819,7 +833,7 @@ impl PageServerHandler { set_tracing_field_shard_id(&timeline); let lease = timeline - .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) + .renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) .inspect_err(|e| { warn!("{e}"); }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 53cbaea621..db88303f7b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; use pageserver_api::models::TopTenantShardItem; @@ -182,27 +183,54 @@ pub struct TenantSharedResources { pub(super) struct AttachedTenantConf { tenant_conf: TenantConfOpt, location: AttachedLocationConfig, + /// The deadline before which we are blocked from GC so that + /// leases have a chance to be renewed. + lsn_lease_deadline: Option, } impl AttachedTenantConf { fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + // Sets a deadline before which we cannot proceed to GC due to lsn lease. + // + // We do this as the leases mapping are not persisted to disk. By delaying GC by lease + // length, we guarantee that all the leases we granted before will have a chance to renew + // when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. + let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single { + Some( + tokio::time::Instant::now() + + tenant_conf + .lsn_lease_length + .unwrap_or(LsnLease::DEFAULT_LENGTH), + ) + } else { + // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale + // because we don't do GC in these modes. + None + }; + Self { tenant_conf, location, + lsn_lease_deadline, } } fn try_from(location_conf: LocationConf) -> anyhow::Result { match &location_conf.mode { - LocationMode::Attached(attach_conf) => Ok(Self { - tenant_conf: location_conf.tenant_conf, - location: *attach_conf, - }), + LocationMode::Attached(attach_conf) => { + Ok(Self::new(location_conf.tenant_conf, *attach_conf)) + } LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") } } } + + fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { + self.lsn_lease_deadline + .map(|d| tokio::time::Instant::now() < d) + .unwrap_or(false) + } } struct TimelinePreload { timeline_id: TimelineId, @@ -563,6 +591,8 @@ pub enum CreateTimelineError { AncestorLsn(anyhow::Error), #[error("ancestor timeline is not active")] AncestorNotActive, + #[error("ancestor timeline is archived")] + AncestorArchived, #[error("tenant shutting down")] ShuttingDown, #[error(transparent)] @@ -1698,6 +1728,11 @@ impl Tenant { return Err(CreateTimelineError::AncestorNotActive); } + if ancestor_timeline.is_archived() == Some(true) { + info!("tried to branch archived timeline"); + return Err(CreateTimelineError::AncestorArchived); + } + if let Some(lsn) = ancestor_start_lsn.as_mut() { *lsn = lsn.align(); @@ -1815,6 +1850,11 @@ impl Tenant { info!("Skipping GC in location state {:?}", conf.location); return Ok(GcResult::default()); } + + if conf.is_gc_blocked_by_lsn_lease_deadline() { + info!("Skipping GC because lsn lease deadline is not reached"); + return Ok(GcResult::default()); + } } let _guard = match self.gc_block.start().await { @@ -2623,6 +2663,8 @@ impl Tenant { Arc::new(AttachedTenantConf { tenant_conf: new_tenant_conf.clone(), location: inner.location, + // Attached location is not changed, no need to update lsn lease deadline. + lsn_lease_deadline: inner.lsn_lease_deadline, }) }); @@ -3880,9 +3922,9 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; let initdb_command = tokio::process::Command::new(&initdb_bin_path) - .args(["-D", initdb_target_dir.as_ref()]) - .args(["-U", &conf.superuser]) - .args(["-E", "utf8"]) + .args(["--pgdata", initdb_target_dir.as_ref()]) + .args(["--username", &conf.superuser]) + .args(["--encoding", "utf8"]) .arg("--no-instructions") .arg("--no-sync") .env_clear() @@ -4454,13 +4496,17 @@ mod tests { tline.freeze_and_flush().await.map_err(|e| e.into()) } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") .await? .load() .await; + // Advance to the lsn lease deadline so that GC is not blocked by + // initial transition into AttachedSingle. + tokio::time::advance(tenant.get_lsn_lease_length()).await; + tokio::time::resume(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7237,9 +7283,17 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_lsn_lease() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await; + let (tenant, ctx) = TenantHarness::create("test_lsn_lease") + .await + .unwrap() + .load() + .await; + // Advance to the lsn lease deadline so that GC is not blocked by + // initial transition into AttachedSingle. + tokio::time::advance(tenant.get_lsn_lease_length()).await; + tokio::time::resume(); let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -7267,24 +7321,33 @@ mod tests { let leased_lsns = [0x30, 0x50, 0x70]; let mut leases = Vec::new(); - let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| { - leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?); - Ok(()) + leased_lsns.iter().for_each(|n| { + leases.push( + timeline + .init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx) + .expect("lease request should succeed"), + ); }); - // Renewing with shorter lease should not change the lease. - let updated_lease_0 = - timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?; - assert_eq!(updated_lease_0.valid_until, leases[0].valid_until); + let updated_lease_0 = timeline + .renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx) + .expect("lease renewal should succeed"); + assert_eq!( + updated_lease_0.valid_until, leases[0].valid_until, + " Renewing with shorter lease should not change the lease." + ); - // Renewing with a long lease should renew lease with later expiration time. - let updated_lease_1 = timeline.make_lsn_lease( - Lsn(leased_lsns[1]), - timeline.get_lsn_lease_length() * 2, - &ctx, - )?; - - assert!(updated_lease_1.valid_until > leases[1].valid_until); + let updated_lease_1 = timeline + .renew_lsn_lease( + Lsn(leased_lsns[1]), + timeline.get_lsn_lease_length() * 2, + &ctx, + ) + .expect("lease renewal should succeed"); + assert!( + updated_lease_1.valid_until > leases[1].valid_until, + "Renewing with a long lease should renew lease with later expiration time." + ); // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. info!( @@ -7301,7 +7364,8 @@ mod tests { &CancellationToken::new(), &ctx, ) - .await?; + .await + .unwrap(); // Keeping everything <= Lsn(0x80) b/c leases: // 0/10: initdb layer @@ -7315,13 +7379,16 @@ mod tests { // Make lease on a already GC-ed LSN. // 0/80 does not have a valid lease + is below latest_gc_cutoff assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); - let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx); - assert!(res.is_err()); + timeline + .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx) + .expect_err("lease request on GC-ed LSN should fail"); // Should still be able to renew a currently valid lease // Assumption: original lease to is still valid for 0/50. - let _ = - timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?; + // (use `Timeline::init_lsn_lease` for testing so it always does validation) + timeline + .init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx) + .expect("lease renewal with validation should succeed"); Ok(()) } diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index 8eaa8a001c..1e8fa8d1d6 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -5,6 +5,7 @@ use itertools::Itertools; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). +/// /// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index 1271d25b76..f7a7836a12 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,29 +1,12 @@ -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; -use super::remote_timeline_client::index::GcBlockingReason; -use tokio::time::Instant; use utils::id::TimelineId; -type TimelinesBlocked = HashMap>; +use super::remote_timeline_client::index::GcBlockingReason; -#[derive(Default)] -struct Storage { - timelines_blocked: TimelinesBlocked, - /// The deadline before which we are blocked from GC so that - /// leases have a chance to be renewed. - lsn_lease_deadline: Option, -} +type Storage = HashMap>; -impl Storage { - fn is_blocked_by_lsn_lease_deadline(&self) -> bool { - self.lsn_lease_deadline - .map(|d| Instant::now() < d) - .unwrap_or(false) - } -} - -/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc -/// blocking. +/// GcBlock provides persistent (per-timeline) gc blocking. #[derive(Default)] pub(crate) struct GcBlock { /// The timelines which have current reasons to block gc. @@ -66,17 +49,6 @@ impl GcBlock { } } - /// Sets a deadline before which we cannot proceed to GC due to lsn lease. - /// - /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease - /// length, we guarantee that all the leases we granted before will have a chance to renew - /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. - pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) { - let deadline = Instant::now() + lsn_lease_length; - let mut g = self.reasons.lock().unwrap(); - g.lsn_lease_deadline = Some(deadline); - } - /// Describe the current gc blocking reasons. /// /// TODO: make this json serializable. @@ -102,7 +74,7 @@ impl GcBlock { ) -> anyhow::Result { let (added, uploaded) = { let mut g = self.reasons.lock().unwrap(); - let set = g.timelines_blocked.entry(timeline.timeline_id).or_default(); + let set = g.entry(timeline.timeline_id).or_default(); let added = set.insert(reason); // LOCK ORDER: intentionally hold the lock, see self.reasons. @@ -133,7 +105,7 @@ impl GcBlock { let (remaining_blocks, uploaded) = { let mut g = self.reasons.lock().unwrap(); - match g.timelines_blocked.entry(timeline.timeline_id) { + match g.entry(timeline.timeline_id) { Entry::Occupied(mut oe) => { let set = oe.get_mut(); set.remove(reason); @@ -147,7 +119,7 @@ impl GcBlock { } } - let remaining_blocks = g.timelines_blocked.len(); + let remaining_blocks = g.len(); // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons let uploaded = timeline @@ -172,11 +144,11 @@ impl GcBlock { pub(crate) fn before_delete(&self, timeline: &super::Timeline) { let unblocked = { let mut g = self.reasons.lock().unwrap(); - if g.timelines_blocked.is_empty() { + if g.is_empty() { return; } - g.timelines_blocked.remove(&timeline.timeline_id); + g.remove(&timeline.timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; @@ -187,11 +159,10 @@ impl GcBlock { } /// Initialize with the non-deleted timelines of this tenant. - pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) { + pub(crate) fn set_scanned(&self, scanned: Storage) { let mut g = self.reasons.lock().unwrap(); - assert!(g.timelines_blocked.is_empty()); - g.timelines_blocked - .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + assert!(g.is_empty()); + g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { tracing::info!(summary=?reasons, "initialized with gc blocked"); @@ -205,7 +176,6 @@ pub(super) struct Guard<'a> { #[derive(Debug)] pub(crate) struct BlockingReasons { - tenant_blocked_by_lsn_lease_deadline: bool, timelines: usize, reasons: enumset::EnumSet, } @@ -214,8 +184,8 @@ impl std::fmt::Display for BlockingReasons { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}", - self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons + "{} timelines block for {:?}", + self.timelines, self.reasons ) } } @@ -223,15 +193,13 @@ impl std::fmt::Display for BlockingReasons { impl BlockingReasons { fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { let mut reasons = enumset::EnumSet::empty(); - g.timelines_blocked.retain(|_key, value| { + g.retain(|_key, value| { reasons = reasons.union(*value); !value.is_empty() }); - let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); - if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline { + if !g.is_empty() { Some(BlockingReasons { - tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, - timelines: g.timelines_blocked.len(), + timelines: g.len(), reasons, }) } else { @@ -240,17 +208,14 @@ impl BlockingReasons { } fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { - let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); - if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline { + if g.is_empty() { None } else { let reasons = g - .timelines_blocked .values() .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); Some(BlockingReasons { - tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, - timelines: g.timelines_blocked.len(), + timelines: g.len(), reasons, }) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 1e7c1e10a5..c7212e89ba 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::control_plane_client::{ - ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, +use crate::controller_upcall_client::{ + ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; @@ -122,7 +122,7 @@ pub(crate) enum ShardSelector { Known(ShardIndex), } -/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// A convenience for use with the re_attach ControllerUpcallClient function: rather /// than the serializable struct, we build this enum that encapsulates /// the invariant that attached tenants always have generations. /// @@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { match attach_conf.generation.cmp(&tenant.generation) { Ordering::Equal => { - if attach_conf.attach_mode == AttachmentMode::Single { - tenant - .gc_block - .set_lsn_lease_deadline(tenant.get_lsn_lease_length()); - } - // A transition from Attached to Attached in the same generation, we may // take our fast path and just provide the updated configuration // to the tenant. diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 9fbe2f0da5..97506b7e9a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -178,6 +178,7 @@ async fn download_object<'a>( destination_file .flush() .await + .maybe_fatal_err("download_object sync_all") .with_context(|| format!("flush source file at {dst_path}")) .map_err(DownloadError::Other)?; @@ -185,6 +186,7 @@ async fn download_object<'a>( destination_file .sync_all() .await + .maybe_fatal_err("download_object sync_all") .with_context(|| format!("failed to fsync source file at {dst_path}")) .map_err(DownloadError::Other)?; @@ -232,6 +234,7 @@ async fn download_object<'a>( destination_file .sync_all() .await + .maybe_fatal_err("download_object sync_all") .with_context(|| format!("failed to fsync source file at {dst_path}")) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2b212cfed5..6f9eda85f5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -40,11 +40,11 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadCoalesceMode, VectoredReadPlanner, + VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; @@ -589,7 +589,9 @@ impl DeltaLayerWriterInner { ); // fsync the file - file.sync_all().await?; + file.sync_all() + .await + .maybe_fatal_err("delta_layer sync_all")?; trace!("created delta layer {}", self.path); @@ -1133,7 +1135,7 @@ impl DeltaLayerInner { ctx: &RequestContext, ) -> anyhow::Result { use crate::tenant::vectored_blob_io::{ - BlobMeta, VectoredReadBuilder, VectoredReadExtended, + BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; use futures::stream::TryStreamExt; @@ -1183,8 +1185,8 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; - let mut read_builder: Option = None; - let read_mode = VectoredReadCoalesceMode::get(); + let mut read_builder: Option = None; + let align = virtual_file::get_io_buffer_alignment(); let max_read_size = self .max_vectored_read_bytes @@ -1228,12 +1230,12 @@ impl DeltaLayerInner { { None } else { - read_builder.replace(VectoredReadBuilder::new( + read_builder.replace(ChunkedVectoredReadBuilder::new( offsets.start.pos(), offsets.end.pos(), meta, max_read_size, - read_mode, + align, )) } } else { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 940d169db0..3dcd7bc962 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,7 +41,7 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; @@ -889,7 +889,9 @@ impl ImageLayerWriterInner { // set inner.file here. The first read will have to re-open it. // fsync the file - file.sync_all().await?; + file.sync_all() + .await + .maybe_fatal_err("image_layer sync_all")?; trace!("created image layer {}", self.path); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3f0f8a21c8..547739e773 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -330,7 +330,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; - tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length()); loop { tokio::select! { _ = cancel.cancelled() => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d301ba23ea..2113a1d726 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,6 +66,7 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ + config::AttachmentMode, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, @@ -1324,16 +1325,38 @@ impl Timeline { Ok(()) } - /// Obtains a temporary lease blocking garbage collection for the given LSN. - /// - /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also - /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if - /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and - /// the requesting lease. - pub(crate) fn make_lsn_lease( + /// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`. + pub(crate) fn init_lsn_lease( &self, lsn: Lsn, length: Duration, + ctx: &RequestContext, + ) -> anyhow::Result { + self.make_lsn_lease(lsn, length, true, ctx) + } + + /// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period. + pub(crate) fn renew_lsn_lease( + &self, + lsn: Lsn, + length: Duration, + ctx: &RequestContext, + ) -> anyhow::Result { + self.make_lsn_lease(lsn, length, false, ctx) + } + + /// Obtains a temporary lease blocking garbage collection for the given LSN. + /// + /// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error + /// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present. + /// + /// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease. + /// The returned lease is therefore the maximum between the existing lease and the requesting lease. + fn make_lsn_lease( + &self, + lsn: Lsn, + length: Duration, + init: bool, _ctx: &RequestContext, ) -> anyhow::Result { let lease = { @@ -1347,8 +1370,8 @@ impl Timeline { let entry = gc_info.leases.entry(lsn); - let lease = { - if let Entry::Occupied(mut occupied) = entry { + match entry { + Entry::Occupied(mut occupied) => { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; @@ -1360,20 +1383,28 @@ impl Timeline { } existing_lease.clone() - } else { - // Reject already GC-ed LSN (lsn < latest_gc_cutoff) - let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); - if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + Entry::Vacant(vacant) => { + // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and + // not blocked by the lsn lease deadline. + let validate = { + let conf = self.tenant_conf.load(); + conf.location.attach_mode == AttachmentMode::Single + && !conf.is_gc_blocked_by_lsn_lease_deadline() + }; + + if init || validate { + let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } } let dt: DateTime = valid_until.into(); info!("lease created, valid until {}", dt); - entry.or_insert(LsnLease { valid_until }).clone() + vacant.insert(LsnLease { valid_until }).clone() } - }; - - lease + } }; Ok(lease) @@ -1950,8 +1981,6 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } - // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072 - #[allow(unused)] pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -3601,7 +3630,7 @@ impl Timeline { ctx, ) .await - .map_err(|e| FlushLayerError::from_anyhow(self, e))?; + .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -3840,16 +3869,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { + ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - anyhow::bail!("repartition() called concurrently, this should not happen"); + return Err(CompactionError::Other(anyhow!( + "repartition() called concurrently, this should not happen" + ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; if lsn < *partition_lsn { - anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); + return Err(CompactionError::Other(anyhow!( + "repartition() called with LSN going backwards, this should not happen" + ))); } let distance = lsn.0 - partition_lsn.0; @@ -4451,6 +4484,12 @@ pub(crate) enum CompactionError { Other(anyhow::Error), } +impl CompactionError { + pub fn is_cancelled(&self) -> bool { + matches!(self, CompactionError::ShuttingDown) + } +} + impl From for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6edc28a11b..3de386a2d5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -390,7 +390,7 @@ impl Timeline { // error but continue. // // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { + if !self.cancel.is_cancelled() && !err.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } (1, false) diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index aa37a45898..1faa6bab99 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -185,171 +185,7 @@ pub(crate) enum VectoredReadExtended { No, } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum VectoredReadCoalesceMode { - /// Only coalesce exactly adjacent reads. - AdjacentOnly, - /// In addition to adjacent reads, also consider reads whose corresponding - /// `end` and `start` offsets reside at the same chunk. - Chunked(usize), -} - -impl VectoredReadCoalesceMode { - /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0, - /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. - pub(crate) fn get() -> Self { - let align = virtual_file::get_io_buffer_alignment_raw(); - if align == 0 { - VectoredReadCoalesceMode::AdjacentOnly - } else { - VectoredReadCoalesceMode::Chunked(align) - } - } -} - -pub(crate) enum VectoredReadBuilder { - Adjacent(AdjacentVectoredReadBuilder), - Chunked(ChunkedVectoredReadBuilder), -} - -impl VectoredReadBuilder { - fn new_impl( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - max_read_size: Option, - mode: VectoredReadCoalesceMode, - ) -> Self { - match mode { - VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent( - AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size), - ), - VectoredReadCoalesceMode::Chunked(chunk_size) => { - Self::Chunked(ChunkedVectoredReadBuilder::new( - start_offset, - end_offset, - meta, - max_read_size, - chunk_size, - )) - } - } - } - - pub(crate) fn new( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - max_read_size: usize, - mode: VectoredReadCoalesceMode, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode) - } - - pub(crate) fn new_streaming( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - mode: VectoredReadCoalesceMode, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, None, mode) - } - - pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { - match self { - VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta), - VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta), - } - } - - pub(crate) fn build(self) -> VectoredRead { - match self { - VectoredReadBuilder::Adjacent(builder) => builder.build(), - VectoredReadBuilder::Chunked(builder) => builder.build(), - } - } - - pub(crate) fn size(&self) -> usize { - match self { - VectoredReadBuilder::Adjacent(builder) => builder.size(), - VectoredReadBuilder::Chunked(builder) => builder.size(), - } - } -} - -pub(crate) struct AdjacentVectoredReadBuilder { - /// Start offset of the read. - start: u64, - // End offset of the read. - end: u64, - /// Start offset and metadata for each blob in this read - blobs_at: VecMap, - max_read_size: Option, -} - -impl AdjacentVectoredReadBuilder { - /// Start building a new vectored read. - /// - /// Note that by design, this does not check against reading more than `max_read_size` to - /// support reading larger blobs than the configuration value. The builder will be single use - /// however after that. - pub(crate) fn new( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - max_read_size: Option, - ) -> Self { - let mut blobs_at = VecMap::default(); - blobs_at - .append(start_offset, meta) - .expect("First insertion always succeeds"); - - Self { - start: start_offset, - end: end_offset, - blobs_at, - max_read_size, - } - } - /// Attempt to extend the current read with a new blob if the start - /// offset matches with the current end of the vectored read - /// and the resuting size is below the max read size - pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { - tracing::trace!(start, end, "trying to extend"); - let size = (end - start) as usize; - let not_limited_by_max_read_size = { - if let Some(max_read_size) = self.max_read_size { - self.size() + size <= max_read_size - } else { - true - } - }; - - if self.end == start && not_limited_by_max_read_size { - self.end = end; - self.blobs_at - .append(start, meta) - .expect("LSNs are ordered within vectored reads"); - - return VectoredReadExtended::Yes; - } - - VectoredReadExtended::No - } - - pub(crate) fn size(&self) -> usize { - (self.end - self.start) as usize - } - - pub(crate) fn build(self) -> VectoredRead { - VectoredRead { - start: self.start, - end: self.end, - blobs_at: self.blobs_at, - } - } -} - +/// A vectored read builder that tries to coalesce all reads that fits in a chunk. pub(crate) struct ChunkedVectoredReadBuilder { /// Start block number start_blk_no: usize, @@ -373,7 +209,7 @@ impl ChunkedVectoredReadBuilder { /// Note that by design, this does not check against reading more than `max_read_size` to /// support reading larger blobs than the configuration value. The builder will be single use /// however after that. - pub(crate) fn new( + fn new_impl( start_offset: u64, end_offset: u64, meta: BlobMeta, @@ -396,6 +232,25 @@ impl ChunkedVectoredReadBuilder { } } + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + align: usize, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align) + } + + pub(crate) fn new_streaming( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + align: usize, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, None, align) + } + /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. /// /// The resulting size also must be below the max read size. @@ -474,17 +329,17 @@ pub struct VectoredReadPlanner { max_read_size: usize, - mode: VectoredReadCoalesceMode, + align: usize, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { - let mode = VectoredReadCoalesceMode::get(); + let align = virtual_file::get_io_buffer_alignment(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, - mode, + align, } } @@ -545,7 +400,7 @@ impl VectoredReadPlanner { } pub fn finish(self) -> Vec { - let mut current_read_builder: Option = None; + let mut current_read_builder: Option = None; let mut reads = Vec::new(); for (key, blobs_for_key) in self.blobs { @@ -558,12 +413,12 @@ impl VectoredReadPlanner { }; if extended == VectoredReadExtended::No { - let next_read_builder = VectoredReadBuilder::new( + let next_read_builder = ChunkedVectoredReadBuilder::new( start_offset, end_offset, BlobMeta { key, lsn }, self.max_read_size, - self.mode, + self.align, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -688,7 +543,7 @@ impl<'a> VectoredBlobReader<'a> { /// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { - read_builder: Option, + read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64)>, /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, @@ -699,21 +554,21 @@ pub struct StreamingVectoredReadPlanner { /// Size of the current batch cnt: usize, - mode: VectoredReadCoalesceMode, + align: usize, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); - let mode = VectoredReadCoalesceMode::get(); + let align = virtual_file::get_io_buffer_alignment(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, - mode, + align, } } @@ -762,11 +617,11 @@ impl StreamingVectoredReadPlanner { } None => { self.read_builder = { - Some(VectoredReadBuilder::new_streaming( + Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, BlobMeta { key, lsn }, - self.mode, + self.align, )) }; } @@ -1092,7 +947,7 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); - let mode = VectoredReadCoalesceMode::get(); + let align = virtual_file::get_io_buffer_alignment(); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -1104,7 +959,8 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode); + let read_builder = + ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 57856eea80..5b7b279888 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -466,6 +466,7 @@ impl VirtualFile { &[] }; utils::crashsafe::overwrite(&final_path, &tmp_path, content) + .maybe_fatal_err("crashsafe_overwrite") }) .await .expect("blocking task is never aborted") @@ -475,7 +476,7 @@ impl VirtualFile { pub async fn sync_all(&self) -> Result<(), Error> { with_file!(self, StorageIoOperation::Fsync, |file_guard| { let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; - res + res.maybe_fatal_err("sync_all") }) } @@ -483,7 +484,7 @@ impl VirtualFile { pub async fn sync_data(&self) -> Result<(), Error> { with_file!(self, StorageIoOperation::Fsync, |file_guard| { let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; - res + res.maybe_fatal_err("sync_data") }) } @@ -1147,7 +1148,9 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) panic!("virtual_file::init called twice"); } if set_io_buffer_alignment(io_buffer_alignment).is_err() { - panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two"); + panic!( + "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}" + ); } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); @@ -1174,14 +1177,16 @@ fn get_open_files() -> &'static OpenFiles { static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); -/// Returns true if `x` is zero or a power of two. -fn is_zero_or_power_of_two(x: usize) -> bool { - (x == 0) || ((x & (x - 1)) == 0) +/// Returns true if the alignment is a power of two and is greater or equal to 512. +fn is_valid_io_buffer_alignment(align: usize) -> bool { + align.is_power_of_two() && align >= 512 } +/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is +/// not a power of two or less than 512 bytes. #[allow(unused)] pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { - if is_zero_or_power_of_two(align) { + if is_valid_io_buffer_alignment(align) { IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); Ok(()) } else { @@ -1189,19 +1194,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { } } -/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified. +/// Gets the io buffer alignment. /// -/// This function should be used to check the raw config value. -pub(crate) fn get_io_buffer_alignment_raw() -> usize { +/// This function should be used for getting the actual alignment value to use. +pub(crate) fn get_io_buffer_alignment() -> usize { let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); if cfg!(test) { let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; if let Some(test_align) = utils::env::var(env_var_name) { - if is_zero_or_power_of_two(test_align) { + if is_valid_io_buffer_alignment(test_align) { test_align } else { - panic!("IO buffer alignment ({test_align}) is not a power of two"); + panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}"); } } else { align @@ -1211,14 +1216,6 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize { } } -/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero. -/// -/// This function should be used for getting the actual alignment value to use. -pub(crate) fn get_io_buffer_alignment() -> usize { - let align = get_io_buffer_alignment_raw(); - align.max(1) -} - #[cfg(test)] mod tests { use crate::context::DownloadBehavior; diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index ddc8155ff3..f1229b2d73 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -25,7 +25,18 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql +DATA = \ + neon--1.0.sql \ + neon--1.0--1.1.sql \ + neon--1.1--1.2.sql \ + neon--1.2--1.3.sql \ + neon--1.3--1.4.sql \ + neon--1.4--1.5.sql \ + neon--1.5--1.4.sql \ + neon--1.4--1.3.sql \ + neon--1.3--1.2.sql \ + neon--1.2--1.1.sql \ + neon--1.1--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 4d0d06e6de..bb65a11c7d 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1473,11 +1473,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, { NeonWALReadResult res; - res = NeonWALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id()); +#if PG_MAJORVERSION_NUM >= 17 + if (!sk->wp->config->syncSafekeepers) + { + Size rbytes; + rbytes = WALReadFromBuffers(buf, startptr, count, + walprop_pg_get_timeline_id()); + + startptr += rbytes; + count -= rbytes; + } +#endif + + if (count == 0) + { + res = NEON_WALREAD_SUCCESS; + } + else + { + Assert(count > 0); + + /* Now read the remaining WAL from the WAL file */ + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + } if (res == NEON_WALREAD_SUCCESS) { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 501ce050e0..04e0f9d4f5 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -24,6 +24,7 @@ bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap.workspace = true +compute_api.workspace = true consumption_metrics.workspace = true dashmap.workspace = true env_logger.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4e9f4591ad..5dbfa5cc09 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -80,6 +80,14 @@ pub(crate) trait TestBackend: Send + Sync + 'static { fn get_allowed_ips_and_secret( &self, ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; + fn dyn_clone(&self) -> Box; +} + +#[cfg(test)] +impl Clone for Box { + fn clone(&self) -> Self { + TestBackend::dyn_clone(&**self) + } } impl std::fmt::Display for Backend<'_, (), ()> { @@ -585,6 +593,14 @@ mod tests { )) } + async fn get_endpoint_jwks( + &self, + _ctx: &RequestMonitoring, + _endpoint: crate::EndpointId, + ) -> anyhow::Result> { + unimplemented!() + } + async fn wake_compute( &self, _ctx: &RequestMonitoring, diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 94e5999a5f..ab848551a9 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -12,7 +12,10 @@ use serde::{Deserialize, Deserializer}; use signature::Verifier; use tokio::time::Instant; -use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName}; +use crate::{ + context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, + RoleName, +}; // TODO(conrad): make these configurable. const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -27,7 +30,6 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - role_name: RoleName, ) -> impl Future>> + Send; } @@ -35,10 +37,11 @@ pub(crate) struct AuthRule { pub(crate) id: String, pub(crate) jwks_url: url::Url, pub(crate) audience: Option, + pub(crate) role_names: Vec, } #[derive(Default)] -pub(crate) struct JwkCache { +pub struct JwkCache { client: reqwest::Client, map: DashMap<(EndpointId, RoleName), Arc>, @@ -54,18 +57,28 @@ pub(crate) struct JwkCacheEntry { } impl JwkCacheEntry { - fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> { - self.key_sets.values().find_map(|key_set| { - key_set - .find_key(key_id) - .map(|jwk| (jwk, key_set.audience.as_deref())) - }) + fn find_jwk_and_audience( + &self, + key_id: &str, + role_name: &RoleName, + ) -> Option<(&jose_jwk::Jwk, Option<&str>)> { + self.key_sets + .values() + // make sure our requested role has access to the key set + .filter(|key_set| key_set.role_names.iter().any(|role| **role == **role_name)) + // try and find the requested key-id in the key set + .find_map(|key_set| { + key_set + .find_key(key_id) + .map(|jwk| (jwk, key_set.audience.as_deref())) + }) } } struct KeySet { jwks: jose_jwk::JwkSet, audience: Option, + role_names: Vec, } impl KeySet { @@ -106,7 +119,6 @@ impl JwkCacheEntryLock { ctx: &RequestMonitoring, client: &reqwest::Client, endpoint: EndpointId, - role_name: RoleName, auth_rules: &F, ) -> anyhow::Result> { // double check that no one beat us to updating the cache. @@ -119,11 +131,10 @@ impl JwkCacheEntryLock { } } - let rules = auth_rules - .fetch_auth_rules(ctx, endpoint, role_name) - .await?; + let rules = auth_rules.fetch_auth_rules(ctx, endpoint).await?; let mut key_sets = ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new()); + // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) for rule in rules { @@ -151,6 +162,7 @@ impl JwkCacheEntryLock { KeySet { jwks, audience: rule.audience, + role_names: rule.role_names, }, ); } @@ -173,7 +185,6 @@ impl JwkCacheEntryLock { ctx: &RequestMonitoring, client: &reqwest::Client, endpoint: EndpointId, - role_name: RoleName, fetch: &F, ) -> Result, anyhow::Error> { let now = Instant::now(); @@ -183,9 +194,7 @@ impl JwkCacheEntryLock { let Some(cached) = guard else { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; - return self - .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) - .await; + return self.renew_jwks(permit, ctx, client, endpoint, fetch).await; }; let last_update = now.duration_since(cached.last_retrieved); @@ -196,9 +205,7 @@ impl JwkCacheEntryLock { let permit = self.acquire_permit().await; // it's been too long since we checked the keys. wait for them to update. - return self - .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) - .await; + return self.renew_jwks(permit, ctx, client, endpoint, fetch).await; } // every 5 minutes we should spawn a job to eagerly update the token. @@ -212,7 +219,7 @@ impl JwkCacheEntryLock { let ctx = ctx.clone(); tokio::spawn(async move { if let Err(e) = entry - .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch) + .renew_jwks(permit, &ctx, &client, endpoint, &fetch) .await { tracing::warn!(error=?e, "could not fetch JWKs in background job"); @@ -232,7 +239,7 @@ impl JwkCacheEntryLock { jwt: &str, client: &reqwest::Client, endpoint: EndpointId, - role_name: RoleName, + role_name: &RoleName, fetch: &F, ) -> Result<(), anyhow::Error> { // JWT compact form is defined to be @@ -254,30 +261,26 @@ impl JwkCacheEntryLock { let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; - ensure!(header.typ == "JWT"); + ensure!( + header.typ == "JWT", + "Provided authentication token is not a valid JWT encoding" + ); let kid = header.key_id.context("missing key id")?; let mut guard = self - .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch) + .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) .await?; // get the key from the JWKs if possible. If not, wait for the keys to update. let (jwk, expected_audience) = loop { - match guard.find_jwk_and_audience(kid) { + match guard.find_jwk_and_audience(kid, role_name) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; guard = self - .renew_jwks( - permit, - ctx, - client, - endpoint.clone(), - role_name.clone(), - fetch, - ) + .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } _ => { @@ -320,11 +323,14 @@ impl JwkCacheEntryLock { let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY); + ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); } if let Some(nbf) = payload.not_before { - ensure!(nbf < now + CLOCK_SKEW_LEEWAY); + ensure!( + nbf < now + CLOCK_SKEW_LEEWAY, + "JWT token is not yet ready to use" + ); } Ok(()) @@ -336,7 +342,7 @@ impl JwkCache { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - role_name: RoleName, + role_name: &RoleName, fetch: &F, jwt: &str, ) -> Result<(), anyhow::Error> { @@ -572,7 +578,7 @@ mod tests { format!("{header}.{body}") } - fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String { + fn new_ec_jwt(kid: String, key: &p256::SecretKey) -> String { use p256::ecdsa::{Signature, SigningKey}; let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); @@ -660,11 +666,6 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let (ec1, jwk3) = new_ec_jwk("3".into()); let (ec2, jwk4) = new_ec_jwk("4".into()); - let jwt1 = new_rsa_jwt("1".into(), rs1); - let jwt2 = new_rsa_jwt("2".into(), rs2); - let jwt3 = new_ec_jwt("3".into(), ec1); - let jwt4 = new_ec_jwt("4".into(), ec2); - let foo_jwks = jose_jwk::JwkSet { keys: vec![jwk1, jwk3], }; @@ -706,47 +707,98 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL let client = reqwest::Client::new(); #[derive(Clone)] - struct Fetch(SocketAddr); + struct Fetch(SocketAddr, Vec); impl FetchAuthRules for Fetch { async fn fetch_auth_rules( &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - _role_name: RoleName, ) -> anyhow::Result> { Ok(vec![ AuthRule { id: "foo".to_owned(), jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), audience: None, + role_names: self.1.clone(), }, AuthRule { id: "bar".to_owned(), jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), audience: None, + role_names: self.1.clone(), }, ]) } } - let role_name = RoleName::from("user"); + let role_name1 = RoleName::from("anonymous"); + let role_name2 = RoleName::from("authenticated"); + + let fetch = Fetch( + addr, + vec![ + RoleNameInt::from(&role_name1), + RoleNameInt::from(&role_name2), + ], + ); + let endpoint = EndpointId::from("ep"); let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - for token in [jwt1, jwt2, jwt3, jwt4] { - jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &token, - &client, - endpoint.clone(), - role_name.clone(), - &Fetch(addr), - ) - .await - .unwrap(); + let jwt1 = new_rsa_jwt("1".into(), rs1); + let jwt2 = new_rsa_jwt("2".into(), rs2); + let jwt3 = new_ec_jwt("3".into(), &ec1); + let jwt4 = new_ec_jwt("4".into(), &ec2); + + // had the wrong kid, therefore will have the wrong ecdsa signature + let bad_jwt = new_ec_jwt("3".into(), &ec2); + // this role_name is not accepted + let bad_role_name = RoleName::from("cloud_admin"); + + let err = jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &bad_jwt, + &client, + endpoint.clone(), + &role_name1, + &fetch, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("signature error")); + + let err = jwk_cache + .check_jwt( + &RequestMonitoring::test(), + &jwt1, + &client, + endpoint.clone(), + &bad_role_name, + &fetch, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("jwk not found")); + + let tokens = [jwt1, jwt2, jwt3, jwt4]; + let role_names = [role_name1, role_name2]; + for role in &role_names { + for token in &tokens { + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + token, + &client, + endpoint.clone(), + role, + &fetch, + ) + .await + .unwrap(); + } } } } diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 2ff2ca00f0..2ab53f2c6a 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, net::SocketAddr}; +use std::net::SocketAddr; use anyhow::Context; use arc_swap::ArcSwapOption; @@ -10,8 +10,8 @@ use crate::{ NodeInfo, }, context::RequestMonitoring, - intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag}, - EndpointId, RoleName, + intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, + EndpointId, }; use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; @@ -48,26 +48,17 @@ impl LocalBackend { #[derive(Clone, Copy)] pub(crate) struct StaticAuthRules; -pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); - -#[derive(Debug, Clone)] -pub struct JwksRoleSettings { - pub roles: HashMap, - pub project_id: ProjectIdInt, - pub branch_id: BranchIdInt, -} +pub static JWKS_ROLE_MAP: ArcSwapOption = ArcSwapOption::const_empty(); impl FetchAuthRules for StaticAuthRules { async fn fetch_auth_rules( &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - role_name: RoleName, ) -> anyhow::Result> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .and_then(|m| m.roles.get(&role_name)) .context("JWKs settings for this role were not configured")?; let mut rules = vec![]; for setting in &role_mappings.jwks { @@ -75,6 +66,7 @@ impl FetchAuthRules for StaticAuthRules { id: setting.id.clone(), jwks_url: setting.jwks_url.clone(), audience: setting.jwt_audience.clone(), + role_names: setting.role_names.clone(), }); } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 94365ddf05..1b3f465686 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,34 +1,35 @@ -use std::{ - net::SocketAddr, - path::{Path, PathBuf}, - pin::pin, - sync::Arc, - time::Duration, -}; +use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; -use anyhow::{bail, ensure}; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use compute_api::spec::LocalProxySpec; use dashmap::DashMap; -use futures::{future::Either, FutureExt}; +use futures::future::Either; use proxy::{ - auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP}, + auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}, cancellation::CancellationHandlerMain, config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - console::{locks::ApiLocks, messages::JwksRoleMapping}, + console::{ + locks::ApiLocks, + messages::{EndpointJwksResponse, JwksSettings}, + }, http::health_server::AppMetrics, + intern::RoleNameInt, metrics::{Metrics, ThreadPoolMetrics}, rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, scram::threadpool::ThreadPool, serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, + RoleName, }; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, task::JoinSet}; +use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -72,9 +73,12 @@ struct LocalProxyCliArgs { /// Address of the postgres server #[clap(long, default_value = "127.0.0.1:5432")] compute: SocketAddr, - /// File address of the local proxy config file + /// Path of the local proxy config file #[clap(long, default_value = "./localproxy.json")] - config_path: PathBuf, + config_path: Utf8PathBuf, + /// Path of the local proxy PID file + #[clap(long, default_value = "./localproxy.pid")] + pid_path: Utf8PathBuf, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -126,6 +130,24 @@ async fn main() -> anyhow::Result<()> { let args = LocalProxyCliArgs::parse(); let config = build_config(&args)?; + // before we bind to any ports, write the process ID to a file + // so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. + // + // This also claims a "lock" that makes sure only one instance + // of local-proxy runs at a time. + let _process_guard = loop { + match pid_file::claim_for_current_process(&args.pid_path) { + Ok(guard) => break guard, + Err(e) => { + // compute-ctl might have tried to read the pid-file to let us + // know about some config change. We should try again. + error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + }; + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; let http_listener = TcpListener::bind(args.http).await?; let shutdown = CancellationToken::new(); @@ -139,12 +161,30 @@ async fn main() -> anyhow::Result<()> { 16, )); - refresh_config(args.config_path.clone()).await; + // write the process ID to a file so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. + let pid = std::process::id(); + info!("process running in PID {pid}"); + std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?; let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || { - refresh_config(args.config_path.clone()).map(Ok) + + let refresh_config_notify = Arc::new(Notify::new()); + maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), { + let refresh_config_notify = Arc::clone(&refresh_config_notify); + move || { + refresh_config_notify.notify_one(); + } })); + + // trigger the first config load **after** setting up the signal hook + // to avoid the race condition where: + // 1. No config file registered when local-proxy starts up + // 2. The config file is written but the signal hook is not yet received + // 3. local-proxy completes startup but has no config loaded, despite there being a registerd config. + refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + maintenance_tasks.spawn(proxy::http::health_server::task_main( metrics_listener, AppMetrics { @@ -245,81 +285,84 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }))) } -async fn refresh_config(path: PathBuf) { - match refresh_config_inner(&path).await { - Ok(()) => {} - Err(e) => { - error!(error=?e, ?path, "could not read config file"); +async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { + loop { + rx.notified().await; + + match refresh_config_inner(&path).await { + Ok(()) => {} + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } } } } -async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> { +async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> { let bytes = tokio::fs::read(&path).await?; - let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - let mut settings = None; + let mut jwks_set = vec![]; - for mapping in data.roles.values_mut() { - for jwks in &mut mapping.jwks { - ensure!( - jwks.jwks_url.has_authority() - && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); + for jwks in data.jwks { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - ensure!( - jwks.jwks_url - .host() - .is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); - // clear username, password and ports - jwks.jwks_url.set_username("").expect( + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( "url can be a base and has a valid host and is not a file. should not error", ); - jwks.jwks_url.set_password(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks.jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks.jwks_url.set_fragment(None); - jwks.jwks_url.query_pairs_mut().clear().finish(); - - if jwks.jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. - if cfg!(not(feature = "testing")) { - jwks.jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id)); - ensure!( - *pr == jwks.project_id, - "inconsistent project IDs configured" - ); - ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured"); } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. + if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + jwks_set.push(JwksSettings { + id: jwks.id, + jwks_url, + provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) } - if let Some((project_id, branch_id)) = settings { - JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings { - roles: data.roles, - project_id, - branch_id, - }))); - } + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); Ok(()) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 20d2d3df9a..53f1586abe 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -133,9 +133,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async { - Ok(()) - })); + let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 2ac66ffe8c..141005788d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -461,10 +461,7 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals( - cancellation_token.clone(), - || async { Ok(()) }, - )); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {})); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 85683acb82..1696e229ce 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,13 +1,11 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; use std::fmt::{self, Display}; use crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; -use crate::RoleName; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. @@ -348,11 +346,6 @@ impl ColdStartInfo { } } -#[derive(Debug, Deserialize, Clone)] -pub struct JwksRoleMapping { - pub roles: HashMap, -} - #[derive(Debug, Deserialize, Clone)] pub struct EndpointJwksResponse { pub jwks: Vec, @@ -361,11 +354,10 @@ pub struct EndpointJwksResponse { #[derive(Debug, Deserialize, Clone)] pub struct JwksSettings { pub id: String, - pub project_id: ProjectIdInt, - pub branch_id: BranchIdInt, pub jwks_url: url::Url, pub provider_name: String, pub jwt_audience: Option, + pub role_names: Vec, } #[cfg(test)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 16e8da605b..95097f2de9 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -5,7 +5,10 @@ pub mod neon; use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ - backend::{ComputeCredentialKeys, ComputeUserInfo}, + backend::{ + jwt::{AuthRule, FetchAuthRules}, + ComputeCredentialKeys, ComputeUserInfo, + }, IpPattern, }, cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, @@ -16,7 +19,7 @@ use crate::{ intern::ProjectIdInt, metrics::ApiLockMetrics, rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, + scram, EndpointCacheKey, EndpointId, }; use dashmap::DashMap; use std::{hash::Hash, sync::Arc, time::Duration}; @@ -334,6 +337,12 @@ pub(crate) trait Api { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result>; + /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, @@ -343,6 +352,7 @@ pub(crate) trait Api { } #[non_exhaustive] +#[derive(Clone)] pub enum ConsoleBackend { /// Current Cloud API (V2). Console(neon::Api), @@ -386,6 +396,20 @@ impl Api for ConsoleBackend { } } + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + match self { + Self::Console(api) => api.get_endpoint_jwks(ctx, endpoint).await, + #[cfg(any(test, feature = "testing"))] + Self::Postgres(api) => api.get_endpoint_jwks(ctx, endpoint).await, + #[cfg(test)] + Self::Test(_api) => Ok(vec![]), + } + } + async fn wake_compute( &self, ctx: &RequestMonitoring, @@ -552,3 +576,13 @@ impl WakeComputePermit { res } } + +impl FetchAuthRules for ConsoleBackend { + async fn fetch_auth_rules( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + self.get_endpoint_jwks(ctx, endpoint).await + } +} diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 1b77418de6..b548a0203a 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,7 +4,9 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::context::RequestMonitoring; +use crate::{ + auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, +}; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; use crate::{ @@ -118,6 +120,39 @@ impl Api { }) } + async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; + + let connection = tokio::spawn(connection); + + let res = client.query( + "select id, jwks_url, audience, role_names from neon_control_plane.endpoint_jwks where endpoint_id = $1", + &[&endpoint.as_str()], + ) + .await?; + + let mut rows = vec![]; + for row in res { + rows.push(AuthRule { + id: row.get("id"), + jwks_url: url::Url::parse(row.get("jwks_url"))?, + audience: row.get("audience"), + role_names: row + .get::<_, Vec>("role_names") + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }); + } + + drop(client); + connection.await??; + + Ok(rows) + } + async fn do_wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config @@ -185,6 +220,14 @@ impl super::Api for Api { )) } + async fn get_endpoint_jwks( + &self, + _ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + self.do_get_endpoint_jwks(endpoint).await + } + #[tracing::instrument(skip_all)] async fn wake_compute( &self, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index b004bf4ecf..2d527f378c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,27 +7,33 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, + auth::backend::{jwt::AuthRule, ComputeUserInfo}, compute, - console::messages::{ColdStartInfo, Reason}, + console::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, + scram, EndpointCacheKey, EndpointId, }; use crate::{cache::Cached, context::RequestMonitoring}; +use ::http::{header::AUTHORIZATION, HeaderName}; +use anyhow::bail; use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, error, info, info_span, warn, Instrument}; +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub(crate) locks: &'static ApiLocks, pub(crate) wake_compute_endpoint_rate_limiter: Arc, - jwt: String, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, } impl Api { @@ -38,7 +44,9 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") + .unwrap_or_default() + .into(); Self { endpoint, caches, @@ -71,9 +79,9 @@ impl Api { async { let request = self .endpoint - .get("proxy_get_role_secret") - .header("X-Request-ID", &request_id) - .header("Authorization", format!("Bearer {}", &self.jwt)) + .get_path("proxy_get_role_secret") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) .query(&[ ("application_name", application_name.as_str()), @@ -125,6 +133,61 @@ impl Api { .await } + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + .await + { + bail!("endpoint not found"); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build()?; + + info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .map_err(crate::error::log_error) + .instrument(info_span!("http", id = request_id)) + .await + } + async fn do_wake_compute( &self, ctx: &RequestMonitoring, @@ -135,7 +198,7 @@ impl Api { async { let mut request_builder = self .endpoint - .get("proxy_wake_compute") + .get_path("proxy_wake_compute") .header("X-Request-ID", &request_id) .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) @@ -262,6 +325,15 @@ impl super::Api for Api { )) } + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> anyhow::Result> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + #[tracing::instrument(skip_all)] async fn wake_compute( &self, diff --git a/proxy/src/http.rs b/proxy/src/http.rs index c77d95f47d..14720b5c6b 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -86,9 +86,17 @@ impl Endpoint { /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub(crate) fn get(&self, path: &str) -> RequestBuilder { + pub(crate) fn get_path(&self, path: &str) -> RequestBuilder { + self.get_with_url(|u| { + u.path_segments_mut().push(path); + }) + } + + /// Return a [builder](RequestBuilder) for a `GET` request, + /// accepting a closure to modify the url path segments for more complex paths queries. + pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder { let mut url = self.endpoint.clone(); - url.path_segments_mut().push(path); + f(&mut url); self.client.get(url.into_inner()) } @@ -144,7 +152,7 @@ mod tests { // Validate that this pattern makes sense. let req = endpoint - .get("frobnicate") + .get_path("frobnicate") .query(&[ ("foo", Some("10")), // should be just `foo=10` ("bar", None), // shouldn't be passed at all @@ -162,7 +170,7 @@ mod tests { let endpoint = Endpoint::new(url, Client::new()); let req = endpoint - .get("frobnicate") + .get_path("frobnicate") .query(&[("session_id", uuid::Uuid::nil())]) .build()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e5144cfe2e..108420d7d7 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -130,14 +130,14 @@ impl Default for StringInterner { } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] -pub(crate) struct RoleNameTag; +pub struct RoleNameTag; impl InternId for RoleNameTag { fn get_interner() -> &'static StringInterner { static ROLE_NAMES: OnceLock> = OnceLock::new(); ROLE_NAMES.get_or_init(Default::default) } } -pub(crate) type RoleNameInt = InternedString; +pub type RoleNameInt = InternedString; impl From<&RoleName> for RoleNameInt { fn from(value: &RoleName) -> Self { RoleNameTag::get_interner().get_or_intern(value) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 0070839aa8..ea0a9beced 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -82,7 +82,7 @@ impl_trait_overcaptures, )] -use std::{convert::Infallible, future::Future}; +use std::convert::Infallible; use anyhow::{bail, Context}; use intern::{EndpointIdInt, EndpointIdTag, InternId}; @@ -117,13 +117,12 @@ pub mod usage_metrics; pub mod waiters; /// Handle unix signals appropriately. -pub async fn handle_signals( +pub async fn handle_signals( token: CancellationToken, mut refresh_config: F, ) -> anyhow::Result where - F: FnMut() -> Fut, - Fut: Future>, + F: FnMut(), { use tokio::signal::unix::{signal, SignalKind}; @@ -136,7 +135,7 @@ where // Hangup is commonly used for config reload. _ = hangup.recv() => { warn!("received SIGHUP"); - refresh_config().await?; + refresh_config(); } // Shut down the whole application. _ = interrupt.recv() => { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 752d982726..058ec06e02 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -525,6 +525,10 @@ impl TestBackend for TestConnectMechanism { { unimplemented!("not used in tests") } + + fn dyn_clone(&self) -> Box { + Box::new(self.clone()) + } } fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 2702aeebfe..c027a0cd20 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -43,6 +43,13 @@ impl ThreadPool { pub fn new(n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. + if n_workers == 0 { + return Arc::new(Self { + runtime: None, + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + }); + } + Arc::new_cyclic(|pool| { let pool = pool.clone(); let worker_id = AtomicUsize::new(0); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index aa236907db..607eb0caf6 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -119,7 +119,7 @@ impl PoolingBackend { .check_jwt( ctx, user_info.endpoint.clone(), - user_info.user.clone(), + &user_info.user, &StaticAuthRules, jwt, ) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 5270934f5e..1e5f963a4f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -374,14 +374,16 @@ type JoinTaskRes = Result, JoinError>; async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // fsync the datadir to make sure we have a consistent state on disk. - let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; - let started = Instant::now(); - utils::crashsafe::syncfs(dfd)?; - let elapsed = started.elapsed(); - info!( - elapsed_ms = elapsed.as_millis(), - "syncfs data directory done" - ); + if !conf.no_sync { + let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; + let started = Instant::now(); + utils::crashsafe::syncfs(dfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "syncfs data directory done" + ); + } info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 7bdee35cd7..1a932ef699 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -9,7 +9,7 @@ use crate::walproposer_sim::{ pub mod walproposer_sim; -// Generates 2000 random seeds and runs a schedule for each of them. +// Generates 500 random seeds and runs a schedule for each of them. // If you see this test fail, please report the last seed to the // @safekeeper team. #[test] @@ -17,7 +17,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let clock = init_logger(); let mut config = TestConfig::new(Some(clock)); - for _ in 0..2000 { + for _ in 0..500 { let seed: u64 = rand::thread_rng().gen(); config.network = generate_network_opts(seed); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 201eb1087d..70a038c960 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -642,9 +642,6 @@ class NeonEnvBuilder: patch_script = "" for ps in self.env.pageservers: patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" - # This is a temporary to get the backward compat test happy - # since the compat snapshot was generated with an older version of neon local - patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;" patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines @@ -3314,7 +3311,7 @@ class VanillaPostgres(PgProtocol): self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) + self.pg_bin.run_capture(["initdb", "--pgdata", str(pgdatadir)]) self.configure([f"port = {port}\n"]) def enable_tls(self): diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 96c84d1616..7f170eeea3 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -8,6 +8,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics +from fixtures.utils import wait_until # Walreceiver as returned by sk's timeline status endpoint. @@ -161,6 +162,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): walreceivers=walreceivers, ) + # Get timeline_start_lsn, waiting until it's nonzero. It is a way to ensure + # that the timeline is fully initialized at the safekeeper. + def get_non_zero_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + def timeline_start_lsn_non_zero() -> Lsn: + s = self.timeline_status(tenant_id, timeline_id).timeline_start_lsn + assert s > Lsn(0) + return s + + return wait_until(30, 1, timeline_start_lsn_non_zero) + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index fb5c1d3115..b559be5f18 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -156,9 +156,6 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_ @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") -@skip_on_postgres( - PgVersion.V17, "There are no snapshots yet" -) # TODO: revert this once we have snapshots def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -206,9 +203,6 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") -@skip_on_postgres( - PgVersion.V17, "There are no snapshots yet" -) # TODO: revert this once we have snapshots def test_forward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 5e8b8d38f7..b08fcc0da1 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -27,7 +27,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.extend( [ ".*basebackup .* failed: invalid basebackup lsn.*", - ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected", + ".*/lsn_lease.*invalid lsn lease request.*", ] ) @@ -108,7 +108,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): assert cur.fetchone() == (1,) # Create node at pre-initdb lsn - with pytest.raises(Exception, match="invalid basebackup lsn"): + with pytest.raises(Exception, match="invalid lsn lease request"): # compute node startup with invalid LSN should fail env.endpoints.create_start( branch_name="main", @@ -167,6 +167,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) return last_flush_lsn + def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint): + """ + Trigger GC manually on all pageservers. Then run an `SELECT` query. + """ + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert ( + gc_result["layers_removed"] == 0 + ), "No layers should be removed, old layers are guarded by leases." + + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + # Insert some records on main branch with env.endpoints.create_start("main") as ep_main: with ep_main.cursor() as cur: @@ -193,25 +210,31 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): generate_updates_on_main(env, ep_main, i, end=100) - # Trigger GC - for shard, ps in tenant_get_shards(env, env.initial_tenant): - client = ps.http_client() - gc_result = client.timeline_gc(shard, env.initial_timeline, 0) - log.info(f"{gc_result=}") + trigger_gc_and_select(env, ep_static) - assert ( - gc_result["layers_removed"] == 0 - ), "No layers should be removed, old layers are guarded by leases." + # Trigger Pageserver restarts + for ps in env.pageservers: + ps.stop() + # Static compute should have at least one lease request failure due to connection. + time.sleep(LSN_LEASE_LENGTH / 2) + ps.start() - with ep_static.cursor() as cur: - cur.execute("SELECT count(*) FROM t0") - assert cur.fetchone() == (ROW_COUNT,) + trigger_gc_and_select(env, ep_static) + + # Reconfigure pageservers + env.pageservers[0].stop() + env.storage_controller.node_configure( + env.pageservers[0].id, {"availability": "Offline"} + ) + env.storage_controller.reconcile_until_idle() + + trigger_gc_and_select(env, ep_static) # Do some update so we can increment latest_gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) # Wait for the existing lease to expire. - time.sleep(LSN_LEASE_LENGTH) + time.sleep(LSN_LEASE_LENGTH + 1) # Now trigger GC again, layers should be removed. for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 765c72cf2a..ddfe9b911f 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -45,10 +45,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool tenant_after = http.tenant_status(env.initial_tenant) assert tenant_before != tenant_after gc_blocking = tenant_after["gc_blocking"] - assert ( - gc_blocking - == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }" - ) + assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" wait_for_another_gc_round() pss.assert_log_contains(gc_skipped_line) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8ee548bdb0..c75235a04b 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -772,7 +772,7 @@ class ProposerPostgres(PgProtocol): def initdb(self): """Run initdb""" - args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] + args = ["initdb", "--username", "cloud_admin", "--pgdata", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): @@ -2084,8 +2084,13 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): endpoint.safe_psql("create table t(key int, value text)") - timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id) - timeline_start_lsn = timeline_status.timeline_start_lsn + # Note: currently timelines on sks are created by compute and commit of + # transaction above is finished when 2/3 sks received it, so there is a + # small chance that timeline on this sk is not created/initialized yet, + # hence the usage of waiting function to prevent flakiness. + timeline_start_lsn = ( + env.safekeepers[0].http_client().get_non_zero_timeline_start_lsn(tenant_id, timeline_id) + ) log.info(f"Timeline start LSN: {timeline_start_lsn}") current_percent = 0.0