diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 8e28049888..bdf7c07c6a 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -19,8 +19,8 @@ on: description: 'debug or release' required: true type: string - pg-versions: - description: 'a json array of postgres versions to run regression tests on' + test-cfg: + description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string @@ -276,14 +276,14 @@ jobs: options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false - matrix: - pg_version: ${{ fromJson(inputs.pg-versions) }} + matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} steps: - uses: actions/checkout@v4 with: submodules: true - name: Pytest regression tests + continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }} uses: ./.github/actions/run-python-test-set timeout-minutes: 60 with: @@ -300,6 +300,7 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index acea859b4d..2ad1ee0a42 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -558,12 +558,12 @@ jobs: arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.1-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.5-1.pgdg110+1_${arch}.deb" - dpkg -x libpq5_17.1-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-16_16.5-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb pg + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg110+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg110+1_${arch}.deb" + dpkg -x libpq5_17.2-1.pgdg110+1_${arch}.deb pg + dpkg -x postgresql-16_16.6-1.pgdg110+1_${arch}.deb pg + dpkg -x postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 89fd2d0d17..9830c2a0c9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -253,7 +253,14 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} + # run without LFC on v17 release only + test-cfg: | + ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": 
"without-lfc"}, + {"pg_version":"v15", "lfc_state": "without-lfc"}, + {"pg_version":"v16", "lfc_state": "without-lfc"}, + {"pg_version":"v17", "lfc_state": "without-lfc"}, + {"pg_version":"v17", "lfc_state": "with-lfc"}]' + || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml index 98e394a3c2..2ed044b780 100644 --- a/.github/workflows/report-workflow-stats-batch.yml +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -4,10 +4,12 @@ on: schedule: - cron: '*/15 * * * *' - cron: '25 0 * * *' + - cron: '25 1 * * 6' jobs: - gh-workflow-stats-batch: - name: GitHub Workflow Stats Batch + gh-workflow-stats-batch-2h: + name: GitHub Workflow Stats Batch 2 hours + if: github.event.schedule == '*/15 * * * *' runs-on: ubuntu-22.04 permissions: actions: read @@ -16,14 +18,36 @@ jobs: uses: neondatabase/gh-workflow-stats-action@v0.2.1 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - db_table: "gh_workflow_stats_batch_neon" + db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} duration: '2h' - - name: Export Workflow Run for the past 24 hours - if: github.event.schedule == '25 0 * * *' + + gh-workflow-stats-batch-48h: + name: GitHub Workflow Stats Batch 48 hours + if: github.event.schedule == '25 0 * * *' + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 48 hours uses: neondatabase/gh-workflow-stats-action@v0.2.1 with: db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - db_table: "gh_workflow_stats_batch_neon" + db_table: "gh_workflow_stats_neon" gh_token: ${{ secrets.GITHUB_TOKEN }} - duration: '24h' + duration: '48h' + + gh-workflow-stats-batch-30d: + name: GitHub Workflow Stats Batch 30 days + if: github.event.schedule == '25 1 * * 6' + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 30 days + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '720h' diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml deleted file mode 100644 index 15e446bcd7..0000000000 --- a/.github/workflows/report-workflow-stats.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Report Workflow Stats - -on: - workflow_run: - workflows: - - Add `external` label to issues and PRs created by external users - - Benchmarking - - Build and Test - - Build and Test Locally - - Build build-tools image - - Check Permissions - - Check neon with extra platform builds - - Cloud Regression Test - - Create Release Branch - - Handle `approved-for-ci-run` label - - Lint GitHub Workflows - - Notify Slack channel about upcoming release - - Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region - - Pin build-tools image - - Prepare benchmarking databases by restoring dumps - - Push images to ACR - - Test Postgres client libraries - - Trigger E2E Tests - - cleanup caches by a branch - - Pre-merge checks - types: [completed] - -jobs: - gh-workflow-stats: - name: Github Workflow Stats - runs-on: ubuntu-22.04 - permissions: - actions: read - steps: - - name: Export GH Workflow Stats - uses: neondatabase/gh-workflow-stats-action@v0.1.4 - with: - 
DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} - DB_TABLE: "gh_workflow_stats_neon" - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GH_RUN_ID: ${{ github.event.workflow_run.id }} diff --git a/Cargo.lock b/Cargo.lock index c7af140f7d..98d2e0864a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,6 +46,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned-vec" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e0966165eaf052580bd70eb1b32cb3d6245774c0104d1b2793e9650bf83b52a" +dependencies = [ + "equator", +] + [[package]] name = "allocator-api2" version = "0.2.16" @@ -146,6 +155,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "asn1-rs" version = "0.6.2" @@ -359,6 +374,28 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-sdk-kms" +version = "1.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-s3" version = "1.52.0" @@ -575,9 +612,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" +checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -742,7 +779,7 @@ dependencies = [ "once_cell", "paste", "pin-project", - "quick-xml", + "quick-xml 0.31.0", "rand 0.8.5", "reqwest 0.11.19", "rustc_version", @@ -1220,6 +1257,10 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-sdk-kms", + "aws-sdk-s3", + "base64 0.13.1", "bytes", "camino", "cfg-if", @@ -1237,13 +1278,16 @@ dependencies = [ "opentelemetry", "opentelemetry_sdk", "postgres", + "postgres_initdb", "prometheus", "regex", "remote_storage", "reqwest 0.12.4", "rlimit", "rust-ini", + "serde", "serde_json", + "serde_with", "signal-hook", "tar", "thiserror", @@ -1381,6 +1425,15 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpp_demangle" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96e58d342ad113c2b878f16d5d034c03be492ae460cdbc02b7f0f2284d310c7d" +dependencies = [ + "cfg-if", +] + [[package]] name = "cpufeatures" version = "0.2.9" @@ -1904,6 +1957,26 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equator" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c35da53b5a021d2484a7cc49b2ac7f2d840f8236a286f84202369bd338d761ea" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" +dependencies = [ + "proc-macro2", + "quote", + "syn 
2.0.52", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -2011,6 +2084,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -2089,9 +2174,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2099,9 +2184,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" @@ -2116,9 +2201,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" @@ -2137,9 +2222,9 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -2148,15 +2233,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -2166,9 +2251,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -2714,6 +2799,24 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" +[[package]] +name = "inferno" +version = "0.11.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" +dependencies = [ + "ahash", + "indexmap 2.0.1", + "is-terminal", + "itoa", + "log", + 
"num-format", + "once_cell", + "quick-xml 0.26.0", + "rgb", + "str_stack", +] + [[package]] name = "inotify" version = "0.9.6" @@ -2764,9 +2867,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.9.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is-terminal" @@ -3053,6 +3156,15 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "memmap2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.7.1" @@ -3278,6 +3390,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + [[package]] name = "num-integer" version = "0.1.45" @@ -3619,6 +3741,7 @@ dependencies = [ "num_cpus", "once_cell", "pageserver_api", + "pageserver_client", "pageserver_compaction", "pin-project-lite", "postgres", @@ -3627,6 +3750,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "postgres_ffi", + "postgres_initdb", "pq_proto", "procfs", "rand 0.8.5", @@ -4058,7 +4182,7 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4102,12 +4226,48 @@ dependencies = [ "utils", ] +[[package]] +name = "postgres_initdb" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "thiserror", + "tokio", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebbe2f8898beba44815fdc9e5a4ae9c929e21c5dc29b0c774a15555f7f58d6d0" +dependencies = [ + "aligned-vec", + "backtrace", + "cfg-if", + "criterion", + "findshlibs", + "inferno", + "libc", + "log", + "nix 0.26.4", + "once_cell", + "parking_lot 0.12.1", + "protobuf", + "protobuf-codegen-pure", + "smallvec", + "symbolic-demangle", + "tempfile", + "thiserror", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -4260,6 +4420,31 @@ dependencies = [ "prost", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + +[[package]] +name = "protobuf-codegen" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protobuf-codegen-pure" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865" +dependencies = 
[ + "protobuf", + "protobuf-codegen", +] + [[package]] name = "proxy" version = "0.1.0" @@ -4333,7 +4518,7 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", @@ -4371,6 +4556,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "quick-xml" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd" +dependencies = [ + "memchr", +] + [[package]] name = "quick-xml" version = "0.31.0" @@ -4853,6 +5047,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "rgb" +version = "0.8.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.17.6" @@ -5028,9 +5231,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f" dependencies = [ "log", "once_cell", @@ -5166,6 +5369,7 @@ dependencies = [ "postgres-protocol", "postgres_backend", "postgres_ffi", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -5712,6 +5916,12 @@ dependencies = [ "der 0.7.8", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -5738,7 +5948,7 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "prost", - "rustls 0.23.16", + "rustls 0.23.18", "tokio", "tonic", "tonic-build", @@ -5821,7 +6031,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -5858,6 +6068,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + [[package]] name = "stringprep" version = "0.1.2" @@ -5905,6 +6121,29 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" +[[package]] +name = "symbolic-common" +version = "12.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "366f1b4c6baf6cfefc234bbd4899535fca0b06c74443039a73f6dfb2fad88d77" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba05ba5b9962ea5617baf556293720a8b2d0a282aa14ee4bf10e22efc7da8c8" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + [[package]] name = "syn" version = "1.0.109" @@ -6254,7 +6493,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.16", + "rustls 0.23.18", "tokio", "tokio-postgres", "tokio-rustls 0.26.0", @@ -6288,7 +6527,7 @@ version = "0.26.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.16", + "rustls 0.23.18", "rustls-pki-types", "tokio", ] @@ -6697,7 +6936,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.16", + "rustls 0.23.18", "rustls-pki-types", "url", "webpki-roots 0.26.1", @@ -6772,6 +7011,7 @@ dependencies = [ "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", @@ -7306,6 +7546,7 @@ dependencies = [ "anyhow", "axum", "axum-core", + "base64 0.13.1", "base64 0.21.1", "base64ct", "bytes", @@ -7340,6 +7581,7 @@ dependencies = [ "libc", "log", "memchr", + "nix 0.26.4", "nom", "num-bigint", "num-integer", @@ -7356,7 +7598,7 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", - "rustls 0.23.16", + "rustls 0.23.18", "scopeguard", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index dbda930535..e3dc5b97f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ members = [ "libs/vm_monitor", "libs/walproposer", "libs/wal_decoder", + "libs/postgres_initdb", ] [workspace.package] @@ -57,6 +58,7 @@ async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } aws-sdk-s3 = "1.52" aws-sdk-iam = "1.46.0" +aws-sdk-kms = "1.47.0" aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.2" aws-credential-types = "1.2.0" @@ -73,7 +75,7 @@ bytes = "1.0" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } -clap = { version = "4.0", features = ["derive"] } +clap = { version = "4.0", features = ["derive", "env"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" @@ -106,7 +108,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" -ipnet = "2.9.0" +ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" jsonwebtoken = "9" @@ -130,6 +132,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" +pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -153,7 +156,7 @@ sentry = { version = "0.32", default-features = false, features = ["backtrace", serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" -serde_with = "2.0" +serde_with = { version = "2.0", features = [ "base64" ] } serde_assert = "0.5.0" sha2 = "0.10.2" signal-hook = "0.3" @@ -212,12 +215,14 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } +pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = 
"./libs/postgres_ffi/" } +postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } diff --git a/README.md b/README.md index e68ef70bdf..1417d6b9e7 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. +Python (3.11 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory. #### Running neon database diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index c1190b13f4..24e5bbf46f 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -234,7 +234,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.19 \ +ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 32405ece86..7c21c67a0a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1243,7 +1243,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Compile and run the Neon-specific `compute_ctl` binary +# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools @@ -1264,6 +1264,7 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de FROM debian:$DEBIAN_FLAVOR AS compute-tools-image COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import ######################################################################################### # @@ -1458,6 +1459,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import # pgbouncer and its config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer @@ -1533,6 +1535,25 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 +# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 +# used by fast_import +ARG TARGETARCH +ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb +RUN set -ex; \ + \ + # Determine the expected checksum based 
on TARGETARCH + if [ "${TARGETARCH}" = "amd64" ]; then \ + CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ + elif [ "${TARGETARCH}" = "arm64" ]; then \ + CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ + else \ + echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ + fi; \ + \ + # Compute and validate the checksum + echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - +RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0bf4ed53d6..c0c390caef 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,10 @@ default = [] testing = [] [dependencies] +base64.workspace = true +aws-config.workspace = true +aws-sdk-s3.workspace = true +aws-sdk-kms.workspace = true anyhow.workspace = true camino.workspace = true chrono.workspace = true @@ -27,6 +31,8 @@ opentelemetry.workspace = true opentelemetry_sdk.workspace = true postgres.workspace = true regex.workspace = true +serde.workspace = true +serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true @@ -43,6 +49,7 @@ thiserror.workspace = true url.workspace = true prometheus.workspace = true +postgres_initdb.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 284db005c8..4689cc2b83 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -105,6 +105,11 @@ fn main() -> Result<()> { fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + opentelemetry::global::set_error_handler(|err| { + tracing::info!("OpenTelemetry error: {err}"); + }) + .expect("global error handler lock poisoned"); + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs new file mode 100644 index 0000000000..3b0b990df2 --- /dev/null +++ b/compute_tools/src/bin/fast_import.rs @@ -0,0 +1,338 @@ +//! This program dumps a remote Postgres database into a local Postgres database +//! and uploads the resulting PGDATA into object storage for import into a Timeline. +//! +//! # Context, Architecture, Design +//! +//! See cloud.git Fast Imports RFC () +//! for the full picture. +//! The RFC describing the storage pieces of importing the PGDATA dump into a Timeline +//! is publicly accessible at . +//! +//! # This is a Prototype! +//! +//! This program is part of a prototype feature and not yet used in production. +//! +//! The cloud.git RFC contains lots of suggestions for improving e2e throughput +//! of this step of the timeline import process. +//! +//! # Local Testing +//! +//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Build the image with the following command: +//! +//! ```bash +//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com +//! docker push localhost:3030/localregistry/compute-node-v14:latest +//! 
``` + +use anyhow::Context; +use aws_config::BehaviorVersion; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use nix::unistd::Pid; +use tracing::{info, info_span, warn, Instrument}; +use utils::fs_ext::is_directory_empty; + +#[path = "fast_import/child_stdio_to_log.rs"] +mod child_stdio_to_log; +#[path = "fast_import/s3_uri.rs"] +mod s3_uri; +#[path = "fast_import/s5cmd.rs"] +mod s5cmd; + +#[derive(clap::Parser)] +struct Args { + #[clap(long)] + working_directory: Utf8PathBuf, + #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] + s3_prefix: s3_uri::S3Uri, + #[clap(long)] + pg_bin_dir: Utf8PathBuf, + #[clap(long)] + pg_lib_dir: Utf8PathBuf, +} + +#[serde_with::serde_as] +#[derive(serde::Deserialize)] +struct Spec { + encryption_secret: EncryptionSecret, + #[serde_as(as = "serde_with::base64::Base64")] + source_connstring_ciphertext_base64: Vec, +} + +#[derive(serde::Deserialize)] +enum EncryptionSecret { + #[allow(clippy::upper_case_acronyms)] + KMS { key_id: String }, +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Plain, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let Args { + working_directory, + s3_prefix, + pg_bin_dir, + pg_lib_dir, + } = Args::parse(); + + let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + + let spec: Spec = { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(&aws_config); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + }; + + match tokio::fs::create_dir(&working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&working_directory) + .await + .context("check if working directory is empty")? + { + anyhow::bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + let pgdata_dir = working_directory.join("pgdata"); + tokio::fs::create_dir(&pgdata_dir) + .await + .context("create pgdata directory")?; + + // + // Setup clients + // + let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms_client = aws_sdk_kms::Client::new(&aws_config); + + // + // Initialize pgdata + // + let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser, + locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, + pg_version: 140000, // XXX: this shouldn't be hard-coded but derived from which compute image we're running in + initdb_bin: pg_bin_dir.join("initdb").as_ref(), + library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. 
+ pgdata: &pgdata_dir, + }) + .await + .context("initdb")?; + + let nproc = num_cpus::get(); + + // + // Launch postgres process + // + let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres")) + .arg("-D") + .arg(&pgdata_dir) + .args(["-c", "wal_level=minimal"]) + .args(["-c", "shared_buffers=10GB"]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args(["-c", "maintenance_work_mem=8388608"]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output( + postgres_proc.stdout.take(), + postgres_proc.stderr.take(), + ) + .instrument(info_span!("postgres")), + ); + let restore_pg_connstring = + format!("host=localhost port=5432 user={superuser} dbname=postgres"); + loop { + let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; + if res.is_ok() { + info!("postgres is ready, could connect to it"); + break; + } + } + + // + // Decrypt connection string + // + let source_connection_string = { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + spec.source_connstring_ciphertext_base64, + )) + .send() + .await + .context("decrypt source connection string")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext source connection string")?; + String::from_utf8(plaintext.into_inner()) + .context("parse source connection string as utf8")? 
+ } + } + }; + + // + // Start the work + // + + let dumpdir = working_directory.join("dumpdir"); + + let common_args = [ + // schema mapping (prob suffices to specify them on one side) + "--no-owner".to_string(), + "--no-privileges".to_string(), + "--no-publications".to_string(), + "--no-security-labels".to_string(), + "--no-subscriptions".to_string(), + "--no-tablespaces".to_string(), + // format + "--format".to_string(), + "directory".to_string(), + // concurrency + "--jobs".to_string(), + num_cpus::get().to_string(), + // progress updates + "--verbose".to_string(), + ]; + + info!("dump into the working directory"); + { + let mut pg_dump = tokio::process::Command::new(pg_bin_dir.join("pg_dump")) + .args(&common_args) + .arg("-f") + .arg(&dumpdir) + .arg("--no-sync") + // POSITIONAL args + // source db (db name included in connection string) + .arg(&source_connection_string) + // how we run it + .env_clear() + .kill_on_drop(true) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn pg_dump")?; + + info!(pid=%pg_dump.id().unwrap(), "spawned pg_dump"); + + tokio::spawn( + child_stdio_to_log::relay_process_output(pg_dump.stdout.take(), pg_dump.stderr.take()) + .instrument(info_span!("pg_dump")), + ); + + let st = pg_dump.wait().await.context("wait for pg_dump")?; + info!(status=?st, "pg_dump exited"); + if !st.success() { + warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + } + } + + // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: do the unlogged table trick + + info!("restore from working directory into vanilla postgres"); + { + let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) + .args(&common_args) + .arg("-d") + .arg(&restore_pg_connstring) + // POSITIONAL args + .arg(&dumpdir) + // how we run it + .env_clear() + .kill_on_drop(true) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn pg_restore")?; + + info!(pid=%pg_restore.id().unwrap(), "spawned pg_restore"); + tokio::spawn( + child_stdio_to_log::relay_process_output( + pg_restore.stdout.take(), + pg_restore.stderr.take(), + ) + .instrument(info_span!("pg_restore")), + ); + let st = pg_restore.wait().await.context("wait for pg_restore")?; + info!(status=?st, "pg_restore exited"); + if !st.success() { + warn!(status=%st, "pg_restore failed, restore will likely fail as well"); + } + } + + info!("shutdown postgres"); + { + nix::sys::signal::kill( + Pid::from_raw( + i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), + ), + nix::sys::signal::SIGTERM, + ) + .context("signal postgres to shut down")?; + postgres_proc + .wait() + .await + .context("wait for postgres to shut down")?; + } + + info!("upload pgdata"); + s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = working_directory.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("status"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) + .await + .context("sync status directory to destination")?; + } + + Ok(()) +} diff --git a/compute_tools/src/bin/fast_import/child_stdio_to_log.rs 
b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs new file mode 100644 index 0000000000..6724ef9bed --- /dev/null +++ b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs @@ -0,0 +1,35 @@ +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::{ChildStderr, ChildStdout}; +use tracing::info; + +/// Asynchronously relays the output from a child process's `stdout` and `stderr` to the tracing log. +/// Each line is read and logged individually, with lossy UTF-8 conversion. +/// +/// # Arguments +/// +/// * `stdout`: An `Option` from the child process. +/// * `stderr`: An `Option` from the child process. +/// +pub(crate) async fn relay_process_output(stdout: Option, stderr: Option) { + let stdout_fut = async { + if let Some(stdout) = stdout { + let reader = BufReader::new(stdout); + let mut lines = reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + info!(fd = "stdout", "{}", line); + } + } + }; + + let stderr_fut = async { + if let Some(stderr) = stderr { + let reader = BufReader::new(stderr); + let mut lines = reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + info!(fd = "stderr", "{}", line); + } + } + }; + + tokio::join!(stdout_fut, stderr_fut); +} diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs new file mode 100644 index 0000000000..52bbef420f --- /dev/null +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -0,0 +1,75 @@ +use anyhow::Result; +use std::str::FromStr; + +/// Struct to hold parsed S3 components +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct S3Uri { + pub bucket: String, + pub key: String, +} + +impl FromStr for S3Uri { + type Err = anyhow::Error; + + /// Parse an S3 URI into a bucket and key + fn from_str(uri: &str) -> Result { + // Ensure the URI starts with "s3://" + if !uri.starts_with("s3://") { + return Err(anyhow::anyhow!("Invalid S3 URI scheme")); + } + + // Remove the "s3://" prefix + let stripped_uri = &uri[5..]; + + // Split the remaining string into bucket and key parts + if let Some((bucket, key)) = stripped_uri.split_once('/') { + Ok(S3Uri { + bucket: bucket.to_string(), + key: key.to_string(), + }) + } else { + Err(anyhow::anyhow!( + "Invalid S3 URI format, missing bucket or key" + )) + } + } +} + +impl S3Uri { + pub fn append(&self, suffix: &str) -> Self { + Self { + bucket: self.bucket.clone(), + key: format!("{}{}", self.key, suffix), + } + } +} + +impl std::fmt::Display for S3Uri { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "s3://{}/{}", self.bucket, self.key) + } +} + +impl clap::builder::TypedValueParser for S3Uri { + type Value = Self; + + fn parse_ref( + &self, + _cmd: &clap::Command, + _arg: Option<&clap::Arg>, + value: &std::ffi::OsStr, + ) -> Result { + let value_str = value.to_str().ok_or_else(|| { + clap::Error::raw( + clap::error::ErrorKind::InvalidUtf8, + "Invalid UTF-8 sequence", + ) + })?; + S3Uri::from_str(value_str).map_err(|e| { + clap::Error::raw( + clap::error::ErrorKind::InvalidValue, + format!("Failed to parse S3 URI: {}", e), + ) + }) + } +} diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/s5cmd.rs new file mode 100644 index 0000000000..d2d9a79736 --- /dev/null +++ b/compute_tools/src/bin/fast_import/s5cmd.rs @@ -0,0 +1,27 @@ +use anyhow::Context; +use camino::Utf8Path; + +use super::s3_uri::S3Uri; + +pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { + let mut builder = 
tokio::process::Command::new("s5cmd"); + // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL + if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { + builder.arg("--endpoint-url").arg(val); + } + builder + .arg("sync") + .arg(local.as_str()) + .arg(remote.to_string()); + let st = builder + .spawn() + .context("spawn s5cmd")? + .wait() + .await + .context("wait for s5cmd")?; + if st.success() { + Ok(()) + } else { + Err(anyhow::anyhow!("s5cmd failed")) + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d4e413034e..d65fe73194 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -116,7 +116,7 @@ pub fn write_postgres_conf( vartype: "enum".to_owned(), }; - write!(file, "{}", opt.to_pg_setting())?; + writeln!(file, "{}", opt.to_pg_setting())?; } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 3677582c11..8a047634df 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -20,6 +20,7 @@ use anyhow::Result; use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use metrics::proto::MetricFamily; use metrics::Encoder; use metrics::TextEncoder; use tokio::task; @@ -72,10 +73,22 @@ async fn routes(req: Request, compute: &Arc) -> Response { debug!("serving /metrics GET request"); - let mut buffer = vec![]; - let metrics = installed_extensions::collect(); + // When we call TextEncoder::encode() below, it will immediately + // return an error if a metric family has no metrics, so we need to + // preemptively filter out metric families with no metrics. + let metrics = installed_extensions::collect() + .into_iter() + .filter(|m| !m.get_metric().is_empty()) + .collect::>(); + let encoder = TextEncoder::new(); - encoder.encode(&metrics, &mut buffer).unwrap(); + let mut buffer = vec![]; + + if let Err(err) = encoder.encode(&metrics, &mut buffer) { + let msg = format!("error handling /metrics request: {err}"); + error!(msg); + return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR); + } match Response::builder() .status(StatusCode::OK) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 6dd55855db..79d8b2ca04 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -115,7 +115,7 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> { static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "installed_extensions", + "compute_installed_extensions", "Number of databases where the version of extension is installed", &["extension_name", "version"] ) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c4063bbd1a..1ea443b026 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1153,6 +1153,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re timeline_info.timeline_id ); } + // TODO: rename to import-basebackup-plus-wal TimelineCmd::Import(args) => { let tenant_id = get_tenant_id(args.tenant_id, env)?; let timeline_id = args.timeline_id; diff --git a/deny.toml b/deny.toml index 8bf643f4ba..7a1eecac99 100644 --- a/deny.toml +++ b/deny.toml @@ -33,7 +33,6 @@ reason = "the marvin attack only affects private key decryption, not public key [licenses] allow = [ "Apache-2.0", - "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", 
"CC0-1.0", @@ -67,7 +66,7 @@ registries = [] # More documentation about the 'bans' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html [bans] -multiple-versions = "warn" +multiple-versions = "allow" wildcards = "allow" highlight = "all" workspace-default-features = "allow" diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 3732bfdab2..1f7e913c07 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -113,21 +113,21 @@ so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites -- Install Python 3.9 (the minimal supported version) or greater. +- Install Python 3.11 (the minimal supported version) or greater. - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: + - If you have some trouble with other version you can resolve it by installing Python 3.11 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update - sudo apt install python3.9 + sudo apt install python3.11 ``` - Install `poetry` - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) so if you have different version some linting tools can yield different result locally vs in the CI. - - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`. + - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.11`. This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 8710904cec..79da05da6c 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -33,6 +33,7 @@ remote_storage.workspace = true postgres_backend.workspace = true nix = {workspace = true, optional = true} reqwest.workspace = true +rand.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index ee20613d6d..7666728427 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -97,6 +97,15 @@ pub struct ConfigToml { pub control_plane_api: Option, pub control_plane_api_token: Option, pub control_plane_emergency_mode: bool, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_upcall_api: Option, + /// Unstable feature: subject to change or removal without notice. + /// See . + pub import_pgdata_upcall_api_token: Option, + /// Unstable feature: subject to change or removal without notice. + /// See . 
+ pub import_pgdata_aws_endpoint_url: Option, pub heatmap_upload_concurrency: usize, pub secondary_download_concurrency: usize, pub virtual_file_io_engine: Option, @@ -386,6 +395,10 @@ impl Default for ConfigToml { control_plane_api_token: (None), control_plane_emergency_mode: (false), + import_pgdata_upcall_api: (None), + import_pgdata_upcall_api_token: (None), + import_pgdata_aws_endpoint_url: (None), + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 401887d362..c55b9e9484 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -48,7 +48,7 @@ pub struct ShardedRange<'a> { // Calculate the size of a range within the blocks of the same relation, or spanning only the // top page in the previous relation's space. -fn contiguous_range_len(range: &Range) -> u32 { +pub fn contiguous_range_len(range: &Range) -> u32 { debug_assert!(is_contiguous_range(range)); if range.start.field6 == 0xffffffff { range.end.field6 + 1 @@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range) -> u32 { /// This matters, because: /// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. /// - Within such ranges, we may calculate distances using simple subtraction of field6. -fn is_contiguous_range(range: &Range) -> bool { +pub fn is_contiguous_range(range: &Range) -> bool { range.start.field1 == range.end.field1 && range.start.field2 == range.end.field2 && range.start.field3 == range.end.field3 diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0dfa1ba817..1b86bfd91a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,6 +2,8 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; pub use utilization::PageserverUtilization; use std::{ @@ -227,6 +229,9 @@ pub enum TimelineCreateRequestMode { // we continue to accept it by having it here. pg_version: Option, }, + ImportPgdata { + import_pgdata: TimelineCreateRequestModeImportPgdata, + }, // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap. // (serde picks the first matching enum variant, in declaration order). Bootstrap { @@ -236,6 +241,42 @@ pub enum TimelineCreateRequestMode { }, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineCreateRequestModeImportPgdata { + pub location: ImportPgdataLocation, + pub idempotency_key: ImportPgdataIdempotencyKey, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub enum ImportPgdataLocation { + #[cfg(feature = "testing")] + LocalFs { path: Utf8PathBuf }, + AwsS3 { + region: String, + bucket: String, + /// A better name for this would be `prefix`; changing requires coordination with cplane. + /// See . 
+ key: String, + }, +} + +#[derive(Serialize, Deserialize, Clone)] +#[serde(transparent)] +pub struct ImportPgdataIdempotencyKey(pub String); + +impl ImportPgdataIdempotencyKey { + pub fn random() -> Self { + use rand::{distributions::Alphanumeric, Rng}; + Self( + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(20) + .map(char::from) + .collect(), + ) + } +} + #[derive(Serialize, Deserialize, Clone)] pub struct LsnLeaseRequest { pub lsn: Lsn, diff --git a/libs/postgres_initdb/Cargo.toml b/libs/postgres_initdb/Cargo.toml new file mode 100644 index 0000000000..1605279bce --- /dev/null +++ b/libs/postgres_initdb/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "postgres_initdb" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +tokio.workspace = true +camino.workspace = true +thiserror.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs new file mode 100644 index 0000000000..2f072354fb --- /dev/null +++ b/libs/postgres_initdb/src/lib.rs @@ -0,0 +1,103 @@ +//! The canonical way we run `initdb` in Neon. +//! +//! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations. +//! +//! This module's job is to eliminate the environment-dependence as much as possible. + +use std::fmt; + +use camino::Utf8Path; + +pub struct RunInitdbArgs<'a> { + pub superuser: &'a str, + pub locale: &'a str, + pub initdb_bin: &'a Utf8Path, + pub pg_version: u32, + pub library_search_path: &'a Utf8Path, + pub pgdata: &'a Utf8Path, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + Spawn(std::io::Error), + Failed { + status: std::process::ExitStatus, + stderr: Vec, + }, + WaitOutput(std::io::Error), + Other(anyhow::Error), +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e), + Error::Failed { status, stderr } => write!( + f, + "Command failed with status {:?}: {}", + status, + String::from_utf8_lossy(stderr) + ), + Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e), + Error::Other(e) => write!(f, "Error: {:?}", e), + } + } +} + +pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { + let RunInitdbArgs { + superuser, + locale, + initdb_bin: initdb_bin_path, + pg_version, + library_search_path, + pgdata, + } = args; + let mut initdb_command = tokio::process::Command::new(initdb_bin_path); + initdb_command + .args(["--pgdata", pgdata.as_ref()]) + .args(["--username", superuser]) + .args(["--encoding", "utf8"]) + .args(["--locale", locale]) + .arg("--no-instructions") + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", library_search_path) + .env("DYLD_LIBRARY_PATH", library_search_path) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()); + + // Before version 14, only the libc provide was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8. While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. 
+ let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?; + + // Ideally we'd select here with the cancellation token, but the problem is that + // we can't safely terminate initdb: it launches processes of its own, and killing + // initdb doesn't kill them. After we return from this function, we want the target + // directory to be able to be cleaned up. + // See https://github.com/neondatabase/neon/issues/6385 + let initdb_output = initdb_proc + .wait_with_output() + .await + .map_err(Error::WaitOutput)?; + if !initdb_output.status.success() { + return Err(Error::Failed { + status: initdb_output.status, + stderr: initdb_output.stderr, + }); + } + + Ok(()) +} diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index b9e5387d86..6c40968496 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -184,9 +184,8 @@ pub struct CancelKeyData { impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // TODO: this is producing strange results, with 0xffffffff........ always in the logs. let hi = (self.backend_pid as u64) << 32; - let lo = self.cancel_key as u64; + let lo = (self.cancel_key as u64) & 0xffffffff; let id = hi | lo; // This format is more compact and might work better for logs. @@ -1047,4 +1046,13 @@ mod tests { let data = [0, 0, 0, 7, 0, 0, 0, 0]; FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err(); } + + #[test] + fn cancel_key_data() { + let key = CancelKeyData { + backend_pid: -1817212860, + cancel_key: -1183897012, + }; + assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)"); + } } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ed56bdcc70..8b701ad189 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -25,6 +25,7 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl use bytes::Bytes; use futures::future::Either; use futures::stream::Stream; +use futures::FutureExt; use futures_util::StreamExt; use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; @@ -32,6 +33,7 @@ use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; +use utils::backoff::exponential_backoff_duration_seconds; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ @@ -311,40 +313,59 @@ impl RemoteStorage for AzureBlobStorage { let mut next_marker = None; + let mut timeout_try_cnt = 1; + 'outer: loop { let mut builder = builder.clone(); if let Some(marker) = next_marker.clone() { builder = builder.marker(marker); } - let response = builder.into_stream(); - let response = response.into_stream().map_err(to_download_error); - let response = tokio_stream::StreamExt::timeout(response, self.timeout); - let response = response.map(|res| match res { - Ok(res) => res, - Err(_elapsed) => Err(DownloadError::Timeout), + // Azure Blob Rust SDK does not expose the list blob API directly. Users have to use + // their pageable iterator wrapper that returns all keys as a stream. We want to have + // full control of paging, and therefore we only take the first item from the stream. 
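// A sketch (illustrative approximation) of the per-page timeout that the following lines compute
// via `exponential_backoff_duration_seconds`:
//
//     // attempt 1 => 10s, attempt 2 => 20s, ..., capped at the configured overall timeout
//     fn page_timeout_secs(attempt: u32, max_secs: f64) -> f64 {
//         (5.0 * 2f64.powi(attempt as i32)).min(max_secs)
//     }
//
// Taking one page at a time means a stuck request can be retried with a longer timeout without
// restarting the whole listing; the `next_marker` taken from each page resumes where it left off.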
+ let mut response_stream = builder.into_stream(); + let response = response_stream.next(); + // Timeout mechanism: Azure client will sometimes get stuck on a request, but retrying that request + // would immediately succeed. Therefore, we use exponential backoff timeout to retry the request. + // (Usually, exponential backoff is used to determine the sleep time between two retries.) We + // start with a 10.0 second timeout, and double the timeout for each failure, up to 5 failures. + // timeout = min(5 * (1.0+1.0)^n, self.timeout). + let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64()); + let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response); + let response = response.map(|res| { + match res { + Ok(Some(Ok(res))) => Ok(Some(res)), + Ok(Some(Err(e))) => Err(to_download_error(e)), + Ok(None) => Ok(None), + Err(_elapsed) => Err(DownloadError::Timeout), + } }); - - let mut response = std::pin::pin!(response); - let mut max_keys = max_keys.map(|mk| mk.get()); let next_item = tokio::select! { - op = response.next() => Ok(op), + op = response => op, _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &next_item { + timeout_try_cnt += 1; + if timeout_try_cnt <= 5 { + continue; + } + } + + let next_item = next_item?; + + if timeout_try_cnt >= 2 { + tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); + } + timeout_try_cnt = 1; + let Some(entry) = next_item else { // The list is complete, so yield it. break; }; let mut res = Listing::default(); - let entry = match entry { - Ok(entry) => entry, - Err(e) => { - // The error is potentially retryable, so we must rewind the loop after yielding. - yield Err(e); - continue; - } - }; next_marker = entry.continuation(); let prefix_iter = entry .blobs @@ -360,7 +381,7 @@ impl RemoteStorage for AzureBlobStorage { last_modified: k.properties.last_modified.into(), size: k.properties.content_length, } - ); + ); for key in blob_iter { res.keys.push(key); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 553153826e..ee2fc9d6e2 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -360,7 +360,12 @@ impl RemoteStorage for LocalFs { let mut objects = Vec::with_capacity(keys.len()); for key in keys { let path = key.with_base(&self.storage_root); - let metadata = file_metadata(&path).await?; + let metadata = file_metadata(&path).await; + if let Err(DownloadError::NotFound) = metadata { + // Race: if the file is deleted between listing and metadata check, ignore it.
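// (The listing and the `file_metadata` call are two separate filesystem operations, so a
// concurrent delete can legitimately surface as NotFound here; skipping the entry avoids
// failing the whole listing for a file that no longer exists.)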
+ continue; + } + let metadata = metadata?; if metadata.is_dir() { continue; } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 545317f958..4aad0aee2c 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -29,6 +29,7 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true +pprof.workspace = true regex.workspace = true routerify.workspace = true serde.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 8ee5abd434..6a85f0ddeb 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,7 +1,8 @@ use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use anyhow::Context; -use hyper::header::{HeaderName, AUTHORIZATION}; +use crate::http::request::{get_query_param, parse_query_param}; +use anyhow::{anyhow, Context}; +use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; @@ -12,11 +13,13 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; +use std::io::Write as _; use std::str::FromStr; +use std::time::Duration; use bytes::{Bytes, BytesMut}; -use std::io::Write as _; -use tokio::sync::mpsc; +use pprof::protos::Message as _; +use tokio::sync::{mpsc, Mutex}; use tokio_stream::wrappers::ReceiverStream; static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { @@ -328,6 +331,82 @@ pub async fn prometheus_metrics_handler(_req: Request) -> Result) -> Result, ApiError> { + enum Format { + Pprof, + Svg, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + let seconds = match parse_query_param(&req, "seconds")? { + None => 5, + Some(seconds @ 1..=30) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + }; + let frequency_hz = match parse_query_param(&req, "frequency")? { + None => 99, + Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), + Some(frequency) => frequency, + }; + + // Only allow one profiler at a time. + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + let _lock = PROFILE_LOCK + .try_lock() + .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + + // Take the profile. + let report = tokio::task::spawn_blocking(move || { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(frequency_hz) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build()?; + std::thread::sleep(Duration::from_secs(seconds)); + guard.report().build() + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + + // Return the report in the requested format. + match format { + Format::Pprof => { + let mut body = Vec::new(); + report + .pprof() + .map_err(|err| ApiError::InternalServerError(err.into()))? 
+ .write_to_vec(&mut body) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Svg => { + let mut body = Vec::new(); + report + .flamegraph(&mut body) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 8b8ed5a67f..7ea71685ec 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -30,7 +30,7 @@ pub fn parse_request_param( } } -fn get_query_param<'a>( +pub fn get_query_param<'a>( request: &'a Request, param_name: &str, ) -> Result>, ApiError> { diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 375b227b99..d99dc25769 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -83,7 +83,9 @@ where } wake_these.push(self.heap.pop().unwrap().wake_channel); } - self.update_status(); + if !wake_these.is_empty() { + self.update_status(); + } wake_these } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 143d8236df..140b287ccc 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -43,6 +43,7 @@ postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true +postgres_initdb.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -68,6 +69,7 @@ url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f7be6ecaab..59ea6fb941 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -144,6 +144,10 @@ pub struct PageServerConf { /// JWT token for use with the control plane API. pub control_plane_api_token: Option, + pub import_pgdata_upcall_api: Option, + pub import_pgdata_upcall_api_token: Option, + pub import_pgdata_aws_endpoint_url: Option, + /// If true, pageserver will make best-effort to operate without a control plane: only /// for use in major incidents. 
pub control_plane_emergency_mode: bool, @@ -328,6 +332,9 @@ impl PageServerConf { control_plane_api, control_plane_api_token, control_plane_emergency_mode, + import_pgdata_upcall_api, + import_pgdata_upcall_api_token, + import_pgdata_aws_endpoint_url, heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, @@ -383,6 +390,9 @@ impl PageServerConf { timeline_offloading, ephemeral_bytes_per_memory_kb, server_side_batch_timeout, + import_pgdata_upcall_api, + import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), + import_pgdata_aws_endpoint_url, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 1f04bc0410..3d02387c98 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; use utils::backoff; +use utils::pausable_failpoint; use crate::metrics; @@ -90,6 +91,7 @@ impl Deleter { /// Block until everything in accumulator has been executed async fn flush(&mut self) -> Result<(), DeletionQueueError> { while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { + pausable_failpoint!("deletion-queue-before-execute-pause"); match self.remote_delete().await { Ok(()) => { // Note: we assume that the remote storage layer returns Ok(()) if some diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2bc7f5ad39..7fb9247feb 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -623,6 +623,8 @@ paths: existing_initdb_timeline_id: type: string format: hex + import_pgdata: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdata" responses: "201": description: Timeline was created, or already existed with matching parameters @@ -979,6 +981,34 @@ components: $ref: "#/components/schemas/TenantConfig" effective_config: $ref: "#/components/schemas/TenantConfig" + TimelineCreateRequestImportPgdata: + type: object + required: + - location + - idempotency_key + properties: + idempotency_key: + type: string + location: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation" + TimelineCreateRequestImportPgdataLocation: + type: object + properties: + AwsS3: + $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3" + TimelineCreateRequestImportPgdataLocationAwsS3: + type: object + properties: + region: + type: string + bucket: + type: string + key: + type: string + required: + - region + - bucket + - key TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 306b0f35ab..ceb1c3b012 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; +use pageserver_api::models::TimelineCreateRequestModeImportPgdata; use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; @@ -55,6 +56,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use 
utils::http::endpoint::profile_cpu_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use utils::http::request::must_parse_query_param; @@ -80,6 +82,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::import_pgdata; use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; @@ -125,7 +128,7 @@ pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, @@ -146,10 +149,13 @@ impl State { deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, ) -> anyhow::Result { - let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); + let allowlist_routes = &[ + "/v1/status", + "/v1/doc", + "/swagger.yml", + "/metrics", + "/profile/cpu", + ]; Ok(Self { conf, tenant_manager, @@ -576,6 +582,35 @@ async fn timeline_create_handler( ancestor_timeline_id, ancestor_start_lsn, }), + TimelineCreateRequestMode::ImportPgdata { + import_pgdata: + TimelineCreateRequestModeImportPgdata { + location, + idempotency_key, + }, + } => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata { + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new( + idempotency_key.0, + ), + new_timeline_id, + location: { + use import_pgdata::index_part_format::Location; + use pageserver_api::models::ImportPgdataLocation; + match location { + #[cfg(feature = "testing")] + ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path }, + ImportPgdataLocation::AwsS3 { + region, + bucket, + key, + } => Location::AwsS3 { + region, + bucket, + key, + }, + } + }, + }), }; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); @@ -3148,7 +3183,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -3167,6 +3202,7 @@ pub fn make_router( Ok(router .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a429dff1fd..5fd02d8749 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1068,21 +1068,26 @@ impl PageServerHandler { )); } - if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus + if request_lsn == Lsn::INVALID { + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); + } + + // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease. 
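// (Illustration with made-up LSNs: with the GC cutoff at 0/5000000, a request for 0/4000000
// without a lease is rejected below as a bad request, unless the timeline is currently blocked
// from GC by an LSN lease deadline, in which case this check is skipped entirely.)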
+ // + // We may have older data available, but we make a best effort to detect this case and return an error, + // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). + if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); if !gc_info.leases.contains_key(&request_lsn) { - // The requested LSN is below gc cutoff and is not guarded by a lease. - - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { + return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + ); } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 5995d1cc57..f4f184be5a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -2276,9 +2276,9 @@ impl<'a> Version<'a> { //--- Metadata structs stored in key-value pairs in the repository. #[derive(Debug, Serialize, Deserialize)] -struct DbDirectory { +pub(crate) struct DbDirectory { // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) - dbdirs: HashMap<(Oid, Oid), bool>, + pub(crate) dbdirs: HashMap<(Oid, Oid), bool>, } // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of @@ -2287,8 +2287,8 @@ struct DbDirectory { // "pg_twophsae/0000000A000002E4". #[derive(Debug, Serialize, Deserialize)] -struct TwoPhaseDirectory { - xids: HashSet, +pub(crate) struct TwoPhaseDirectory { + pub(crate) xids: HashSet, } #[derive(Debug, Serialize, Deserialize)] @@ -2297,12 +2297,12 @@ struct TwoPhaseDirectoryV17 { } #[derive(Debug, Serialize, Deserialize, Default)] -struct RelDirectory { +pub(crate) struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) // // TODO: Store it as a btree or radix tree or something else that spans multiple // key-value pairs, if you have a lot of relations - rels: HashSet<(Oid, u8)>, + pub(crate) rels: HashSet<(Oid, u8)>, } #[derive(Debug, Serialize, Deserialize)] @@ -2311,9 +2311,9 @@ struct RelSizeEntry { } #[derive(Debug, Serialize, Deserialize, Default)] -struct SlruSegmentDirectory { +pub(crate) struct SlruSegmentDirectory { // Set of SLRU segments that exist. 
- segments: HashSet, + pub(crate) segments: HashSet, } #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 6a4e90dd55..622738022a 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -381,6 +381,8 @@ pub enum TaskKind { UnitTest, DetachAncestor, + + ImportPgdata, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e5f69e3c9..0214ee68fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; use tokio::task::JoinSet; @@ -373,7 +375,6 @@ pub struct Tenant { l0_flush_global_state: L0FlushGlobalState, } - impl std::fmt::Debug for Tenant { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) @@ -860,6 +861,7 @@ impl Debug for SetStoppingError { pub(crate) enum CreateTimelineParams { Bootstrap(CreateTimelineParamsBootstrap), Branch(CreateTimelineParamsBranch), + ImportPgdata(CreateTimelineParamsImportPgdata), } #[derive(Debug)] @@ -877,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch { pub(crate) ancestor_start_lsn: Option, } -/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`]. +#[derive(Debug)] +pub(crate) struct CreateTimelineParamsImportPgdata { + pub(crate) new_timeline_id: TimelineId, + pub(crate) location: import_pgdata::index_part_format::Location, + pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey, +} + +/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`]. /// /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`]. /// @@ -907,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency { ancestor_timeline_id: TimelineId, ancestor_start_lsn: Lsn, }, + ImportPgdata(CreatingTimelineIdempotencyImportPgdata), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct CreatingTimelineIdempotencyImportPgdata { + idempotency_key: import_pgdata::index_part_format::IdempotencyKey, } /// What is returned by [`Tenant::start_creating_timeline`]. #[must_use] -enum StartCreatingTimelineResult<'t> { - CreateGuard(TimelineCreateGuard<'t>), +enum StartCreatingTimelineResult { + CreateGuard(TimelineCreateGuard), Idempotent(Arc), } +enum TimelineInitAndSyncResult { + ReadyToActivate(Arc), + NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), +} + +impl TimelineInitAndSyncResult { + fn ready_to_activate(self) -> Option> { + match self { + Self::ReadyToActivate(timeline) => Some(timeline), + _ => None, + } + } +} + +#[must_use] +struct TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline: Arc, + import_pgdata: import_pgdata::index_part_format::Root, + guard: TimelineCreateGuard, +} + /// What is returned by [`Tenant::create_timeline`]. enum CreateTimelineResult { Created(Arc), Idempotent(Arc), + /// IMPORTANT: This [`Arc`] object is not in [`Tenant::timelines`] when + /// we return this result, nor will this concrete object ever be added there. 
+ /// Cf method comment on [`Tenant::create_timeline_import_pgdata`]. + ImportSpawned(Arc), } impl CreateTimelineResult { @@ -927,18 +967,19 @@ impl CreateTimelineResult { match self { Self::Created(_) => "Created", Self::Idempotent(_) => "Idempotent", + Self::ImportSpawned(_) => "ImportSpawned", } } fn timeline(&self) -> &Arc { match self { - Self::Created(t) | Self::Idempotent(t) => t, + Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t, } } /// Unit test timelines aren't activated, test has to do it if it needs to. #[cfg(test)] fn into_timeline_for_test(self) -> Arc { match self { - Self::Created(t) | Self::Idempotent(t) => t, + Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t, } } } @@ -962,33 +1003,13 @@ pub enum CreateTimelineError { } #[derive(thiserror::Error, Debug)] -enum InitdbError { - Other(anyhow::Error), +pub enum InitdbError { + #[error("Operation was cancelled")] Cancelled, - Spawn(std::io::Result<()>), - Failed(std::process::ExitStatus, Vec), -} - -impl fmt::Display for InitdbError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - InitdbError::Cancelled => write!(f, "Operation was cancelled"), - InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e), - InitdbError::Failed(status, stderr) => write!( - f, - "Command failed with status {:?}: {}", - status, - String::from_utf8_lossy(stderr) - ), - InitdbError::Other(e) => write!(f, "Error: {:?}", e), - } - } -} - -impl From for InitdbError { - fn from(error: std::io::Error) -> Self { - InitdbError::Spawn(Err(error)) - } + #[error(transparent)] + Other(anyhow::Error), + #[error(transparent)] + Inner(postgres_initdb::Error), } enum CreateTimelineCause { @@ -996,6 +1017,15 @@ enum CreateTimelineCause { Delete, } +enum LoadTimelineCause { + Attach, + Unoffload, + ImportPgdata { + create_guard: TimelineCreateGuard, + activate: ActivateTimelineArgs, + }, +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GcError { // The tenant is shutting down @@ -1072,24 +1102,35 @@ impl Tenant { /// it is marked as Active. 
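/// Roughly: returns `TimelineInitAndSyncResult::ReadyToActivate` in the common case, or
/// `NeedsSpawnImportPgdata` when the index part still carries an unfinished `import_pgdata`
/// record, in which case the caller is expected to spawn the import task.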
#[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( - &self, + self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - index_part: IndexPart, + mut index_part: IndexPart, metadata: TimelineMetadata, ancestor: Option>, - _ctx: &RequestContext, - ) -> anyhow::Result<()> { + cause: LoadTimelineCause, + ctx: &RequestContext, + ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let idempotency = if metadata.ancestor_timeline().is_none() { - CreateTimelineIdempotency::Bootstrap { - pg_version: metadata.pg_version(), + let import_pgdata = index_part.import_pgdata.take(); + let idempotency = match &import_pgdata { + Some(import_pgdata) => { + CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { + idempotency_key: import_pgdata.idempotency_key().clone(), + }) } - } else { - CreateTimelineIdempotency::Branch { - ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), - ancestor_start_lsn: metadata.ancestor_lsn(), + None => { + if metadata.ancestor_timeline().is_none() { + CreateTimelineIdempotency::Bootstrap { + pg_version: metadata.pg_version(), + } + } else { + CreateTimelineIdempotency::Branch { + ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), + ancestor_start_lsn: metadata.ancestor_lsn(), + } + } } }; @@ -1121,39 +1162,91 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; - { - // avoiding holding it across awaits - let mut timelines_accessor = self.timelines.lock().unwrap(); - match timelines_accessor.entry(timeline_id) { - // We should never try and load the same timeline twice during startup - Entry::Occupied(_) => { - unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + match import_pgdata { + Some(import_pgdata) if !import_pgdata.is_done() => { + match cause { + LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), + LoadTimelineCause::ImportPgdata { .. } => { + unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + } } - Entry::Vacant(v) => { - v.insert(Arc::clone(&timeline)); - timeline.maybe_spawn_flush_loop(); + let mut guard = self.timelines_creating.lock().unwrap(); + if !guard.insert(timeline_id) { + // We should never try and load the same timeline twice during startup + unreachable!("Timeline {tenant_id}/{timeline_id} is already being created") } + let timeline_create_guard = TimelineCreateGuard { + _tenant_gate_guard: self.gate.enter()?, + owning_tenant: self.clone(), + timeline_id, + idempotency, + // The users of this specific return value don't need the timline_path in there. + timeline_path: timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id), + }; + Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata( + TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline, + import_pgdata, + guard: timeline_create_guard, + }, + )) } - }; + Some(_) | None => { + { + let mut timelines_accessor = self.timelines.lock().unwrap(); + match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup + Entry::Occupied(_) => { + unreachable!( + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); + } + Entry::Vacant(v) => { + v.insert(Arc::clone(&timeline)); + timeline.maybe_spawn_flush_loop(); + } + } + } - // Sanity check: a timeline should have some content. 
- anyhow::ensure!( - ancestor.is_some() - || timeline - .layers - .read() - .await - .layer_map() - .expect("currently loading, layer manager cannot be shutdown already") - .iter_historic_layers() - .next() - .is_some(), - "Timeline has no ancestor and no layer files" - ); + // Sanity check: a timeline should have some content. + anyhow::ensure!( + ancestor.is_some() + || timeline + .layers + .read() + .await + .layer_map() + .expect("currently loading, layer manager cannot be shutdown already") + .iter_historic_layers() + .next() + .is_some(), + "Timeline has no ancestor and no layer files" + ); - Ok(()) + match cause { + LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), + LoadTimelineCause::ImportPgdata { + create_guard, + activate, + } => { + // TODO: see the comment in the task code above how I'm not so certain + // it is safe to activate here because of concurrent shutdowns. + match activate { + ActivateTimelineArgs::Yes { broker_client } => { + info!("activating timeline after reload from pgdata import task"); + timeline.activate(self.clone(), broker_client, None, ctx); + } + ActivateTimelineArgs::No => (), + } + drop(create_guard); + } + } + + Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + } + } } /// Attach a tenant that's available in cloud storage. @@ -1578,24 +1671,46 @@ impl Tenant { } // TODO again handle early failure - self.load_remote_timeline( - timeline_id, - index_part, - remote_metadata, - TimelineResources { - remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), - l0_flush_global_state: self.l0_flush_global_state.clone(), - }, - ctx, - ) - .await - .with_context(|| { - format!( - "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_shard_id + let effect = self + .load_remote_timeline( + timeline_id, + index_part, + remote_metadata, + TimelineResources { + remote_client, + timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), + }, + LoadTimelineCause::Attach, + ctx, ) - })?; + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_shard_id + ) + })?; + + match effect { + TimelineInitAndSyncResult::ReadyToActivate(_) => { + // activation happens later, on Tenant::activate + } + TimelineInitAndSyncResult::NeedsSpawnImportPgdata( + TimelineInitAndSyncNeedsSpawnImportPgdata { + timeline, + import_pgdata, + guard, + }, + ) => { + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline, + import_pgdata, + ActivateTimelineArgs::No, + guard, + )); + } + } } // Walk through deleted timelines, resume deletion @@ -1719,13 +1834,14 @@ impl Tenant { #[instrument(skip_all, fields(timeline_id=%timeline_id))] async fn load_remote_timeline( - &self, + self: &Arc, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, resources: TimelineResources, + cause: LoadTimelineCause, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { span::debug_assert_current_span_has_tenant_id(); info!("downloading index file for timeline {}", timeline_id); @@ -1752,6 +1868,7 @@ impl Tenant { index_part, remote_metadata, ancestor, + cause, ctx, ) .await @@ -1938,6 +2055,7 @@ impl Tenant { TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists")) } TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e), + TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled, })?; let timeline_preload = self 
@@ -1976,6 +2094,7 @@ impl Tenant { index_part, remote_metadata, timeline_resources, + LoadTimelineCause::Unoffload, &ctx, ) .await @@ -2213,7 +2332,7 @@ impl Tenant { /// /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys. pub(crate) async fn create_empty_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2263,7 +2382,7 @@ impl Tenant { // Our current tests don't need the background loops. #[cfg(test)] pub async fn create_test_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2302,7 +2421,7 @@ impl Tenant { #[cfg(test)] #[allow(clippy::too_many_arguments)] pub async fn create_test_timeline_with_layers( - &self, + self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, @@ -2439,6 +2558,16 @@ impl Tenant { self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) .await? } + CreateTimelineParams::ImportPgdata(params) => { + self.create_timeline_import_pgdata( + params, + ActivateTimelineArgs::Yes { + broker_client: broker_client.clone(), + }, + ctx, + ) + .await? + } }; // At this point we have dropped our guard on [`Self::timelines_creating`], and @@ -2481,11 +2610,202 @@ impl Tenant { ); timeline } + CreateTimelineResult::ImportSpawned(timeline) => { + info!("import task spawned, timeline will become visible and activated once the import is done"); + timeline + } }; Ok(activated_timeline) } + /// The returned [`Arc`] is NOT in the [`Tenant::timelines`] map until the import + /// completes in the background. A DIFFERENT [`Arc`] will be inserted into the + /// [`Tenant::timelines`] map when the import completes. + /// We only return an [`Arc`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`] + /// for the response. + async fn create_timeline_import_pgdata( + self: &Arc, + params: CreateTimelineParamsImportPgdata, + activate: ActivateTimelineArgs, + ctx: &RequestContext, + ) -> Result { + let CreateTimelineParamsImportPgdata { + new_timeline_id, + location, + idempotency_key, + } = params; + + let started_at = chrono::Utc::now().naive_utc(); + + // + // There's probably a simpler way to upload an index part, but, remote_timeline_client + // is the canonical way we do it. + // - create an empty timeline in-memory + // - use its remote_timeline_client to do the upload + // - dispose of the uninit timeline + // - keep the creation guard alive + + let timeline_create_guard = match self + .start_creating_timeline( + new_timeline_id, + CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { + idempotency_key: idempotency_key.clone(), + }), + ) + .await? 
+ { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(timeline) => { + return Ok(CreateTimelineResult::Idempotent(timeline)) + } + }; + + let mut uninit_timeline = { + let this = &self; + let initdb_lsn = Lsn(0); + let _ctx = ctx; + async move { + let new_metadata = TimelineMetadata::new( + // Initialize disk_consistent LSN to 0, The caller must import some data to + // make it valid, before calling finish_creation() + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + 15, + ); + this.prepare_new_timeline( + new_timeline_id, + &new_metadata, + timeline_create_guard, + initdb_lsn, + None, + ) + .await + } + } + .await?; + + let in_progress = import_pgdata::index_part_format::InProgress { + idempotency_key, + location, + started_at, + }; + let index_part = import_pgdata::index_part_format::Root::V1( + import_pgdata::index_part_format::V1::InProgress(in_progress), + ); + uninit_timeline + .raw_timeline() + .unwrap() + .remote_client + .schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?; + + // wait_completion happens in caller + + let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); + + tokio::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + index_part, + activate, + timeline_create_guard, + )); + + // NB: the timeline doesn't exist in self.timelines at this point + Ok(CreateTimelineResult::ImportSpawned(timeline)) + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] + async fn create_timeline_import_pgdata_task( + self: Arc, + timeline: Arc, + index_part: import_pgdata::index_part_format::Root, + activate: ActivateTimelineArgs, + timeline_create_guard: TimelineCreateGuard, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + info!("starting"); + scopeguard::defer! {info!("exiting")}; + + let res = self + .create_timeline_import_pgdata_task_impl( + timeline, + index_part, + activate, + timeline_create_guard, + ) + .await; + if let Err(err) = &res { + error!(?err, "task failed"); + // TODO sleep & retry, sensitive to tenant shutdown + // TODO: allow timeline deletion requests => should cancel the task + } + } + + async fn create_timeline_import_pgdata_task_impl( + self: Arc, + timeline: Arc, + index_part: import_pgdata::index_part_format::Root, + activate: ActivateTimelineArgs, + timeline_create_guard: TimelineCreateGuard, + ) -> Result<(), anyhow::Error> { + let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn); + + info!("importing pgdata"); + import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) + .await + .context("import")?; + info!("import done"); + + // + // Reload timeline from remote. + // This proves that the remote state is attachable, and it reuses the code. + // + // TODO: think about whether this is safe to do with concurrent Tenant::shutdown. + // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. + // But our activate() call might launch new background tasks after Tenant::shutdown + // already went past shutting down the Tenant::timelines, which this timeline here is no part of. 
+ // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting + // down while bootstrapping/branching + activating), but, the race condition is much more likely + // to manifest because of the long runtime of this import task. + + // in theory this shouldn't even .await anything except for coop yield + info!("shutting down timeline"); + timeline.shutdown(ShutdownMode::Hard).await; + info!("timeline shut down, reloading from remote"); + // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc + // let Some(timeline) = Arc::into_inner(timeline) else { + // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); + // }; + let timeline_id = timeline.timeline_id; + + // load from object storage like Tenant::attach does + let resources = self.build_timeline_resources(timeline_id); + let index_part = resources + .remote_client + .download_index_file(&self.cancel) + .await?; + let index_part = match index_part { + MaybeDeletedIndexPart::Deleted(_) => { + // likely concurrent delete call, cplane should prevent this + anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + let metadata = index_part.metadata.clone(); + self + .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{ + create_guard: timeline_create_guard, activate, }, &ctx) + .await? + .ready_to_activate() + .context("implementation error: reloaded timeline still needs import after import reported success")?; + + anyhow::Ok(()) + } + pub(crate) async fn delete_timeline( self: Arc, timeline_id: TimelineId, @@ -3337,6 +3657,13 @@ where Ok(result) } +enum ActivateTimelineArgs { + Yes { + broker_client: storage_broker::BrokerClientChannel, + }, + No, +} + impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() @@ -3520,6 +3847,7 @@ impl Tenant { /// `validate_ancestor == false` is used when a timeline is created for deletion /// and we might not have the ancestor present anymore which is fine for to be /// deleted timelines. + #[allow(clippy::too_many_arguments)] fn create_timeline_struct( &self, new_timeline_id: TimelineId, @@ -4283,16 +4611,17 @@ impl Tenant { /// If the timeline was already created in the meantime, we check whether this /// request conflicts or is idempotent , based on `state`. 
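/// For example, retrying a creation request with the same `CreateTimelineIdempotency` value is
/// expected to yield `StartCreatingTimelineResult::Idempotent` with the existing timeline rather
/// than an error (an illustrative reading of the variants above, not an exhaustive specification).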
async fn start_creating_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, - ) -> Result, CreateTimelineError> { + ) -> Result { let allow_offloaded = false; match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) { Ok(create_guard) => { pausable_failpoint!("timeline-creation-after-uninit"); Ok(StartCreatingTimelineResult::CreateGuard(create_guard)) } + Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown), Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot // check if this request matches the existing one, so caller must try @@ -4582,7 +4911,7 @@ impl Tenant { &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - create_guard: TimelineCreateGuard<'a>, + create_guard: TimelineCreateGuard, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result> { @@ -4642,7 +4971,7 @@ impl Tenant { /// The `allow_offloaded` parameter controls whether to tolerate the existence of /// offloaded timelines or not. fn create_timeline_create_guard( - &self, + self: &Arc, timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, @@ -4902,48 +5231,16 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); - initdb_command - .args(["--pgdata", initdb_target_dir.as_ref()]) - .args(["--username", &conf.superuser]) - .args(["--encoding", "utf8"]) - .args(["--locale", &conf.locale]) - .arg("--no-instructions") - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", &initdb_lib_dir) - .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdin(std::process::Stdio::null()) - // stdout invocation produces the same output every time, we don't need it - .stdout(std::process::Stdio::null()) - // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()); - - // Before version 14, only the libc provide was available. - if pg_version > 14 { - // Version 17 brought with it a builtin locale provider which only provides - // C and C.UTF-8. While being safer for collation purposes since it is - // guaranteed to be consistent throughout a major release, it is also more - // performant. - let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; - - initdb_command.args(["--locale-provider", locale_provider]); - } - - let initdb_proc = initdb_command.spawn()?; - - // Ideally we'd select here with the cancellation token, but the problem is that - // we can't safely terminate initdb: it launches processes of its own, and killing - // initdb doesn't kill them. After we return from this function, we want the target - // directory to be able to be cleaned up. - // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_proc.wait_with_output().await?; - if !initdb_output.status.success() { - return Err(InitdbError::Failed( - initdb_output.status, - initdb_output.stderr, - )); - } + let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: &conf.superuser, + locale: &conf.locale, + initdb_bin: &initdb_bin_path, + pg_version, + library_search_path: &initdb_lib_dir, + pgdata: initdb_target_dir, + }) + .await + .map_err(InitdbError::Inner); // This isn't true cancellation support, see above. Still return an error to // excercise the cancellation code path. 
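With the spawning and stderr capture moved into the shared `postgres_initdb` crate, `run_initdb` shrinks to cancellation handling plus an error conversion. A minimal sketch of how a caller might surface the reworked `InitdbError` variants follows; the exact `run_initdb` signature and the local names (`conf`, `pgdata_path`, `cancel`) are assumptions for illustration only:

    match run_initdb(conf, &pgdata_path, pg_version, &cancel).await {
        Ok(()) => {}
        Err(InitdbError::Cancelled) => tracing::info!("initdb was cancelled"),
        Err(InitdbError::Inner(postgres_initdb::Error::Failed { status, stderr })) => {
            // stderr is captured by postgres_initdb and returned verbatim
            tracing::error!("initdb failed with {status}: {}", String::from_utf8_lossy(&stderr));
        }
        Err(other) => tracing::error!("initdb error: {other}"),
    }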
@@ -4951,7 +5248,7 @@ async fn run_initdb( return Err(InitdbError::Cancelled); } - Ok(()) + res } /// Dump contents of a layer file to stdout. diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 377bc23542..007bd3eef0 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -199,7 +199,7 @@ use utils::backoff::{ use utils::pausable_failpoint; use utils::shard::ShardNumber; -use std::collections::{HashMap, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; @@ -223,7 +223,7 @@ use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; +use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -244,6 +244,7 @@ use self::index::IndexPart; use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; +use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; @@ -813,6 +814,18 @@ impl RemoteTimelineClient { Ok(need_wait) } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + pub(crate) fn schedule_index_upload_for_import_pgdata_state_update( + self: &Arc, + state: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.import_pgdata = state; + self.schedule_index_upload(upload_queue)?; + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1090,7 +1103,7 @@ impl RemoteTimelineClient { "scheduled layer file upload {layer}", ); - let op = UploadOp::UploadLayer(layer, metadata); + let op = UploadOp::UploadLayer(layer, metadata, None); self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -1805,7 +1818,7 @@ impl RemoteTimelineClient { // have finished. upload_queue.inprogress_tasks.is_empty() } - UploadOp::Delete(_) => { + UploadOp::Delete(..) => { // Wait for preceding uploads to finish. Concurrent deletions are OK, though. upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() } @@ -1833,19 +1846,32 @@ impl RemoteTimelineClient { } // We can launch this task. Remove it from the queue first. - let next_op = upload_queue.queued_operations.pop_front().unwrap(); + let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); debug!("starting op: {}", next_op); - // Update the counters - match next_op { - UploadOp::UploadLayer(_, _) => { + // Update the counters and prepare + match &mut next_op { + UploadOp::UploadLayer(layer, meta, mode) => { + if upload_queue + .recently_deleted + .remove(&(layer.layer_desc().layer_name().clone(), meta.generation)) + { + *mode = Some(OpType::FlushDeletion); + } else { + *mode = Some(OpType::MayReorder) + } upload_queue.num_inprogress_layer_uploads += 1; } UploadOp::UploadMetadata { .. 
} => { upload_queue.num_inprogress_metadata_uploads += 1; } - UploadOp::Delete(_) => { + UploadOp::Delete(Delete { layers }) => { + for (name, meta) in layers { + upload_queue + .recently_deleted + .insert((name.clone(), meta.generation)); + } upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { @@ -1921,7 +1947,66 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata) => { + UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + if let Some(OpType::FlushDeletion) = mode { + if self.config.read().unwrap().block_deletions { + // Of course, this is not efficient... but usually the queue should be empty. + let mut queue_locked = self.upload_queue.lock().unwrap(); + let mut detected = false; + if let Ok(queue) = queue_locked.initialized_mut() { + for list in queue.blocked_deletions.iter_mut() { + list.layers.retain(|(name, meta)| { + if name == &layer.layer_desc().layer_name() + && meta.generation == layer_metadata.generation + { + detected = true; + // remove the layer from deletion queue + false + } else { + // keep the layer + true + } + }); + } + } + if detected { + info!( + "cancelled blocked deletion of layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } + } else { + // TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions + // that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted, + // which is not possible in the current system. + info!( + "waiting for deletion queue flush to complete before uploading layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + { + // We are going to flush, we can clean up the recently deleted list. 
+ let mut queue_locked = self.upload_queue.lock().unwrap(); + if let Ok(queue) = queue_locked.initialized_mut() { + queue.recently_deleted.clear(); + } + } + if let Err(e) = self.deletion_queue_client.flush_execute().await { + warn!( + "failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } else { + info!( + "done flushing deletion queue before uploading layer {} at gen {:?}", + layer.layer_desc().layer_name(), + layer_metadata.generation + ); + } + } + } let local_path = layer.local_path(); // We should only be uploading layers created by this `Tenant`'s lifetime, so @@ -2085,7 +2170,7 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(_, _, _) => { upload_queue.num_inprogress_layer_uploads -= 1; None } @@ -2162,7 +2247,7 @@ impl RemoteTimelineClient { )> { use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize; let res = match op { - UploadOp::UploadLayer(_, m) => ( + UploadOp::UploadLayer(_, m, _) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), @@ -2259,6 +2344,7 @@ impl RemoteTimelineClient { blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), + recently_deleted: HashSet::new(), }; let upload_queue = std::mem::replace( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index efcd20d1bf..d632e595ad 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -706,7 +706,7 @@ where .and_then(|x| x) } -async fn download_retry_forever( +pub(crate) async fn download_retry_forever( op: O, description: &str, cancel: &CancellationToken, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index d8a881a2c4..506990fb2f 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -12,6 +12,7 @@ use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::import_pgdata; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; @@ -37,6 +38,13 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub archived_at: Option, + /// This field supports import-from-pgdata ("fast imports" platform feature). + /// We don't currently use fast imports, so, this field is None for all production timelines. + /// See for more information. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub import_pgdata: Option, + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -90,10 +98,11 @@ impl IndexPart { /// - 7: metadata_bytes is no longer written, but still read /// - 8: added `archived_at` /// - 9: +gc_blocking - const LATEST_VERSION: usize = 9; + /// - 10: +import_pgdata + const LATEST_VERSION: usize = 10; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -108,6 +117,7 @@ impl IndexPart { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, } } @@ -381,6 +391,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -425,6 +436,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -470,6 +482,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -518,6 +531,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -561,6 +575,7 @@ mod tests { lineage: Lineage::default(), gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -607,6 +622,7 @@ mod tests { }, gc_blocking: None, last_aux_file_policy: None, + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -658,6 +674,7 @@ mod tests { }, gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -714,6 +731,7 @@ mod tests { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -771,6 +789,7 @@ mod tests { lineage: Default::default(), gc_blocking: None, last_aux_file_policy: Default::default(), + import_pgdata: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -833,6 +852,83 @@ mod tests { }), last_aux_file_policy: Default::default(), archived_at: None, + import_pgdata: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v10_importpgdata_is_parsed() { + let example = r#"{ + "version": 10, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + } + }"#; + + let expected = IndexPart { + version: 10, + layer_metadata: HashMap::from([ + 
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))) }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 1331c07d05..3df89a928c 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -111,15 +111,6 @@ pub(crate) struct SecondaryTenant { pub(super) heatmap_total_size_metric: UIntGauge, } -impl Drop for SecondaryTenant { - fn drop(&mut self) { - let tenant_id = self.tenant_shard_id.tenant_id.to_string(); - let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); - let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); - let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); - } -} - impl SecondaryTenant { pub(crate) fn new( tenant_shard_id: TenantShardId, @@ -167,6 +158,13 @@ impl SecondaryTenant { // Wait for any secondary downloader work to complete self.gate.close().await; + + self.validate_metrics(); + + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); } pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { @@ -254,6 +252,20 @@ impl SecondaryTenant { .await .expect("secondary eviction should not have panicked"); } + + /// Exhaustive check that incrementally updated metrics match the actual state. 
+ #[cfg(feature = "testing")] + fn validate_metrics(&self) { + let detail = self.detail.lock().unwrap(); + let resident_size = detail.total_resident_size(); + + assert_eq!(resident_size, self.resident_size_metric.get()); + } + + #[cfg(not(feature = "testing"))] + fn validate_metrics(&self) { + // No-op in non-testing builds + } } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 82c5702686..7443261a9c 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -242,6 +242,19 @@ impl SecondaryDetail { } } + #[cfg(feature = "testing")] + pub(crate) fn total_resident_size(&self) -> u64 { + self.timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::() + } + pub(super) fn evict_layer( &mut self, name: LayerName, @@ -763,24 +776,7 @@ impl<'a> TenantDownloader<'a> { } // Metrics consistency check in testing builds - if cfg!(feature = "testing") { - let detail = self.secondary_state.detail.lock().unwrap(); - let resident_size = detail - .timelines - .values() - .map(|tl| { - tl.on_disk_layers - .values() - .map(|v| v.metadata.file_size) - .sum::() - }) - .sum::(); - assert_eq!( - resident_size, - self.secondary_state.resident_size_metric.get() - ); - } - + self.secondary_state.validate_metrics(); // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 95864af4d0..4881be33a6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +pub(crate) mod import_pgdata; mod init; pub mod layer_manager; pub(crate) mod logical_size; @@ -2085,6 +2086,11 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } + pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf.is_gc_blocked_by_lsn_lease_deadline() + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2647,6 +2653,7 @@ impl Timeline { // // NB: generation numbers naturally protect against this because they disambiguate // (1) and (4) + // TODO: this is basically a no-op now, should we remove it? self.remote_client.schedule_barrier()?; // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. @@ -2702,20 +2709,23 @@ impl Timeline { { Some(cancel) => cancel.cancel(), None => { - let state = self.current_state(); - if matches!( - state, - TimelineState::Broken { .. } | TimelineState::Stopping - ) { - - // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). - // Don't make noise. - } else { - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); - debug_assert!(false); + match self.current_state() { + TimelineState::Broken { .. 
} | TimelineState::Stopping => { + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). + // Don't make noise. + } + TimelineState::Loading => { + // Import does not return an activated timeline. + info!("discarding priority boost for logical size calculation because timeline is not yet active"); + } + TimelineState::Active => { + // activation should be setting the once cell + warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); + } } } - }; + } } } diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs new file mode 100644 index 0000000000..de56468580 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -0,0 +1,218 @@ +use std::sync::Arc; + +use anyhow::{bail, Context}; +use remote_storage::RemotePath; +use tokio_util::sync::CancellationToken; +use tracing::{info, info_span, Instrument}; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; + +use super::Timeline; + +mod flow; +mod importbucket_client; +mod importbucket_format; +pub(crate) mod index_part_format; +pub(crate) mod upcall_api; + +pub async fn doit( + timeline: &Arc, + index_part: index_part_format::Root, + ctx: &RequestContext, + cancel: CancellationToken, +) -> anyhow::Result<()> { + let index_part_format::Root::V1(v1) = index_part; + let index_part_format::InProgress { + location, + idempotency_key, + started_at, + } = match v1 { + index_part_format::V1::Done(_) => return Ok(()), + index_part_format::V1::InProgress(in_progress) => in_progress, + }; + + let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + + info!("get spec early so we know we'll be able to upcall when done"); + let Some(spec) = storage.get_spec().await? else { + bail!("spec not found") + }; + + let upcall_client = + upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?; + + // + // send an early progress update to clean up k8s job early and generate potentially useful logs + // + info!("send early progress update"); + upcall_client + .send_progress_until_success(&spec) + .instrument(info_span!("early_progress_update")) + .await?; + + let status_prefix = RemotePath::from_string("status").unwrap(); + + // + // See if shard is done. + // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. + // + let shard_status_key = + status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); + let shard_status: Option = + storage.get_json(&shard_status_key).await?; + info!(?shard_status, "peeking shard status"); + if shard_status.map(|st| st.done).unwrap_or(false) { + info!("shard status indicates that the shard is done, skipping import"); + } else { + // TODO: checkpoint the progress into the IndexPart instead of restarting + // from the beginning. + + // + // Wipe the slate clean - the flow does not allow resuming. + // We can implement resuming in the future by checkpointing the progress into the IndexPart. + // + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? 
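+ // Descriptive note on the block below: it drops every persistent layer by scheduling a GC
+ // update for the full layer set and removing those layers from the layer map via
+ // finish_gc_timeline, leaving the timeline empty for the import to repopulate.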
+ let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefintely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + // + // Do the import + // + info!("do the import"); + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + let _ctx: &RequestContext = ctx; + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. + let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + flow::run( + timeline.clone(), + base_lsn, + control_file, + storage.clone(), + ctx, + ) + .await?; + + // + // Communicate that shard is done. + // + storage + .put_json( + &shard_status_key, + &importbucket_format::ShardStatus { done: true }, + ) + .await + .context("put shard status")?; + } + + // + // Ensure at-least-once deliver of the upcall to cplane + // before we mark the task as done and never come here again. + // + info!("send final progress update"); + upcall_client + .send_progress_until_success(&spec) + .instrument(info_span!("final_progress_update")) + .await?; + + // + // Mark as done in index_part. + // This makes subsequent timeline loads enter the normal load code path + // instead of spawning the import task and calling this here function. 
+ // + info!("mark import as complete in index part"); + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( + index_part_format::V1::Done(index_part_format::Done { + idempotency_key, + started_at, + finished_at: chrono::Utc::now().naive_utc(), + }), + )))?; + + timeline.remote_client.wait_completion().await?; + + Ok(()) +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs new file mode 100644 index 0000000000..cbd4168c06 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -0,0 +1,798 @@ +//! Import a PGDATA directory into an empty root timeline. +//! +//! This module is adapted hackathon code by Heikki and Stas. +//! Other code in the parent module was written by Christian as part of a customer PoC. +//! +//! The hackathon code was producing image layer files as a free-standing program. +//! +//! It has been modified to +//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard) +//! - => sharding-awareness: produce image layers with only the data relevant for this shard +//! - => S3 as the source for the PGDATA instead of local filesystem +//! +//! TODOs before productionization: +//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding. +//! => produced image layers likely too small. +//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size. +//! - asserts / unwraps need to be replaced with errors +//! - don't trust remote objects will be small (=prevent OOMs in those cases) +//! - limit all in-memory buffers in size, or download to disk and read from there +//! - limit task concurrency +//! - generally play nice with other tenants in the system +//! - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits +//! - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc +//! - integrate with layer eviction system +//! - audit for Tenant::cancel nor Timeline::cancel responsivity +//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!) +//! +//! An incomplete set of TODOs from the Hackathon: +//! 
- version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) + +use std::sync::Arc; + +use anyhow::{bail, ensure}; +use bytes::Bytes; + +use itertools::Itertools; +use pageserver_api::{ + key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, + reltag::RelTag, + shard::ShardIdentity, +}; +use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; +use tokio::task::JoinSet; +use tracing::{debug, info_span, instrument, Instrument}; + +use crate::{ + assert_u64_eq_usize::UsizeIsU64, + pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, +}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + pgdatadir_mapping::{DbDirectory, RelDirectory}, + task_mgr::TaskKind, + tenant::storage_layer::{ImageLayerWriter, Layer}, +}; + +use pageserver_api::key::Key; +use pageserver_api::key::{ + slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, + TWOPHASEDIR_KEY, +}; +use pageserver_api::keyspace::singleton_range; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; +use pageserver_api::reltag::SlruKind; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +use std::collections::HashSet; +use std::ops::Range; + +use super::{ + importbucket_client::{ControlFile, RemoteStorageWrapper}, + Timeline, +}; + +use remote_storage::RemotePath; + +pub async fn run( + timeline: Arc, + pgdata_lsn: Lsn, + control_file: ControlFile, + storage: RemoteStorageWrapper, + ctx: &RequestContext, +) -> anyhow::Result<()> { + Flow { + timeline, + pgdata_lsn, + control_file, + tasks: Vec::new(), + storage, + } + .run(ctx) + .await +} + +struct Flow { + timeline: Arc, + pgdata_lsn: Lsn, + control_file: ControlFile, + tasks: Vec, + storage: RemoteStorageWrapper, +} + +impl Flow { + /// Perform the ingestion into [`Self::timeline`]. + /// Assumes the timeline is empty (= no layers). + pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); + + self.pgdata_lsn = pgdata_lsn; + + let datadir = PgDataDir::new(&self.storage).await?; + + // Import dbdir (00:00:00 keyspace) + // This is just constructed here, but will be written to the image layer in the first call to import_db() + let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory { + dbdirs: datadir + .dbs + .iter() + .map(|db| ((db.spcnode, db.dboid), true)) + .collect(), + })?); + self.tasks + .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into()); + + // Import databases (00:spcnode:dbnode keyspace for each db) + for db in datadir.dbs { + self.import_db(&db).await?; + } + + // Import SLRUs + + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) + .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + + // Import pg_twophase. 
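+ // Note: the directory is written out empty below, i.e. prepared transactions in the
+ // source PGDATA are not carried over by this import flow.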
+ // TODO: as empty + let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + })?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + TWOPHASEDIR_KEY, + Bytes::from(twophasedir_buf), + ))); + + // Controlfile, checkpoint + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + CONTROLFILE_KEY, + self.control_file.control_file_buf().clone(), + ))); + + let checkpoint_buf = self + .control_file + .control_file_data() + .checkPointCopy + .encode()?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + CHECKPOINT_KEY, + checkpoint_buf, + ))); + + // Assigns parts of key space to later parallel jobs + let mut last_end_key = Key::MIN; + let mut current_chunk = Vec::new(); + let mut current_chunk_size: usize = 0; + let mut parallel_jobs = Vec::new(); + for task in std::mem::take(&mut self.tasks).into_iter() { + if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + let key_range = last_end_key..task.key_range().start; + parallel_jobs.push(ChunkProcessingJob::new( + key_range.clone(), + std::mem::take(&mut current_chunk), + &self, + )); + last_end_key = key_range.end; + current_chunk_size = 0; + } + current_chunk_size += task.total_size(); + current_chunk.push(task); + } + parallel_jobs.push(ChunkProcessingJob::new( + last_end_key..Key::MAX, + current_chunk, + &self, + )); + + // Start all jobs simultaneosly + let mut work = JoinSet::new(); + // TODO: semaphore? + for job in parallel_jobs { + let ctx: RequestContext = + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); + } + let mut results = Vec::new(); + while let Some(result) = work.join_next().await { + match result { + Ok(res) => { + results.push(res); + } + Err(_joinset_err) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] + async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> { + debug!("start"); + scopeguard::defer! 
{ + debug!("return"); + } + + // Import relmap (00:spcnode:dbnode:00:*:00) + let relmap_key = relmap_file_key(db.spcnode, db.dboid); + debug!("Constructing relmap entry, key {relmap_key}"); + let relmap_path = db.path.join("pg_filenode.map"); + let relmap_buf = self.storage.get(&relmap_path).await?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + relmap_key, relmap_buf, + ))); + + // Import reldir (00:spcnode:dbnode:00:*:01) + let reldir_key = rel_dir_to_key(db.spcnode, db.dboid); + debug!("Constructing reldirs entry, key {reldir_key}"); + let reldir_buf = RelDirectory::ser(&RelDirectory { + rels: db + .files + .iter() + .map(|f| (f.rel_tag.relnode, f.rel_tag.forknum)) + .collect(), + })?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + reldir_key, + Bytes::from(reldir_buf), + ))); + + // Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last + // segment in a given relation (00:spcnode:dbnode:reloid:fork:ff) + for file in &db.files { + debug!(%file.path, %file.filesize, "importing file"); + let len = file.filesize; + ensure!(len % 8192 == 0); + let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192); + let start_key = rel_block_to_key(file.rel_tag, start_blk); + let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); + self.tasks + .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( + *self.timeline.get_shard_identity(), + start_key..end_key, + &file.path, + self.storage.clone(), + ))); + + // Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff) + if let Some(nblocks) = file.nblocks { + let size_key = rel_size_to_key(file.rel_tag); + //debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}"); + let buf = nblocks.to_le_bytes(); + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + size_key, + Bytes::from(buf.to_vec()), + ))); + } + } + + Ok(()) + } + + async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + let segments = self.storage.listfilesindir(path).await?; + let segments: Vec<(String, u32, usize)> = segments + .into_iter() + .filter_map(|(path, size)| { + let filename = path.object_name()?; + let segno = u32::from_str_radix(filename, 16).ok()?; + Some((filename.to_string(), segno, size)) + }) + .collect(); + + // Write SlruDir + let slrudir_key = slru_dir_to_key(kind); + let segnos: HashSet = segments + .iter() + .map(|(_path, segno, _size)| *segno) + .collect(); + let slrudir = SlruSegmentDirectory { segments: segnos }; + let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?; + self.tasks + .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + slrudir_key, + Bytes::from(slrudir_buf), + ))); + + for (segpath, segno, size) in segments { + // SlruSegBlocks for each segment + let p = path.join(&segpath); + let file_size = size; + ensure!(file_size % 8192 == 0); + let nblocks = u32::try_from(file_size / 8192)?; + let start_key = slru_block_to_key(kind, segno, 0); + let end_key = slru_block_to_key(kind, segno, nblocks); + debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); + self.tasks + .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( + *self.timeline.get_shard_identity(), + start_key..end_key, + &p, + self.storage.clone(), + ))); + + // Followed by SlruSegSize + let segsize_key = slru_segment_size_to_key(kind, segno); + let segsize_buf = nblocks.to_le_bytes(); + self.tasks + 
.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new( + segsize_key, + Bytes::copy_from_slice(&segsize_buf), + ))); + } + Ok(()) + } +} + +// +// dbdir iteration tools +// + +struct PgDataDir { + pub dbs: Vec, // spcnode, dboid, path +} + +struct PgDataDirDb { + pub spcnode: u32, + pub dboid: u32, + pub path: RemotePath, + pub files: Vec, +} + +struct PgDataDirDbFile { + pub path: RemotePath, + pub rel_tag: RelTag, + pub segno: u32, + pub filesize: usize, + // Cummulative size of the given fork, set only for the last segment of that fork + pub nblocks: Option, +} + +impl PgDataDir { + async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result { + let datadir_path = storage.pgdata(); + // Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first + // Traverse database in increasing oid order + + let basedir = &datadir_path.join("base"); + let db_oids: Vec<_> = storage + .listdir(basedir) + .await? + .into_iter() + .filter_map(|path| path.object_name().and_then(|name| name.parse::().ok())) + .sorted() + .collect(); + debug!(?db_oids, "found databases"); + let mut databases = Vec::new(); + for dboid in db_oids { + databases.push( + PgDataDirDb::new( + storage, + &basedir.join(dboid.to_string()), + pg_constants::DEFAULTTABLESPACE_OID, + dboid, + &datadir_path, + ) + .await?, + ); + } + + // special case for global catalogs + databases.push( + PgDataDirDb::new( + storage, + &datadir_path.join("global"), + postgres_ffi::pg_constants::GLOBALTABLESPACE_OID, + 0, + &datadir_path, + ) + .await?, + ); + + databases.sort_by_key(|db| (db.spcnode, db.dboid)); + + Ok(Self { dbs: databases }) + } +} + +impl PgDataDirDb { + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))] + async fn new( + storage: &RemoteStorageWrapper, + db_path: &RemotePath, + spcnode: u32, + dboid: u32, + datadir_path: &RemotePath, + ) -> anyhow::Result { + let mut files: Vec = storage + .listfilesindir(db_path) + .await? + .into_iter() + .filter_map(|(path, size)| { + debug!(%path, %size, "found file in dbdir"); + path.object_name().and_then(|name| { + // returns (relnode, forknum, segno) + parse_relfilename(name).ok().map(|x| (size, x)) + }) + }) + .sorted_by_key(|(_, relfilename)| *relfilename) + .map(|(filesize, (relnode, forknum, segno))| { + let rel_tag = RelTag { + spcnode, + dbnode: dboid, + relnode, + forknum, + }; + + let path = datadir_path.join(rel_tag.to_segfile_name(segno)); + assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error + let nblocks = filesize / BLCKSZ as usize; + + PgDataDirDbFile { + path, + filesize, + rel_tag, + segno, + nblocks: Some(nblocks), // first non-cummulative sizes + } + }) + .collect(); + + // Set cummulative sizes. Do all of that math here, so that later we could easier + // parallelize over segments and know with which segments we need to write relsize + // entry. 
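+ // After the loop below, `nblocks` is Some(total block count of the relation) on the last
+ // segment of each relation and None on all earlier segments, so exactly one relsize entry
+ // is written per relation in import_db().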
+ let mut cumulative_nblocks: usize = 0; + let mut prev_rel_tag: Option = None; + for i in 0..files.len() { + if prev_rel_tag == Some(files[i].rel_tag) { + cumulative_nblocks += files[i].nblocks.unwrap(); + } else { + cumulative_nblocks = files[i].nblocks.unwrap(); + } + + files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag { + Some(cumulative_nblocks) + } else { + None + }; + + prev_rel_tag = Some(files[i].rel_tag); + } + + Ok(PgDataDirDb { + files, + path: db_path.clone(), + spcnode, + dboid, + }) + } +} + +trait ImportTask { + fn key_range(&self) -> Range; + + fn total_size(&self) -> usize { + // TODO: revisit this + if is_contiguous_range(&self.key_range()) { + contiguous_range_len(&self.key_range()) as usize * 8192 + } else { + u32::MAX as usize + } + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result; +} + +struct ImportSingleKeyTask { + key: Key, + buf: Bytes, +} + +impl ImportSingleKeyTask { + fn new(key: Key, buf: Bytes) -> Self { + ImportSingleKeyTask { key, buf } + } +} + +impl ImportTask for ImportSingleKeyTask { + fn key_range(&self) -> Range { + singleton_range(self.key) + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + layer_writer.put_image(self.key, self.buf, ctx).await?; + Ok(1) + } +} + +struct ImportRelBlocksTask { + shard_identity: ShardIdentity, + key_range: Range, + path: RemotePath, + storage: RemoteStorageWrapper, +} + +impl ImportRelBlocksTask { + fn new( + shard_identity: ShardIdentity, + key_range: Range, + path: &RemotePath, + storage: RemoteStorageWrapper, + ) -> Self { + ImportRelBlocksTask { + shard_identity, + key_range, + path: path.clone(), + storage, + } + } +} + +impl ImportTask for ImportRelBlocksTask { + fn key_range(&self) -> Range { + self.key_range.clone() + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))] + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + debug!("Importing relation file"); + + let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?; + let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?; + assert_eq!(rel_tag, rel_tag_end); + + let ranges = (start_blk..end_blk) + .enumerate() + .filter_map(|(i, blknum)| { + let key = rel_block_to_key(rel_tag, blknum); + if self.shard_identity.is_key_disposable(&key) { + return None; + } + let file_offset = i.checked_mul(8192).unwrap(); + Some(( + vec![key], + file_offset, + file_offset.checked_add(8192).unwrap(), + )) + }) + .coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| { + assert_eq!(key.len(), 1); + assert!(!acc.is_empty()); + assert!(acc_end > acc_start); + if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ { + acc.push(key.pop().unwrap()); + Ok((acc, acc_start, end)) + } else { + Err(((acc, acc_start, acc_end), (key, start, end))) + } + }); + + let mut nimages = 0; + for (keys, range_start, range_end) in ranges { + let range_buf = self + .storage + .get_range(&self.path, range_start.into_u64(), range_end.into_u64()) + .await?; + let mut buf = Bytes::from(range_buf); + // TODO: batched writes + for key in keys { + let image = buf.split_to(8192); + layer_writer.put_image(key, image, ctx).await?; + nimages += 1; + } + } + + Ok(nimages) + } +} + +struct ImportSlruBlocksTask { + shard_identity: ShardIdentity, + key_range: Range, + path: RemotePath, + 
storage: RemoteStorageWrapper, +} + +impl ImportSlruBlocksTask { + fn new( + shard_identity: ShardIdentity, + key_range: Range, + path: &RemotePath, + storage: RemoteStorageWrapper, + ) -> Self { + ImportSlruBlocksTask { + shard_identity, + key_range, + path: path.clone(), + storage, + } + } +} + +impl ImportTask for ImportSlruBlocksTask { + fn key_range(&self) -> Range { + self.key_range.clone() + } + + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + debug!("Importing SLRU segment file {}", self.path); + let buf = self.storage.get(&self.path).await?; + + let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?; + let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?; + let mut blknum = start_blk; + let mut nimages = 0; + let mut file_offset = 0; + while blknum < end_blk { + let key = slru_block_to_key(kind, segno, blknum); + assert!( + !self.shard_identity.is_key_disposable(&key), + "SLRU keys need to go into every shard" + ); + let buf = &buf[file_offset..(file_offset + 8192)]; + file_offset += 8192; + layer_writer + .put_image(key, Bytes::copy_from_slice(buf), ctx) + .await?; + blknum += 1; + nimages += 1; + } + Ok(nimages) + } +} + +enum AnyImportTask { + SingleKey(ImportSingleKeyTask), + RelBlocks(ImportRelBlocksTask), + SlruBlocks(ImportSlruBlocksTask), +} + +impl ImportTask for AnyImportTask { + fn key_range(&self) -> Range { + match self { + Self::SingleKey(t) => t.key_range(), + Self::RelBlocks(t) => t.key_range(), + Self::SlruBlocks(t) => t.key_range(), + } + } + /// returns the number of images put into the `layer_writer` + async fn doit( + self, + layer_writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + match self { + Self::SingleKey(t) => t.doit(layer_writer, ctx).await, + Self::RelBlocks(t) => t.doit(layer_writer, ctx).await, + Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await, + } + } +} + +impl From for AnyImportTask { + fn from(t: ImportSingleKeyTask) -> Self { + Self::SingleKey(t) + } +} + +impl From for AnyImportTask { + fn from(t: ImportRelBlocksTask) -> Self { + Self::RelBlocks(t) + } +} + +impl From for AnyImportTask { + fn from(t: ImportSlruBlocksTask) -> Self { + Self::SlruBlocks(t) + } +} + +struct ChunkProcessingJob { + timeline: Arc, + range: Range, + tasks: Vec, + + pgdata_lsn: Lsn, +} + +impl ChunkProcessingJob { + fn new(range: Range, tasks: Vec, env: &Flow) -> Self { + assert!(env.pgdata_lsn.is_valid()); + Self { + timeline: env.timeline.clone(), + range, + tasks, + pgdata_lsn: env.pgdata_lsn, + } + } + + async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + let mut writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + &self.range, + self.pgdata_lsn, + ctx, + ) + .await?; + + let mut nimages = 0; + for task in self.tasks { + nimages += task.doit(&mut writer, ctx).await?; + } + + let resident_layer = if nimages > 0 { + let (desc, path) = writer.finish(ctx).await?; + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + } else { + // dropping the writer cleans up + return Ok(()); + }; + + // this is sharing the same code as create_image_layers + let mut guard = self.timeline.layers.write().await; + guard + .open_mut()? 
+ .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + crate::tenant::timeline::drop_wlock(guard); + + // Schedule the layer for upload but don't add barriers such as + // wait for completion or index upload, so we don't inhibit upload parallelism. + // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) + // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. + self.timeline + .remote_client + .schedule_layer_file_upload(resident_layer)?; + + Ok(()) + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs new file mode 100644 index 0000000000..8d5ab1780f --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -0,0 +1,315 @@ +use std::{ops::Bound, sync::Arc}; + +use anyhow::Context; +use bytes::Bytes; +use postgres_ffi::ControlFileData; +use remote_storage::{ + Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, +}; +use serde::de::DeserializeOwned; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, instrument}; +use utils::lsn::Lsn; + +use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; + +use super::{importbucket_format, index_part_format}; + +pub async fn new( + conf: &'static PageServerConf, + location: &index_part_format::Location, + cancel: CancellationToken, +) -> Result { + // FIXME: we probably want some timeout, and we might be able to assume the max file + // size on S3 is 1GiB (postgres segment size). But the problem is that the individual + // downloaders don't know enough about concurrent downloads to make a guess on the + // expected bandwidth and resulting best timeout. + let timeout = std::time::Duration::from_secs(24 * 60 * 60); + let location_storage = match location { + #[cfg(feature = "testing")] + index_part_format::Location::LocalFs { path } => { + GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?) + } + index_part_format::Location::AwsS3 { + region, + bucket, + key, + } => { + // TODO: think about security implications of letting the client specify the bucket & prefix. + // It's the most flexible right now, but, possibly we want to move bucket name into PS conf + // and force the timeline_id into the prefix? + GenericRemoteStorage::AwsS3(Arc::new( + remote_storage::S3Bucket::new( + &remote_storage::S3Config { + bucket_name: bucket.clone(), + prefix_in_bucket: Some(key.clone()), + bucket_region: region.clone(), + endpoint: conf + .import_pgdata_aws_endpoint_url + .clone() + .map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env + concurrency_limit: 100.try_into().unwrap(), // TODO: think about this + max_keys_per_list_response: Some(1000), // TODO: think about this + upload_storage_class: None, // irrelevant + }, + timeout, + ) + .await + .context("setup s3 bucket")?, + )) + } + }; + let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel); + Ok(storage_wrapper) +} + +/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API +/// such as [`tokio::fs`], which was used in the original implementation of the import code. 
+#[derive(Clone)] +pub struct RemoteStorageWrapper { + storage: GenericRemoteStorage, + cancel: CancellationToken, +} + +impl RemoteStorageWrapper { + pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self { + Self { storage, cancel } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn listfilesindir( + &self, + path: &RemotePath, + ) -> Result, DownloadError> { + assert!( + path.object_name().is_some(), + "must specify dirname, without trailing slash" + ); + let path = path.add_trailing_slash(); + + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Listing { keys, prefixes: _ } = self + .storage + .list( + Some(&path), + remote_storage::ListingMode::WithDelimiter, + None, + &self.cancel, + ) + .await?; + let res = keys + .into_iter() + .map(|ListingObject { key, size, .. }| (key, size.into_usize())) + .collect(); + Ok(res) + }, + &format!("listfilesindir {path:?}"), + &self.cancel, + ) + .await; + debug!(?res, "returning"); + res + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn listdir(&self, path: &RemotePath) -> Result, DownloadError> { + assert!( + path.object_name().is_some(), + "must specify dirname, without trailing slash" + ); + let path = path.add_trailing_slash(); + + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Listing { keys, prefixes } = self + .storage + .list( + Some(&path), + remote_storage::ListingMode::WithDelimiter, + None, + &self.cancel, + ) + .await?; + let res = keys + .into_iter() + .map(|ListingObject { key, .. }| key) + .chain(prefixes.into_iter()) + .collect(); + Ok(res) + }, + &format!("listdir {path:?}"), + &self.cancel, + ) + .await; + debug!(?res, "returning"); + res + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get(&self, path: &RemotePath) -> Result { + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Download { + download_stream, .. + } = self + .storage + .download(path, &DownloadOpts::default(), &self.cancel) + .await?; + let mut reader = tokio_util::io::StreamReader::new(download_stream); + + // XXX optimize this, can we get the capacity hint from somewhere? 
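+ // For now the entire object is buffered in memory before being returned as `Bytes`.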
+ let mut buf = Vec::new(); + tokio::io::copy_buf(&mut reader, &mut buf).await?; + Ok(Bytes::from(buf)) + }, + &format!("download {path:?}"), + &self.cancel, + ) + .await; + debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); + res + } + + pub async fn get_spec(&self) -> Result, anyhow::Error> { + self.get_json(&RemotePath::from_string("spec.json").unwrap()) + .await + .context("get spec") + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get_json( + &self, + path: &RemotePath, + ) -> Result, DownloadError> { + let buf = match self.get(path).await { + Ok(buf) => buf, + Err(DownloadError::NotFound) => return Ok(None), + Err(err) => return Err(err), + }; + let res = serde_json::from_slice(&buf) + .context("serialize") + // TODO: own error type + .map_err(DownloadError::Other)?; + Ok(Some(res)) + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> + where + T: serde::Serialize, + { + let buf = serde_json::to_vec(value)?; + let bytes = Bytes::from(buf); + utils::backoff::retry( + || async { + let size = bytes.len(); + let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); + self.storage + .upload_storage_object(bytes, size, path, &self.cancel) + .await + }, + remote_storage::TimeoutOrCancel::caused_by_cancel, + 1, + u32::MAX, + &format!("put json {path}"), + &self.cancel, + ) + .await + .expect("practically infinite retries") + } + + #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] + pub async fn get_range( + &self, + path: &RemotePath, + start_inclusive: u64, + end_exclusive: u64, + ) -> Result, DownloadError> { + let len = end_exclusive + .checked_sub(start_inclusive) + .unwrap() + .into_usize(); + let res = crate::tenant::remote_timeline_client::download::download_retry_forever( + || async { + let Download { + download_stream, .. + } = self + .storage + .download( + path, + &DownloadOpts { + etag: None, + byte_start: Bound::Included(start_inclusive), + byte_end: Bound::Excluded(end_exclusive) + }, + &self.cancel) + .await?; + let mut reader = tokio_util::io::StreamReader::new(download_stream); + + let mut buf = Vec::with_capacity(len); + tokio::io::copy_buf(&mut reader, &mut buf).await?; + Ok(buf) + }, + &format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"), + &self.cancel, + ) + .await; + debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done"); + res + } + + pub fn pgdata(&self) -> RemotePath { + RemotePath::from_string("pgdata").unwrap() + } + + pub async fn get_control_file(&self) -> Result { + let control_file_path = self.pgdata().join("global/pg_control"); + info!("get control file from {control_file_path}"); + let control_file_buf = self.get(&control_file_path).await?; + ControlFile::new(control_file_buf) + } +} + +pub struct ControlFile { + control_file_data: ControlFileData, + control_file_buf: Bytes, +} + +impl ControlFile { + pub(crate) fn new(control_file_buf: Bytes) -> Result { + // XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes. 
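+ // In this flow we only read checkPoint, checkPointCopy and catalog_version_no from the
+ // decoded struct; see try_pg_version() below for the catalog-version mapping.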
+ let control_file_data = ControlFileData::decode(&control_file_buf)?; + let control_file = ControlFile { + control_file_data, + control_file_buf, + }; + control_file.try_pg_version()?; // so that we can offer infallible pg_version() + Ok(control_file) + } + pub(crate) fn base_lsn(&self) -> Lsn { + Lsn(self.control_file_data.checkPoint).align() + } + pub(crate) fn pg_version(&self) -> u32 { + self.try_pg_version() + .expect("prepare() checks that try_pg_version doesn't error") + } + pub(crate) fn control_file_data(&self) -> &ControlFileData { + &self.control_file_data + } + pub(crate) fn control_file_buf(&self) -> &Bytes { + &self.control_file_buf + } + fn try_pg_version(&self) -> anyhow::Result { + Ok(match self.control_file_data.catalog_version_no { + // thesea are from catversion.h + 202107181 => 14, + 202209061 => 15, + 202307071 => 16, + /* XXX pg17 */ + catversion => { + anyhow::bail!("unrecognized catalog version {catversion}") + } + }) + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs new file mode 100644 index 0000000000..04ba3c6f1f --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -0,0 +1,20 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct PgdataStatus { + pub done: bool, + // TODO: remaining fields +} + +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardStatus { + pub done: bool, + // TODO: remaining fields +} + +// TODO: dedupe with fast_import code +#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] +pub struct Spec { + pub project_id: String, + pub branch_id: String, +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs new file mode 100644 index 0000000000..310d97a6a9 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -0,0 +1,68 @@ +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Root { + V1(V1), +} +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum V1 { + InProgress(InProgress), + Done(Done), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(transparent)] +pub struct IdempotencyKey(String); + +impl IdempotencyKey { + pub fn new(s: String) -> Self { + Self(s) + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct InProgress { + pub idempotency_key: IdempotencyKey, + pub location: Location, + pub started_at: chrono::NaiveDateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct Done { + pub idempotency_key: IdempotencyKey, + pub started_at: chrono::NaiveDateTime, + pub finished_at: chrono::NaiveDateTime, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Location { + #[cfg(feature = "testing")] + LocalFs { path: Utf8PathBuf }, + AwsS3 { + region: String, + bucket: String, + key: String, + }, +} + +impl Root { + pub fn is_done(&self) -> bool { + match self { + Root::V1(v1) => match v1 { + V1::Done(_) => true, + V1::InProgress(_) => false, + }, + } + } + pub fn idempotency_key(&self) -> &IdempotencyKey { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => &in_progress.idempotency_key, + V1::Done(done) => 
&done.idempotency_key, + }, + } + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs new file mode 100644 index 0000000000..c5210f9a30 --- /dev/null +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -0,0 +1,119 @@ +//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. +use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::error; + +use crate::config::PageServerConf; +use reqwest::Method; + +use super::importbucket_format::Spec; + +pub struct Client { + base_url: String, + authorization_header: Option, + client: reqwest::Client, + cancel: CancellationToken, +} + +pub type Result = std::result::Result; + +#[derive(Serialize, Deserialize, Debug)] +struct ImportProgressRequest { + // no fields yet, not sure if there every will be any +} + +#[derive(Serialize, Deserialize, Debug)] +struct ImportProgressResponse { + // we don't care +} + +impl Client { + pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result { + let Some(ref base_url) = conf.import_pgdata_upcall_api else { + anyhow::bail!("import_pgdata_upcall_api is not configured") + }; + Ok(Self { + base_url: base_url.to_string(), + client: reqwest::Client::new(), + cancel, + authorization_header: conf + .import_pgdata_upcall_api_token + .as_ref() + .map(|secret_string| secret_string.get_contents()) + .map(|jwt| format!("Bearer {jwt}")), + }) + } + + fn start_request( + &self, + method: Method, + uri: U, + ) -> reqwest::RequestBuilder { + let req = self.client.request(method, uri); + if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + } + } + + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; + let response = res.error_from_body().await?; + Ok(response) + } + + pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> { + let url = format!( + "{}/projects/{}/branches/{}/import_progress", + self.base_url, spec.project_id, spec.branch_id + ); + let ImportProgressResponse {} = self + .request(Method::POST, url, &ImportProgressRequest {}) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody)?; + Ok(()) + } + + pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> { + loop { + match self.send_progress_once(spec).await { + Ok(()) => return Ok(()), + Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")), + Err(err) => { + error!(?err, "error sending progress, retrying"); + if tokio::time::timeout( + std::time::Duration::from_secs(10), + self.cancel.cancelled(), + ) + .await + .is_ok() + { + anyhow::bail!("cancelled while sending early progress update"); + } + } + } + } + } +} diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index a93bdde3f8..80a09b4840 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, @@ -23,14 +23,14 @@ use super::Timeline; pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard)>, ) -> Self { Self { owning_tenant, @@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> { } } + pub(crate) fn finish_creation_myself(&mut self) -> (Arc, TimelineCreateGuard) { + self.raw_timeline.take().expect("already checked") + } + /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( self, @@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { /// A guard for timeline creations in process: as long as this object exists, the timeline ID /// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineCreateGuard<'t> { - owning_tenant: &'t Tenant, - timeline_id: TimelineId, +pub(crate) struct TimelineCreateGuard { + pub(crate) _tenant_gate_guard: GateGuard, + pub(crate) owning_tenant: Arc, + pub(crate) timeline_id: TimelineId, pub(crate) timeline_path: Utf8PathBuf, pub(crate) idempotency: CreateTimelineIdempotency, } @@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError { }, #[error("Already creating")] AlreadyCreating, + #[error("Shutting down")] + ShuttingDown, // e.g. I/O errors, or some failure deep in postgres initdb #[error(transparent)] Other(#[from] anyhow::Error), } -impl<'t> TimelineCreateGuard<'t> { +impl TimelineCreateGuard { pub(crate) fn new( - owning_tenant: &'t Tenant, + owning_tenant: &Arc, timeline_id: TimelineId, timeline_path: Utf8PathBuf, idempotency: CreateTimelineIdempotency, allow_offloaded: bool, ) -> Result { + let _tenant_gate_guard = owning_tenant + .gate + .enter() + .map_err(|_| TimelineExclusionError::ShuttingDown)?; + // Lock order: this is the only place we take both locks. 
During drop() we only // lock creating_timelines let timelines = owning_tenant.timelines.lock().unwrap(); @@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> { return Err(TimelineExclusionError::AlreadyCreating); } creating_timelines.insert(timeline_id); + drop(creating_timelines); + drop(timelines_offloaded); + drop(timelines); Ok(Self { - owning_tenant, + _tenant_gate_guard, + owning_tenant: Arc::clone(owning_tenant), timeline_id, timeline_path, idempotency, @@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> { } } -impl Drop for TimelineCreateGuard<'_> { +impl Drop for TimelineCreateGuard { fn drop(&mut self) { self.owning_tenant .timelines_creating diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index f14bf2f8c3..ef3aa759f3 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use std::collections::HashSet; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; @@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; -#[cfg(feature = "testing")] use utils::generation::Generation; // clippy warns that Uninitialized is much smaller than Initialized, which wastes @@ -38,6 +38,12 @@ impl UploadQueue { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub(crate) enum OpType { + MayReorder, + FlushDeletion, +} + /// This keeps track of queued and in-progress tasks. pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs @@ -88,6 +94,9 @@ pub(crate) struct UploadQueueInitialized { #[cfg(feature = "testing")] pub(crate) dangling_files: HashMap, + /// Ensure we order file operations correctly. + pub(crate) recently_deleted: HashSet<(LayerName, Generation)>, + /// Deletions that are blocked by the tenant configuration pub(crate) blocked_deletions: Vec, @@ -183,6 +192,7 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + recently_deleted: HashSet::new(), blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), @@ -224,6 +234,7 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + recently_deleted: HashSet::new(), blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), @@ -282,8 +293,8 @@ pub(crate) struct Delete { #[derive(Debug)] pub(crate) enum UploadOp { - /// Upload a layer file - UploadLayer(ResidentLayer, LayerFileMetadata), + /// Upload a layer file. The last field indicates the last operation for thie file. + UploadLayer(ResidentLayer, LayerFileMetadata, Option), /// Upload a index_part.json file UploadMetadata { @@ -305,11 +316,11 @@ pub(crate) enum UploadOp { impl std::fmt::Display for UploadOp { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - UploadOp::UploadLayer(layer, metadata) => { + UploadOp::UploadLayer(layer, metadata, mode) => { write!( f, - "UploadLayer({}, size={:?}, gen={:?})", - layer, metadata.file_size, metadata.generation + "UploadLayer({}, size={:?}, gen={:?}, mode={:?})", + layer, metadata.file_size, metadata.generation, mode ) } UploadOp::UploadMetadata { uploaded, .. 
} => { diff --git a/poetry.lock b/poetry.lock index 6171f92391..e2fca7be47 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -114,7 +114,6 @@ files = [ [package.dependencies] aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<6.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" frozenlist = ">=1.1.1" multidict = ">=4.5,<7.0" @@ -219,10 +218,8 @@ files = [ ] [package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} idna = ">=2.8" sniffio = ">=1.1" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] @@ -737,10 +734,7 @@ files = [ [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = [ - {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, -] +urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""} [package.extras] crt = ["awscrt (==0.19.19)"] @@ -1069,20 +1063,6 @@ docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] ssh = ["paramiko (>=2.4.3)"] websockets = ["websocket-client (>=1.3.0)"] -[[package]] -name = "exceptiongroup" -version = "1.1.1" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, -] - -[package.extras] -test = ["pytest (>=6)"] - [[package]] name = "execnet" version = "1.9.0" @@ -1110,7 +1090,6 @@ files = [ [package.dependencies] click = ">=8.0" -importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" Werkzeug = ">=2.2.2" @@ -1319,25 +1298,6 @@ files = [ {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] -[[package]] -name = "importlib-metadata" -version = "4.12.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, - {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, -] - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] -perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] - [[package]] name = "iniconfig" version = "1.1.1" @@ -1898,48 +1858,54 @@ files = [ [[package]] name = "mypy" -version = "1.3.0" +version = "1.13.0" description = "Optional static typing for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = 
"mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"}, - {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"}, - {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"}, - {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"}, - {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"}, - {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"}, - {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"}, - {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"}, - {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"}, - {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"}, - {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"}, - {file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"}, - {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"}, - {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"}, - {file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"}, - {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"}, - {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"}, - {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"}, - {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"}, - {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"}, - {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"}, - {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"}, - {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"}, - {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"}, - {file = "mypy-1.3.0-py3-none-any.whl", hash = 
"sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"}, - {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, + {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, + {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, + {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, + {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, + {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, + {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, + {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, + {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, + {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, + {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, + {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, + {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, + {file = 
"mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, + {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, + {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, + {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, + {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, + {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, + {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, + {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=3.10" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] install-types = ["pip"] -python2 = ["typed-ast (>=1.4.0,<2)"] +mypyc = ["setuptools (>=50)"] reports = ["lxml"] [[package]] @@ -2514,11 +2480,9 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] @@ -2581,10 +2545,7 @@ files = [ ] [package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] +pytest = {version = ">=6.2.4", markers = "python_version >= \"3.10\""} [[package]] name = "pytest-repeat" @@ -3092,17 +3053,6 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - [[package]] name = "types-jwcrypto" version = "1.5.0.20240925" @@ -3359,16 +3309,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = 
"wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3523,21 +3463,6 @@ idna = ">=2.0" multidict = ">=4.0" propcache = ">=0.2.0" -[[package]] -name = "zipp" -version = "3.19.1" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, - {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, -] - -[package.extras] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] - [[package]] name = "zstandard" version = "0.21.0" @@ -3598,5 +3523,5 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" -python-versions = "^3.9" -content-hash = "8cb9c38d83eec441391c0528ac2fbefde18c734373b2399e07c69382044e8ced" +python-versions = "^3.11" +content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 5772471486..bf7a1cb070 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ 
b/proxy/src/auth/backend/console_redirect.rs @@ -6,6 +6,7 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; @@ -74,10 +75,10 @@ impl ConsoleRedirectBackend { ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { + ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(ConsoleRedirectNodeInfo) + .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) } } @@ -102,7 +103,7 @@ async fn authenticate( auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result { +) -> auth::Result<(NodeInfo, Option>)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. @@ -176,9 +177,12 @@ async fn authenticate( config.password(password.as_ref()); } - Ok(NodeInfo { - config, - aux: db_info.aux, - allow_self_signed_compute: false, // caller may override - }) + Ok(( + NodeInfo { + config, + aux: db_info.aux, + allow_self_signed_compute: false, // caller may override + }, + db_info.allowed_ips, + )) } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index f721d81aa2..517d4fd34b 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -132,6 +132,93 @@ struct JwkSet<'a> { keys: Vec<&'a RawValue>, } +/// Given a jwks_url, fetch the JWKS and parse out all the signing JWKs. +/// Returns `None` and log a warning if there are any errors. +async fn fetch_jwks( + client: &reqwest_middleware::ClientWithMiddleware, + jwks_url: url::Url, +) -> Option { + let req = client.get(jwks_url.clone()); + // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. + let resp = req.send().await.and_then(|r| { + r.error_for_status() + .map_err(reqwest_middleware::Error::Reqwest) + }); + + let resp = match resp { + Ok(r) => r, + // TODO: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not fetch JWKs"); + return None; + } + }; + + let resp: http::Response = resp.into(); + + let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE).await { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); + return None; + } + }; + + let jwks = match serde_json::from_slice::(&bytes) { + Ok(jwks) => jwks, + Err(e) => { + tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs"); + return None; + } + }; + + // `jose_jwk::Jwk` is quite large (288 bytes). Let's not pre-allocate for what we don't need. + // + // Even though we limit our responses to 64KiB, we could still receive a payload like + // `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`. Parsing this as `RawValue` uses 468KiB. + // Pre-allocating the corresponding `Vec::::with_capacity(30000)` uses 8.2MiB. 
+ let mut keys = vec![]; + + let mut failed = 0; + for key in jwks.keys { + let key = match serde_json::from_str::(key.get()) { + Ok(key) => key, + Err(e) => { + tracing::debug!(url=?jwks_url, failed=?e, "could not decode JWK"); + failed += 1; + continue; + } + }; + + // if `use` (called `cls` in rust) is specified to be something other than signing, + // we can skip storing it. + if key + .prm + .cls + .as_ref() + .is_some_and(|c| *c != jose_jwk::Class::Signing) + { + continue; + } + + keys.push(key); + } + + keys.shrink_to_fit(); + + if failed > 0 { + tracing::warn!(url=?jwks_url, failed, "could not decode JWKs"); + } + + if keys.is_empty() { + tracing::warn!(url=?jwks_url, "no valid JWKs found inside the response body"); + return None; + } + + Some(jose_jwk::JwkSet { keys }) +} + impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -166,87 +253,15 @@ impl JwkCacheEntryLock { // TODO(conrad): run concurrently // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) for rule in rules { - let req = client.get(rule.jwks_url.clone()); - // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. - // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. - match req.send().await.and_then(|r| { - r.error_for_status() - .map_err(reqwest_middleware::Error::Reqwest) - }) { - // todo: should we re-insert JWKs if we want to keep this JWKs URL? - // I expect these failures would be quite sparse. - Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), - Ok(r) => { - let resp: http::Response = r.into(); - - let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE) - .await - { - Ok(bytes) => bytes, - Err(e) => { - tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); - continue; - } - }; - - match serde_json::from_slice::(&bytes) { - Err(e) => { - tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); - } - Ok(jwks) => { - // size_of::<&RawValue>() == 16 - // size_of::() == 288 - // better to not pre-allocate this as it might be pretty large - especially if it has many - // keys we don't want or need. - // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}` - // this would consume 8MiB just like that! - let mut keys = vec![]; - let mut failed = 0; - for key in jwks.keys { - match serde_json::from_str::(key.get()) { - Ok(key) => { - // if `use` (called `cls` in rust) is specified to be something other than signing, - // we can skip storing it. 
- if key - .prm - .cls - .as_ref() - .is_some_and(|c| *c != jose_jwk::Class::Signing) - { - continue; - } - - keys.push(key); - } - Err(e) => { - tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK"); - failed += 1; - } - } - } - keys.shrink_to_fit(); - - if failed > 0 { - tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs"); - } - - if keys.is_empty() { - tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body"); - continue; - } - - let jwks = jose_jwk::JwkSet { keys }; - key_sets.insert( - rule.id, - KeySet { - jwks, - audience: rule.audience, - role_names: rule.role_names, - }, - ); - } - }; - } + if let Some(jwks) = fetch_jwks(client, rule.jwks_url).await { + key_sets.insert( + rule.id, + KeySet { + jwks, + audience: rule.audience, + role_names: rule.role_names, + }, + ); } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 57ecd5e499..7e1b26a11a 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -6,7 +6,6 @@ pub mod local; use std::net::IpAddr; use std::sync::Arc; -use std::time::Duration; pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; @@ -30,7 +29,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; -use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{scram, stream}; @@ -192,21 +191,6 @@ impl MaskedIp { // This can't be just per IP because that would limit some PaaS that share IP addresses pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; -impl RateBucketInfo { - /// All of these are per endpoint-maskedip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 1000mcpus total per endpoint-ip pair - /// * 4096000 requests per second with 1 hash rounds. - /// * 1000 requests per second with 4096 hash rounds. - /// * 6.8 requests per second with 600000 hash rounds. 
- pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(1000 * 4096, Duration::from_secs(1)), - Self::new(600 * 4096, Duration::from_secs(60)), - Self::new(300 * 4096, Duration::from_secs(600)), - ]; -} - impl AuthenticationConfig { pub(crate) fn check_rate_limit( &self, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 45fbe4a398..a935378162 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -428,8 +428,9 @@ async fn main() -> anyhow::Result<()> { )?))), None => None, }; + let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, + Option>>, >::new( cancel_map.clone(), redis_publisher, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 3ad2d55b53..4b72a66e63 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,16 +10,23 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::{debug, info}; use uuid::Uuid; +use crate::auth::{check_peer_addr_is_in_list, IpPattern}; use crate::error::ReportableError; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use std::net::IpAddr; + +use ipnet::{IpNet, Ipv4Net, Ipv6Net}; pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; +type IpSubnetKey = IpNet; + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. @@ -29,14 +36,23 @@ pub struct CancellationHandler
<P>
{ /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. from: CancellationSource, + // rate limiter of cancellation requests + limiter: Arc>>, } #[derive(Debug, Error)] pub(crate) enum CancelError { #[error("{0}")] IO(#[from] std::io::Error), + #[error("{0}")] Postgres(#[from] tokio_postgres::Error), + + #[error("rate limit exceeded")] + RateLimit, + + #[error("IP is not allowed")] + IpNotAllowed, } impl ReportableError for CancelError { @@ -47,6 +63,8 @@ impl ReportableError for CancelError { crate::error::ErrorKind::Postgres } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + CancelError::RateLimit => crate::error::ErrorKind::RateLimit, + CancelError::IpNotAllowed => crate::error::ErrorKind::User, } } } @@ -79,13 +97,36 @@ impl CancellationHandler
<P>
{ cancellation_handler: self, } } + /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. + /// check_allowed - if true, check if the IP is allowed to cancel the query pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, + peer_addr: &IpAddr, + check_allowed: bool, ) -> Result<(), CancelError> { + // TODO: check for unspecified address is only for backward compatibility, should be removed + if !peer_addr.is_unspecified() { + let subnet_key = match *peer_addr { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock().unwrap().check(subnet_key, 1) { + tracing::debug!("Rate limit exceeded. Skipping cancellation message"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); + } + } + // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); @@ -96,7 +137,13 @@ impl CancellationHandler
<P>
{ source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - match self.client.try_publish(key, session_id).await { + + if session_id == Uuid::nil() { + // was already published, do not publish it again + return Ok(()); + } + + match self.client.try_publish(key, session_id, *peer_addr).await { Ok(()) => {} // do nothing Err(e) => { return Err(CancelError::IO(std::io::Error::new( @@ -107,6 +154,13 @@ impl CancellationHandler
<P>
{ } return Ok(()); }; + + if check_allowed + && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + { + return Err(CancelError::IpNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total @@ -135,13 +189,29 @@ impl CancellationHandler<()> { map, client: (), from, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), } } } impl CancellationHandler>>> { pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { - Self { map, client, from } + Self { + map, + client, + from, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } } } @@ -152,13 +222,19 @@ impl CancellationHandler>>> { pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, + ip_allowlist: Vec, } impl CancelClosure { - pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self { + pub(crate) fn new( + socket_addr: SocketAddr, + cancel_token: CancelToken, + ip_allowlist: Vec, + ) -> Self { Self { socket_addr, cancel_token, + ip_allowlist, } } /// Cancels the query running on user's compute node. @@ -168,6 +244,9 @@ impl CancelClosure { debug!("query was cancelled"); Ok(()) } + pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { + self.ip_allowlist = ip_allowlist; + } } /// Helper for registering query cancellation tokens. @@ -229,6 +308,8 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), + &("127.0.0.1".parse().unwrap()), + true, ) .await .unwrap(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index e7fbe9ab47..8408d4720b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -342,7 +342,7 @@ impl ConnCfg { // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); let connection = PostgresConnection { stream, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c88b2936db..fbd0c8e5c5 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -156,16 +156,21 @@ pub(crate) async fn handle_client( let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); - let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id()) + .cancel_session( + cancel_key_data, + ctx.session_id(), + &ctx.peer_addr(), + config.authentication_config.ip_allowlist_check_enabled, + ) .await .map(|()| None)?) 
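The cancellation path above keys its rate limiter on the caller's subnet rather than on the exact peer address: IPv4 peers are truncated to a /24 and IPv6 peers to a /64 before the leaky bucket is consulted, so hosts rotating inside one network share a single budget. Below is a minimal sketch of that keying, assuming only the `ipnet` crate that the change already imports; the `ToyLimiter` map is an illustrative stand-in for the proxy's `LeakyBucketRateLimiter`, not its real implementation.

```rust
use std::collections::HashMap;
use std::net::IpAddr;

use ipnet::{IpNet, Ipv4Net, Ipv6Net};

/// Collapse a peer address into the subnet used as the rate-limit key:
/// /24 for IPv4, /64 for IPv6, with host bits zeroed by `trunc()`.
fn subnet_key(peer_addr: IpAddr) -> IpNet {
    match peer_addr {
        IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new(ip, 24).expect("/24 is valid").trunc()),
        IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new(ip, 64).expect("/64 is valid").trunc()),
    }
}

/// Illustrative stand-in for a leaky-bucket limiter: at most `limit`
/// cancel requests are allowed per subnet over the map's lifetime.
struct ToyLimiter {
    counts: HashMap<IpNet, u32>,
    limit: u32,
}

impl ToyLimiter {
    fn check(&mut self, key: IpNet) -> bool {
        let n = self.counts.entry(key).or_insert(0);
        *n += 1;
        *n <= self.limit
    }
}

fn main() {
    let mut limiter = ToyLimiter { counts: HashMap::new(), limit: 2 };
    // Two different hosts in 10.0.0.0/24 share one bucket.
    let a: IpAddr = "10.0.0.5".parse().unwrap();
    let b: IpAddr = "10.0.0.200".parse().unwrap();
    assert!(limiter.check(subnet_key(a)));
    assert!(limiter.check(subnet_key(b)));
    // The third request from the same /24 is rejected.
    assert!(!limiter.check(subnet_key(a)));
}
```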
} @@ -174,7 +179,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let user_info = match backend + let (user_info, ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -198,6 +203,8 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; + node.cancel_closure + .set_ip_allowlist(ip_allowlist.unwrap_or_default()); let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f91fcd4120..659c57c865 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -351,6 +351,7 @@ pub enum CancellationSource { pub enum CancellationOutcome { NotFound, Found, + RateLimitExceeded, } #[derive(LabelGroup)] diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 9415b54a4a..5d9468d89a 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -268,12 +268,18 @@ pub(crate) async fn handle_client( let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); + let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { return Ok(cancellation_handler - .cancel_session(cancel_key_data, ctx.session_id()) + .cancel_session( + cancel_key_data, + ctx.session_id(), + &ctx.peer_addr(), + config.authentication_config.ip_allowlist_check_enabled, + ) .await .map(|()| None)?) } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 4259fd04f4..a048721e77 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -14,13 +14,13 @@ use tracing::info; use crate::intern::EndpointIdInt; -pub(crate) struct GlobalRateLimiter { +pub struct GlobalRateLimiter { data: Vec, info: Vec, } impl GlobalRateLimiter { - pub(crate) fn new(info: Vec) -> Self { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -34,7 +34,7 @@ impl GlobalRateLimiter { } /// Check that number of connections is below `max_rps` rps. - pub(crate) fn check(&mut self) -> bool { + pub fn check(&mut self) -> bool { let now = Instant::now(); let should_allow_request = self @@ -137,6 +137,19 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. 
+ pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; + pub fn rps(&self) -> f64 { (self.max_rpi as f64) / self.interval.as_secs_f64() } diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 3ae2ecaf8f..5f90102da3 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -8,5 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd; pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub(crate) use limiter::GlobalRateLimiter; -pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7392b0d316..633a2f1b81 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use core::net::IpAddr; use pq_proto::CancelKeyData; use redis::AsyncCommands; use tokio::sync::Mutex; @@ -15,6 +16,7 @@ pub trait CancellationPublisherMut: Send + Sync + 'static { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()>; } @@ -24,6 +26,7 @@ pub trait CancellationPublisher: Send + Sync + 'static { &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()>; } @@ -32,6 +35,7 @@ impl CancellationPublisher for () { &self, _cancel_key_data: CancelKeyData, _session_id: Uuid, + _peer_addr: IpAddr, ) -> anyhow::Result<()> { Ok(()) } @@ -42,8 +46,10 @@ impl CancellationPublisherMut for P { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { -

<P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await +

<P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id, peer_addr) + .await } } @@ -52,9 +58,10 @@ impl<P: CancellationPublisher> CancellationPublisher for Option<P>
{ &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { if let Some(p) = self { - p.try_publish(cancel_key_data, session_id).await + p.try_publish(cancel_key_data, session_id, peer_addr).await } else { Ok(()) } @@ -66,10 +73,11 @@ impl CancellationPublisher for Arc> { &self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { self.lock() .await - .try_publish(cancel_key_data, session_id) + .try_publish(cancel_key_data, session_id, peer_addr) .await } } @@ -97,11 +105,13 @@ impl RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { let payload = serde_json::to_string(&Notification::Cancel(CancelSession { region_id: Some(self.region_id.clone()), cancel_key_data, session_id, + peer_addr: Some(peer_addr), }))?; let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; Ok(()) @@ -120,13 +130,14 @@ impl RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { // TODO: review redundant error duplication logs. if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping cancellation message"); return Err(anyhow::anyhow!("Rate limit exceeded")); } - match self.publish(cancel_key_data, session_id).await { + match self.publish(cancel_key_data, session_id, peer_addr).await { Ok(()) => return Ok(()), Err(e) => { tracing::error!("failed to publish a message: {e}"); @@ -134,7 +145,7 @@ impl RedisPublisherClient { } tracing::info!("Publisher is disconnected. Reconnectiong..."); self.try_connect().await?; - self.publish(cancel_key_data, session_id).await + self.publish(cancel_key_data, session_id, peer_addr).await } } @@ -143,9 +154,13 @@ impl CancellationPublisherMut for RedisPublisherClient { &mut self, cancel_key_data: CancelKeyData, session_id: Uuid, + peer_addr: IpAddr, ) -> anyhow::Result<()> { tracing::info!("publishing cancellation key to Redis"); - match self.try_publish_internal(cancel_key_data, session_id).await { + match self + .try_publish_internal(cancel_key_data, session_id, peer_addr) + .await + { Ok(()) => { tracing::debug!("cancellation key successfuly published to Redis"); Ok(()) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 62e7b1b565..65008ae943 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -60,6 +60,7 @@ pub(crate) struct CancelSession { pub(crate) region_id: Option, pub(crate) cancel_key_data: CancelKeyData, pub(crate) session_id: Uuid, + pub(crate) peer_addr: Option, } fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result @@ -137,10 +138,20 @@ impl MessageHandler { return Ok(()); } } + + // TODO: Remove unspecified peer_addr after the complete migration to the new format + let peer_addr = cancel_session + .peer_addr + .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. 
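The notification payload gains an optional `peer_addr`, and the handler treats a missing address as unspecified while passing `peer_addr.is_some()` as the allowlist flag, so messages from proxies that predate the field are still honored without an IP check (see the `cancel_session` dispatch just below). A hedged sketch of how such a payload could round-trip through serde, assuming `serde` (with derive) and `serde_json` as dependencies; the struct name `CancelSessionMsg` and the plain `u64` standing in for `CancelKeyData` are made up for illustration, and the real message also carries a region id and a session UUID.

```rust
use std::net::{IpAddr, Ipv4Addr};

use serde::{Deserialize, Serialize};

/// Simplified stand-in for the cancel notification.
#[derive(Serialize, Deserialize, Debug)]
struct CancelSessionMsg {
    cancel_key: u64,           // stand-in for CancelKeyData
    peer_addr: Option<IpAddr>, // new field; older proxies do not send it
}

fn main() -> serde_json::Result<()> {
    // A payload without a usable peer address: fall back to the unspecified
    // address and skip the IP-allowlist check.
    let old: CancelSessionMsg = serde_json::from_str(r#"{"cancel_key":42,"peer_addr":null}"#)?;
    let check_allowed = old.peer_addr.is_some();
    let peer = old.peer_addr.unwrap_or(IpAddr::V4(Ipv4Addr::UNSPECIFIED));
    assert!(!check_allowed);
    println!("peer={peer}, check_allowed={check_allowed}");

    // A payload that does carry the client address that issued the cancel.
    let new = CancelSessionMsg {
        cancel_key: 42,
        peer_addr: Some("203.0.113.7".parse().unwrap()),
    };
    println!("{}", serde_json::to_string(&new)?);
    Ok(())
}
```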
match self .cancellation_handler - .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) + .cancel_session( + cancel_session.cancel_key_data, + uuid::Uuid::nil(), + &peer_addr, + cancel_session.peer_addr.is_some(), + ) .await { Ok(()) => {} @@ -335,6 +346,7 @@ mod tests { cancel_key_data, region_id: None, session_id: uuid, + peer_addr: None, }); let text = serde_json::to_string(&msg)?; let result: Notification = serde_json::from_str(&text)?; @@ -344,6 +356,7 @@ mod tests { cancel_key_data, region_id: Some("region".to_string()), session_id: uuid, + peer_addr: None, }); let text = serde_json::to_string(&msg)?; let result: Notification = serde_json::from_str(&text)?; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 03b37bccd5..afd93d02f0 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -14,7 +14,7 @@ use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; @@ -980,10 +980,11 @@ async fn query_to_json( current_size: &mut usize, parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { - info!("executing query"); + let query_start = Instant::now(); + let query_params = data.params; let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); - info!("finished executing query"); + let query_acknowledged = Instant::now(); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. 
Also check that the response is not too @@ -1002,6 +1003,7 @@ async fn query_to_json( } } + let query_resp_end = Instant::now(); let ready = row_stream.ready_status(); // grab the command tag and number of rows affected @@ -1021,7 +1023,9 @@ async fn query_to_json( rows = rows.len(), ?ready, command_tag, - "finished reading rows" + acknowledgement = ?(query_acknowledged - query_start), + response = ?(query_resp_end - query_start), + "finished executing query" ); let columns_len = row_stream.columns().len(); diff --git a/pyproject.toml b/pyproject.toml index 197946fff8..ccd3ab1864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ authors = [] package-mode = false [tool.poetry.dependencies] -python = "^3.9" +python = "^3.11" pytest = "^7.4.4" psycopg2-binary = "^2.9.10" typing-extensions = "^4.6.1" @@ -51,7 +51,7 @@ testcontainers = "^4.8.1" jsonnet = "^0.20.0" [tool.poetry.group.dev.dependencies] -mypy = "==1.3.0" +mypy = "==1.13.0" ruff = "^0.7.0" [build-system] @@ -89,7 +89,7 @@ module = [ ignore_missing_imports = true [tool.ruff] -target-version = "py39" +target-version = "py311" extend-exclude = [ "vendor/", "target/", @@ -108,6 +108,3 @@ select = [ "B", # bugbear "UP", # pyupgrade ] - -[tool.ruff.lint.pyupgrade] -keep-runtime-typing = true # Remove this stanza when we require Python 3.10 diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 85561e4aff..ab77b63d54 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +pprof.workspace = true rand.workspace = true regex.workspace = true scopeguard.workspace = true diff --git a/safekeeper/benches/README.md b/safekeeper/benches/README.md index 4119cc8d6e..d73fbccf05 100644 --- a/safekeeper/benches/README.md +++ b/safekeeper/benches/README.md @@ -14,6 +14,10 @@ cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false # List available benchmarks. cargo bench --package safekeeper --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index e32d7526ca..c637b4fb24 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -10,6 +10,7 @@ use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use pprof::criterion::{Output, PProfProfiler}; use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, @@ -24,8 +25,9 @@ const GB: usize = 1024 * MB; // Register benchmarks with Criterion. 
criterion_group!( - benches, - bench_process_msg, + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, bench_file_write diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index df68f8a68e..28294abdb9 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,7 +1,6 @@ -use hyper::{Body, Request, Response, StatusCode, Uri}; -use once_cell::sync::Lazy; +use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; @@ -14,7 +13,9 @@ use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; +use utils::http::endpoint::{ + profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, +}; use utils::http::request::parse_query_param; use postgres_ffi::WAL_SEGMENT_SIZE; @@ -572,14 +573,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - #[allow(clippy::mutable_key_type)] - static ALLOWLIST_ROUTES: Lazy> = Lazy::new(|| { - ["/v1/status", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect() - }); - if ALLOWLIST_ROUTES.contains(request.uri()) { + const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { // Option> is always provided as data below, hence unwrap(). 
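Switching `criterion_group!` to its `name`/`config`/`targets` form is what makes room for the pprof profiler shown above. The following is a minimal, self-contained bench file using the same wiring; the file name `benches/profile_demo.rs` and the `noop` benchmark are hypothetical, and it assumes `criterion` plus `pprof` (with its `criterion` and `flamegraph` features) as dev-dependencies. Running with `--profile-time <seconds>` skips Criterion's normal analysis and writes `flamegraph.svg` under `target/criterion/*/profile/`.

```rust
// benches/profile_demo.rs (hypothetical file name)
use criterion::{criterion_group, criterion_main, Criterion};
use pprof::criterion::{Output, PProfProfiler};

fn bench_noop(c: &mut Criterion) {
    // Trivial benchmark body; black_box keeps the work from being optimized away.
    c.bench_function("noop", |b| b.iter(|| std::hint::black_box(2 + 2)));
}

criterion_group!(
    name = benches;
    // Sample at 100 Hz and emit a flamegraph per benchmark when the run is
    // started with `cargo bench ... -- --profile-time <seconds>`.
    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
    targets = bench_noop
);
criterion_main!(benches);
```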
@@ -598,6 +593,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 9312f8b3e7..3fb668ed2d 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -14,7 +14,7 @@ import psycopg2.extras import toml if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any FLAKY_TESTS_QUERY = """ SELECT @@ -65,7 +65,7 @@ def main(args: argparse.Namespace): pageserver_virtual_file_io_engine_parameter = "" # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: """Duplicated from parametrize.py""" toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py index a4fd3f6132..835e28c5d6 100644 --- a/scripts/force_layer_download.py +++ b/scripts/force_layer_download.py @@ -194,9 +194,11 @@ async def main_impl(args, report_out, client: Client): tenant_ids = await client.get_tenant_ids() get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) - assert len(tenant_ids) == len(gathered) tenant_and_timline_ids = [] - for tid, tlids in zip(tenant_ids, gathered): + for tid, tlids in zip(tenant_ids, gathered, strict=True): + # TODO: add error handling if tlids isinstance(Exception) + assert isinstance(tlids, list) + for tlid in tlids: tenant_and_timline_ids.append((tid, tlid)) elif len(comps) == 1: diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index e0dd0a7189..064c516718 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -11,7 +11,7 @@ import re import sys from contextlib import contextmanager from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import UTC, datetime from pathlib import Path import backoff @@ -31,6 +31,7 @@ CREATE TABLE IF NOT EXISTS results ( duration INT NOT NULL, flaky BOOLEAN NOT NULL, arch arch DEFAULT 'X64', + lfc BOOLEAN DEFAULT false NOT NULL, build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -54,6 +55,7 @@ class Row: duration: int flaky: bool arch: str + lfc: bool build_type: str pg_version: int run_id: int @@ -132,6 +134,7 @@ def ingest_test_result( if p["name"].startswith("__") } arch = parameters.get("arch", "UNKNOWN").strip("'") + lfc = parameters.get("lfc", "False") == "True" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} @@ -140,11 +143,12 @@ def ingest_test_result( suite=labels["suite"], name=unparametrized_name, status=test["status"], - started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=timezone.utc), - stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc), + started_at=datetime.fromtimestamp(test["time"]["start"] / 
1000, tz=UTC), + stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=UTC), duration=test["time"]["duration"], flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, + lfc=lfc, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/test_runner/README.md b/test_runner/README.md index 55d8d2faa9..f342ef8aaa 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -113,7 +113,7 @@ The test suite has a Python enum with equal name but different meaning: ```python @enum.unique -class RemoteStorageKind(str, enum.Enum): +class RemoteStorageKind(StrEnum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index be16be81de..8382ce20b3 100644 --- a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from enum import Enum +from enum import StrEnum from typing import Any import jwt @@ -37,8 +37,7 @@ class AuthKeys: return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TokenScope(str, Enum): +class TokenScope(StrEnum): ADMIN = "admin" PAGE_SERVER_API = "pageserverapi" GENERATIONS_API = "generations_api" diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 8e68775471..bb8e75902e 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -9,6 +9,7 @@ import re import timeit from contextlib import contextmanager from datetime import datetime +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING @@ -24,8 +25,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver if TYPE_CHECKING: - from collections.abc import Iterator, Mapping - from typing import Callable, Optional + from collections.abc import Callable, Iterator, Mapping """ @@ -61,7 +61,7 @@ class PgBenchRunResult: number_of_threads: int number_of_transactions_actually_processed: int latency_average: float - latency_stddev: Optional[float] + latency_stddev: float | None tps: float run_duration: float run_start_timestamp: int @@ -171,14 +171,14 @@ _PGBENCH_INIT_EXTRACTORS: Mapping[str, re.Pattern[str]] = { @dataclasses.dataclass class PgBenchInitResult: - total: Optional[float] - drop_tables: Optional[float] - create_tables: Optional[float] - client_side_generate: Optional[float] - server_side_generate: Optional[float] - vacuum: Optional[float] - primary_keys: Optional[float] - foreign_keys: Optional[float] + total: float | None + drop_tables: float | None + create_tables: float | None + client_side_generate: float | None + server_side_generate: float | None + vacuum: float | None + primary_keys: float | None + foreign_keys: float | None duration: float start_timestamp: int end_timestamp: int @@ -196,7 +196,7 @@ class PgBenchInitResult: last_line = stderr.splitlines()[-1] - timings: dict[str, Optional[float]] = {} + timings: dict[str, float | None] = {} last_line_items = re.split(r"\(|\)|,", last_line) for item in last_line_items: for key, regex in _PGBENCH_INIT_EXTRACTORS.items(): @@ -227,7 +227,7 @@ class PgBenchInitResult: @enum.unique -class MetricReport(str, enum.Enum): # str is a hack to make it json serializable +class MetricReport(StrEnum): # str is a hack to make it json serializable # this means that this is a constant test parameter # 
like number of transactions, or number of clients TEST_PARAM = "test_param" @@ -256,9 +256,8 @@ class NeonBenchmarker: metric_value: float, unit: str, report: MetricReport, - labels: Optional[ - dict[str, str] - ] = None, # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric + # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric + labels: dict[str, str] | None = None, ): """ Record a benchmark result. @@ -412,7 +411,7 @@ class NeonBenchmarker: self, pageserver: NeonPageserver, metric_name: str, - label_filters: Optional[dict[str, str]] = None, + label_filters: dict[str, str] | None = None, ) -> int: """Fetch the value of given int counter from pageserver metrics.""" all_metrics = pageserver.http_client().get_metrics() diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 0ea7148f50..6c22b31e00 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -2,14 +2,14 @@ from __future__ import annotations import random from dataclasses import dataclass -from enum import Enum +from enum import StrEnum from functools import total_ordering from typing import TYPE_CHECKING, TypeVar from typing_extensions import override if TYPE_CHECKING: - from typing import Any, Union + from typing import Any T = TypeVar("T", bound="Id") @@ -24,7 +24,7 @@ class Lsn: representation is like "1/0123abcd". See also pg_lsn datatype in Postgres """ - def __init__(self, x: Union[int, str]): + def __init__(self, x: int | str): if isinstance(x, int): self.lsn_int = x else: @@ -67,7 +67,7 @@ class Lsn: return NotImplemented return self.lsn_int - other.lsn_int - def __add__(self, other: Union[int, Lsn]) -> Lsn: + def __add__(self, other: int | Lsn) -> Lsn: if isinstance(other, int): return Lsn(self.lsn_int + other) elif isinstance(other, Lsn): @@ -190,8 +190,23 @@ class TenantTimelineId: ) -# Workaround for compat with python 3.9, which does not have `typing.Self` -TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") +@dataclass +class ShardIndex: + shard_number: int + shard_count: int + + # cf impl Display for ShardIndex + @override + def __str__(self) -> str: + return f"{self.shard_number:02x}{self.shard_count:02x}" + + @classmethod + def parse(cls: type[ShardIndex], input: str) -> ShardIndex: + assert len(input) == 4 + return cls( + shard_number=int(input[0:2], 16), + shard_count=int(input[2:4], 16), + ) class TenantShardId: @@ -202,7 +217,7 @@ class TenantShardId: assert self.shard_number < self.shard_count or self.shard_count == 0 @classmethod - def parse(cls: type[TTenantShardId], input: str) -> TTenantShardId: + def parse(cls: type[TenantShardId], input: str) -> TenantShardId: if len(input) == 32: return cls( tenant_id=TenantId(input), @@ -226,6 +241,10 @@ class TenantShardId: # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) return str(self.tenant_id) + @property + def shard_index(self) -> ShardIndex: + return ShardIndex(self.shard_number, self.shard_count) + @override def __repr__(self): return self.__str__() @@ -249,7 +268,6 @@ class TenantShardId: return hash(self._tuple()) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TimelineArchivalState(str, Enum): +class TimelineArchivalState(StrEnum): ARCHIVED = "Archived" UNARCHIVED = "Unarchived" diff --git a/test_runner/fixtures/compare_fixtures.py 
b/test_runner/fixtures/compare_fixtures.py index 85b6e7a3b8..c0892399bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -99,7 +99,7 @@ class PgCompare(ABC): assert row is not None assert len(row) == len(pg_stat.columns) - for col, val in zip(pg_stat.columns, row): + for col, val in zip(pg_stat.columns, row, strict=False): results[f"{pg_stat.table}.{col}"] = int(val) return results diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 6354b7f833..33f01f80fb 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -12,7 +12,8 @@ from fixtures.common_types import TenantId from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Callable, Optional + from collections.abc import Callable + from typing import Any class ComputeReconfigure: @@ -20,12 +21,12 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads: dict[TenantId, Any] = {} - self.on_notify: Optional[Callable[[Any], None]] = None + self.on_notify: Callable[[Any], None] | None = None def register_workload(self, workload: Any): self.workloads[workload.tenant_id] = workload - def register_on_notify(self, fn: Optional[Callable[[Any], None]]): + def register_on_notify(self, fn: Callable[[Any], None] | None): """ Add some extra work during a notification, like sleeping to slow things down, or logging what was notified. @@ -68,7 +69,7 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] return Response(status=200) diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index e890b2bcf1..3e35af3b5b 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -31,7 +31,7 @@ from h2.settings import SettingCodes from typing_extensions import override if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any RequestData = collections.namedtuple("RequestData", ["headers", "data"]) @@ -49,7 +49,7 @@ class H2Protocol(asyncio.Protocol): def __init__(self): config = H2Configuration(client_side=False, header_encoding="utf-8") self.conn = H2Connection(config=config) - self.transport: Optional[asyncio.Transport] = None + self.transport: asyncio.Transport | None = None self.stream_data: dict[int, RequestData] = {} self.flow_control_futures: dict[int, asyncio.Future[Any]] = {} @@ -61,7 +61,7 @@ class H2Protocol(asyncio.Protocol): self.transport.write(self.conn.data_to_send()) @override - def connection_lost(self, exc: Optional[Exception]): + def connection_lost(self, exc: Exception | None): for future in self.flow_control_futures.values(): future.cancel() self.flow_control_futures = {} diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 39c8f70a9c..330f007a77 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,16 +1,12 @@ from __future__ import annotations from collections import defaultdict -from typing import 
TYPE_CHECKING from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample from fixtures.log_helper import log -if TYPE_CHECKING: - from typing import Optional - class Metrics: metrics: dict[str, list[Sample]] @@ -20,7 +16,7 @@ class Metrics: self.metrics = defaultdict(list) self.name = name - def query_all(self, name: str, filter: Optional[dict[str, str]] = None) -> list[Sample]: + def query_all(self, name: str, filter: dict[str, str] | None = None) -> list[Sample]: filter = filter or {} res: list[Sample] = [] @@ -32,7 +28,7 @@ class Metrics: pass return res - def query_one(self, name: str, filter: Optional[dict[str, str]] = None) -> Sample: + def query_one(self, name: str, filter: dict[str, str] | None = None) -> Sample: res = self.query_all(name, filter or {}) assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" return res[0] @@ -47,9 +43,7 @@ class MetricsGetter: def get_metrics(self) -> Metrics: raise NotImplementedError() - def get_metric_value( - self, name: str, filter: Optional[dict[str, str]] = None - ) -> Optional[float]: + def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None: metrics = self.get_metrics() results = metrics.query_all(name, filter=filter) if not results: @@ -59,7 +53,7 @@ class MetricsGetter: return results[0].value def get_metrics_values( - self, names: list[str], filter: Optional[dict[str, str]] = None, absence_ok: bool = False + self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False ) -> dict[str, float]: """ When fetching multiple named metrics, it is more efficient to use this diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 9de6681beb..df80f0683c 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -8,7 +8,7 @@ import requests from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Literal, Optional + from typing import Any, Literal from fixtures.pg_version import PgVersion @@ -40,11 +40,11 @@ class NeonAPI: def create_project( self, - pg_version: Optional[PgVersion] = None, - name: Optional[str] = None, - branch_name: Optional[str] = None, - branch_role_name: Optional[str] = None, - branch_database_name: Optional[str] = None, + pg_version: PgVersion | None = None, + name: str | None = None, + branch_name: str | None = None, + branch_role_name: str | None = None, + branch_database_name: str | None = None, ) -> dict[str, Any]: data: dict[str, Any] = { "project": { @@ -179,8 +179,8 @@ class NeonAPI: def get_connection_uri( self, project_id: str, - branch_id: Optional[str] = None, - endpoint_id: Optional[str] = None, + branch_id: str | None = None, + endpoint_id: str | None = None, database_name: str = "neondb", role_name: str = "neondb_owner", pooled: bool = True, @@ -249,7 +249,7 @@ class NeonAPI: @final class NeonApiEndpoint: - def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: str | None): self.neon_api = neon_api self.project_id: str self.endpoint_id: str diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index d220ea57a2..a85a191455 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -20,13 +20,9 @@ from fixtures.pg_version import PgVersion if TYPE_CHECKING: from typing import ( Any, - Optional, - TypeVar, cast, ) - T = TypeVar("T") - # 
Used to be an ABC. abc.ABC removed due to linter without name change. class AbstractNeonCli: @@ -36,7 +32,7 @@ class AbstractNeonCli: Do not use directly, use specific subclasses instead. """ - def __init__(self, extra_env: Optional[dict[str, str]], binpath: Path): + def __init__(self, extra_env: dict[str, str] | None, binpath: Path): self.extra_env = extra_env self.binpath = binpath @@ -45,7 +41,7 @@ class AbstractNeonCli: def raw_cli( self, arguments: list[str], - extra_env_vars: Optional[dict[str, str]] = None, + extra_env_vars: dict[str, str] | None = None, check_return_code=True, timeout=None, ) -> subprocess.CompletedProcess[str]: @@ -173,7 +169,7 @@ class NeonLocalCli(AbstractNeonCli): def __init__( self, - extra_env: Optional[dict[str, str]], + extra_env: dict[str, str] | None, binpath: Path, repo_dir: Path, pg_distrib_dir: Path, @@ -195,10 +191,10 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, timeline_id: TimelineId, pg_version: PgVersion, - conf: Optional[dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, + conf: dict[str, Any] | None = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + placement_policy: str | None = None, set_default: bool = False, ): """ @@ -302,8 +298,8 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, timeline_id: TimelineId, new_branch_name, - ancestor_branch_name: Optional[str] = None, - ancestor_start_lsn: Optional[Lsn] = None, + ancestor_branch_name: str | None = None, + ancestor_start_lsn: Lsn | None = None, ): cmd = [ "timeline", @@ -331,8 +327,8 @@ class NeonLocalCli(AbstractNeonCli): base_lsn: Lsn, base_tarfile: Path, pg_version: PgVersion, - end_lsn: Optional[Lsn] = None, - wal_tarfile: Optional[Path] = None, + end_lsn: Lsn | None = None, + wal_tarfile: Path | None = None, ): cmd = [ "timeline", @@ -380,7 +376,7 @@ class NeonLocalCli(AbstractNeonCli): def init( self, init_config: dict[str, Any], - force: Optional[str] = None, + force: str | None = None, ) -> subprocess.CompletedProcess[str]: with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: init_config_tmpfile.write(toml.dumps(init_config)) @@ -400,9 +396,9 @@ class NeonLocalCli(AbstractNeonCli): def storage_controller_start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None: @@ -413,7 +409,7 @@ class NeonLocalCli(AbstractNeonCli): cmd.append(f"--base-port={base_port}") return self.raw_cli(cmd) - def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): + def storage_controller_stop(self, immediate: bool, instance_id: int | None = None): cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) @@ -424,8 +420,8 @@ class NeonLocalCli(AbstractNeonCli): def pageserver_start( self, id: int, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: start_args = ["pageserver", "start", f"--id={id}"] if timeout_in_seconds is not None: @@ -442,9 +438,9 @@ class NeonLocalCli(AbstractNeonCli): def safekeeper_start( self, id: int, - extra_opts: Optional[list[str]] = None, - 
extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, + extra_opts: list[str] | None = None, + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: if extra_opts is not None: extra_opts = [f"-e={opt}" for opt in extra_opts] @@ -457,7 +453,7 @@ class NeonLocalCli(AbstractNeonCli): ) def safekeeper_stop( - self, id: Optional[int] = None, immediate=False + self, id: int | None = None, immediate=False ) -> subprocess.CompletedProcess[str]: args = ["safekeeper", "stop"] if id is not None: @@ -467,7 +463,7 @@ class NeonLocalCli(AbstractNeonCli): return self.raw_cli(args) def storage_broker_start( - self, timeout_in_seconds: Optional[int] = None + self, timeout_in_seconds: int | None = None ) -> subprocess.CompletedProcess[str]: cmd = ["storage_broker", "start"] if timeout_in_seconds is not None: @@ -485,10 +481,10 @@ class NeonLocalCli(AbstractNeonCli): http_port: int, tenant_id: TenantId, pg_version: PgVersion, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + pageserver_id: int | None = None, allow_multiple=False, ) -> subprocess.CompletedProcess[str]: args = [ @@ -523,11 +519,11 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_start( self, endpoint_id: str, - safekeepers: Optional[list[int]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, + safekeepers: list[int] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, allow_multiple=False, - basebackup_request_tries: Optional[int] = None, + basebackup_request_tries: int | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -555,9 +551,9 @@ class NeonLocalCli(AbstractNeonCli): def endpoint_reconfigure( self, endpoint_id: str, - tenant_id: Optional[TenantId] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[list[int]] = None, + tenant_id: TenantId | None = None, + pageserver_id: int | None = None, + safekeepers: list[int] | None = None, check_return_code=True, ) -> subprocess.CompletedProcess[str]: args = ["endpoint", "reconfigure", endpoint_id] @@ -574,7 +570,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, destroy=False, check_return_code=True, - mode: Optional[str] = None, + mode: str | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 205a47a9d5..07d442b4a6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,7 +17,7 @@ from collections.abc import Iterable, Iterator from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime -from enum import Enum +from enum import StrEnum from functools import cached_property from pathlib import Path from types import TracebackType @@ -90,10 +90,12 @@ from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, + USE_LFC, allure_add_grafana_links, assert_no_errors, get_dir_size, print_gc_result, + size_to_bytes, subprocess_capture, wait_until, ) @@ -101,13 +103,8 @@ from fixtures.utils import ( from .neon_api import NeonAPI, NeonApiEndpoint if TYPE_CHECKING: - from typing import ( - Any, - Callable, - Optional, - TypeVar, - Union, - 
) + from collections.abc import Callable + from typing import Any, Self, TypeVar from fixtures.paths import SnapshotDirLocked @@ -338,10 +335,10 @@ class NeonEnvBuilder: top_output_dir: Path, test_output_dir: Path, combination, - test_overlay_dir: Optional[Path] = None, - pageserver_remote_storage: Optional[RemoteStorage] = None, + test_overlay_dir: Path | None = None, + pageserver_remote_storage: RemoteStorage | None = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` - pageserver_config_override: Optional[str | Callable[[dict[str, Any]], None]] = None, + pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -349,16 +346,16 @@ class NeonEnvBuilder: # fsync is disabled by default to make the tests go faster safekeepers_enable_fsync: bool = False, auth_enabled: bool = False, - rust_log_override: Optional[str] = None, + rust_log_override: str | None = None, default_branch_name: str = DEFAULT_BRANCH_NAME, preserve_database_files: bool = False, - initial_tenant: Optional[TenantId] = None, - initial_timeline: Optional[TimelineId] = None, - pageserver_virtual_file_io_engine: Optional[str] = None, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, - safekeeper_extra_opts: Optional[list[str]] = None, - storage_controller_port_override: Optional[int] = None, - pageserver_virtual_file_io_mode: Optional[str] = None, + initial_tenant: TenantId | None = None, + initial_timeline: TimelineId | None = None, + pageserver_virtual_file_io_engine: str | None = None, + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = None, + safekeeper_extra_opts: list[str] | None = None, + storage_controller_port_override: int | None = None, + pageserver_virtual_file_io_mode: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -367,7 +364,7 @@ class NeonEnvBuilder: # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage # Safekeepers remote storage - self.safekeepers_remote_storage: Optional[RemoteStorage] = None + self.safekeepers_remote_storage: RemoteStorage | None = None self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server @@ -378,7 +375,7 @@ class NeonEnvBuilder: self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name - self.env: Optional[NeonEnv] = None + self.env: NeonEnv | None = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath self.neon_local_binpath = neon_binpath @@ -391,14 +388,14 @@ class NeonEnvBuilder: self.test_output_dir = test_output_dir self.test_overlay_dir = test_overlay_dir self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] - self.config_init_force: Optional[str] = None + self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: Optional[str] = None - self.storage_controller_config: Optional[dict[Any, Any]] = None + self.control_plane_compute_hook_api: str | None = None + self.storage_controller_config: dict[Any, Any] | None = None - self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: 
Optional[dict[str, Any]] = ( + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: @@ -440,10 +437,10 @@ class NeonEnvBuilder: def init_start( self, - initial_tenant_conf: Optional[dict[str, Any]] = None, + initial_tenant_conf: dict[str, Any] | None = None, default_remote_storage_if_missing: bool = True, - initial_tenant_shard_count: Optional[int] = None, - initial_tenant_shard_stripe_size: Optional[int] = None, + initial_tenant_shard_count: int | None = None, + initial_tenant_shard_stripe_size: int | None = None, ) -> NeonEnv: """ Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline. @@ -781,8 +778,8 @@ class NeonEnvBuilder: self, kind: RemoteStorageKind, user: RemoteStorageUser, - bucket_name: Optional[str] = None, - bucket_region: Optional[str] = None, + bucket_name: str | None = None, + bucket_region: str | None = None, ) -> RemoteStorage: ret = kind.configure( self.repo_dir, @@ -840,14 +837,14 @@ class NeonEnvBuilder: if isinstance(x, S3Storage): x.do_cleanup() - def __enter__(self) -> NeonEnvBuilder: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, ): # Stop all the nodes. if self.env: @@ -1136,7 +1133,7 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: Optional[int] = None): + def start(self, timeout_in_seconds: int | None = None): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1150,21 +1147,19 @@ class NeonEnv: with concurrent.futures.ThreadPoolExecutor( max_workers=2 + len(self.pageservers) + len(self.safekeepers) ) as executor: - futs.append( - executor.submit(lambda: self.broker.start() or None) - ) # The `or None` is for the linter + futs.append(executor.submit(lambda: self.broker.start())) for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) for safekeeper in self.safekeepers: futs.append( executor.submit( - lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] ) ) @@ -1237,7 +1232,7 @@ class NeonEnv: ), "env.pageserver must only be used with single pageserver NeonEnv" return self.pageservers[0] - def get_pageserver(self, id: Optional[int]) -> NeonPageserver: + def get_pageserver(self, id: int | None) -> NeonPageserver: """ Look up a pageserver by its node ID. @@ -1254,7 +1249,7 @@ class NeonEnv: raise RuntimeError(f"Pageserver with ID {id} not found") - def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]): + def get_tenant_pageserver(self, tenant_id: TenantId | TenantShardId): """ Get the NeonPageserver where this tenant shard is currently attached, according to the storage controller. 
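# The executor.submit(lambda ps=pageserver: ps.start(...)) calls in the hunk above bind the
# loop variable as a lambda default so each submitted task captures its own pageserver or
# safekeeper; the added `# type: ignore[misc]` presumably quiets mypy about the untyped
# lambda default. A minimal standalone sketch of why the default-argument binding matters
# (illustrative only, not Neon fixture code):
from concurrent.futures import ThreadPoolExecutor


def demo() -> None:
    names = ["ps1", "ps2", "ps3"]
    with ThreadPoolExecutor(max_workers=3) as executor:
        # Late binding: every lambda closes over the same `name` variable, so a task
        # that runs after the loop has finished sees "ps3".
        late = [executor.submit(lambda: name) for name in names]
        # Default-argument binding (the pattern used above): `n=name` snapshots the
        # current value at submission time.
        bound = [executor.submit(lambda n=name: n) for name in names]
    assert [f.result() for f in bound] == ["ps1", "ps2", "ps3"]
    # [f.result() for f in late] is timing-dependent and may repeat "ps3".


if __name__ == "__main__":
    demo()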
@@ -1316,12 +1311,12 @@ class NeonEnv: def create_tenant( self, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - conf: Optional[dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, + conf: dict[str, Any] | None = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + placement_policy: str | None = None, set_default: bool = False, ) -> tuple[TenantId, TimelineId]: """ @@ -1343,7 +1338,7 @@ class NeonEnv: return tenant_id, timeline_id - def config_tenant(self, tenant_id: Optional[TenantId], conf: dict[str, str]): + def config_tenant(self, tenant_id: TenantId | None, conf: dict[str, str]): """ Update tenant config. """ @@ -1353,10 +1348,10 @@ class NeonEnv: def create_branch( self, new_branch_name: str = DEFAULT_BRANCH_NAME, - tenant_id: Optional[TenantId] = None, - ancestor_branch_name: Optional[str] = None, - ancestor_start_lsn: Optional[Lsn] = None, - new_timeline_id: Optional[TimelineId] = None, + tenant_id: TenantId | None = None, + ancestor_branch_name: str | None = None, + ancestor_start_lsn: Lsn | None = None, + new_timeline_id: TimelineId | None = None, ) -> TimelineId: new_timeline_id = new_timeline_id or TimelineId.generate() tenant_id = tenant_id or self.initial_tenant @@ -1370,8 +1365,8 @@ class NeonEnv: def create_timeline( self, new_branch_name: str, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, ) -> TimelineId: timeline_id = timeline_id or TimelineId.generate() tenant_id = tenant_id or self.initial_tenant @@ -1396,8 +1391,8 @@ def neon_simple_env( compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_virtual_file_io_mode: Optional[str], + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, + pageserver_virtual_file_io_mode: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1453,9 +1448,9 @@ def neon_env_builder( test_overlay_dir: Path, top_output_dir: Path, pageserver_virtual_file_io_engine: str, - pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], + pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], - pageserver_virtual_file_io_mode: Optional[str], + pageserver_virtual_file_io_mode: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
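# Much of this patch is a mechanical PEP 604 migration: Optional[X] -> X | None and
# Union[X, Y] -> X | Y. A small self-contained sketch of the equivalence (illustrative
# only, not part of the fixtures):
from __future__ import annotations  # defers evaluation, so the | syntax is safe in annotations

from typing import Optional, Union


def old_style(limit: Optional[int] = None, key: Union[int, str] = 0) -> Optional[str]:
    return None if limit is None else f"{key}:{limit}"


def new_style(limit: int | None = None, key: int | str = 0) -> str | None:
    return None if limit is None else f"{key}:{limit}"


# Evaluating `int | str` as a runtime value (e.g. the LayerName alias later in this
# patch) builds a types.UnionType and does require Python >= 3.10.
assert old_style(3, "a") == new_style(3, "a") == "a:3"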
@@ -1530,7 +1525,7 @@ class LogUtils: def log_contains( self, pattern: str, offset: None | LogCursor = None - ) -> Optional[tuple[str, LogCursor]]: + ) -> tuple[str, LogCursor] | None: """Check that the log contains a line that matches the given regex""" logfile = self.logfile if not logfile.exists(): @@ -1569,14 +1564,13 @@ class StorageControllerApiException(Exception): # See libs/pageserver_api/src/controller_api.rs # for the rust definitions of the enums below -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class PageserverAvailability(str, Enum): +class PageserverAvailability(StrEnum): ACTIVE = "Active" UNAVAILABLE = "Unavailable" OFFLINE = "Offline" -class PageserverSchedulingPolicy(str, Enum): +class PageserverSchedulingPolicy(StrEnum): ACTIVE = "Active" DRAINING = "Draining" FILLING = "Filling" @@ -1584,7 +1578,7 @@ class PageserverSchedulingPolicy(str, Enum): PAUSE_FOR_RESTART = "PauseForRestart" -class StorageControllerLeadershipStatus(str, Enum): +class StorageControllerLeadershipStatus(StrEnum): LEADER = "leader" STEPPED_DOWN = "stepped_down" CANDIDATE = "candidate" @@ -1602,16 +1596,16 @@ class NeonStorageController(MetricsGetter, LogUtils): def start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, + ) -> Self: assert not self.running self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self - def stop(self, immediate: bool = False) -> NeonStorageController: + def stop(self, immediate: bool = False) -> Self: if self.running: self.env.neon_cli.storage_controller_stop(immediate) self.running = False @@ -1673,7 +1667,7 @@ class NeonStorageController(MetricsGetter, LogUtils): return resp - def headers(self, scope: Optional[TokenScope]) -> dict[str, str]: + def headers(self, scope: TokenScope | None) -> dict[str, str]: headers = {} if self.auth_enabled and scope is not None: jwt_token = self.env.auth_keys.generate_token(scope=scope) @@ -1711,9 +1705,9 @@ class NeonStorageController(MetricsGetter, LogUtils): def attach_hook_issue( self, - tenant_shard_id: Union[TenantId, TenantShardId], + tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, - generation_override: Optional[int] = None, + generation_override: int | None = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: @@ -1729,7 +1723,7 @@ class NeonStorageController(MetricsGetter, LogUtils): assert isinstance(gen, int) return gen - def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): + def attach_hook_drop(self, tenant_shard_id: TenantId | TenantShardId): self.request( "POST", f"{self.api}/debug/v1/attach-hook", @@ -1737,7 +1731,7 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) - def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: + def inspect(self, tenant_shard_id: TenantId | TenantShardId) -> tuple[int, int] | None: """ :return: 2-tuple of (generation, pageserver id), or None if unknown """ @@ -1857,10 +1851,10 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_create( self, tenant_id: TenantId, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - tenant_config: Optional[dict[Any, Any]] = None, - placement_policy: 
Optional[Union[dict[Any, Any], str]] = None, + shard_count: int | None = None, + shard_stripe_size: int | None = None, + tenant_config: dict[Any, Any] | None = None, + placement_policy: dict[Any, Any] | str | None = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -1891,6 +1885,20 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"tenant_create success: {response.json()}") + def timeline_create( + self, + tenant_id: TenantId, + body: dict[str, Any], + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline", + json=body, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"timeline_create success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} @@ -1941,7 +1949,7 @@ class NeonStorageController(MetricsGetter, LogUtils): return response.json() def tenant_shard_split( - self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None + self, tenant_id: TenantId, shard_count: int, shard_stripe_size: int | None = None ) -> list[TenantShardId]: response = self.request( "PUT", @@ -2039,8 +2047,8 @@ class NeonStorageController(MetricsGetter, LogUtils): def poll_node_status( self, node_id: int, - desired_availability: Optional[PageserverAvailability], - desired_scheduling_policy: Optional[PageserverSchedulingPolicy], + desired_availability: PageserverAvailability | None, + desired_scheduling_policy: PageserverSchedulingPolicy | None, max_attempts: int, backoff: float, ): @@ -2259,7 +2267,7 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) - def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( "GET", @@ -2285,14 +2293,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] - def __enter__(self) -> NeonStorageController: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop(immediate=True) @@ -2304,10 +2312,10 @@ class NeonProxiedStorageController(NeonStorageController): def start( self, - timeout_in_seconds: Optional[int] = None, - instance_id: Optional[int] = None, - base_port: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + instance_id: int | None = None, + base_port: int | None = None, + ) -> Self: assert instance_id is not None and base_port is not None self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) @@ -2317,7 +2325,7 @@ class NeonProxiedStorageController(NeonStorageController): return self def stop_instance( - self, immediate: bool = False, instance_id: Optional[int] = None + self, immediate: bool = False, instance_id: int | None = None ) -> NeonStorageController: assert instance_id in self.instances if self.instances[instance_id]["running"]: @@ -2327,7 +2335,7 @@ class NeonProxiedStorageController(NeonStorageController): self.running = any(meta["running"] for meta in self.instances.values()) return self - def stop(self, 
immediate: bool = False) -> NeonStorageController: + def stop(self, immediate: bool = False) -> Self: for iid, details in self.instances.items(): if details["running"]: self.env.neon_cli.storage_controller_stop(immediate, iid) @@ -2346,7 +2354,7 @@ class NeonProxiedStorageController(NeonStorageController): def log_contains( self, pattern: str, offset: None | LogCursor = None - ) -> Optional[tuple[str, LogCursor]]: + ) -> tuple[str, LogCursor] | None: raise NotImplementedError() @@ -2393,8 +2401,8 @@ class NeonPageserver(PgProtocol, LogUtils): def timeline_dir( self, - tenant_shard_id: Union[TenantId, TenantShardId], - timeline_id: Optional[TimelineId] = None, + tenant_shard_id: TenantId | TenantShardId, + timeline_id: TimelineId | None = None, ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: @@ -2403,7 +2411,7 @@ class NeonPageserver(PgProtocol, LogUtils): def tenant_dir( self, - tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, + tenant_shard_id: TenantId | TenantShardId | None = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" if tenant_shard_id is None: @@ -2447,9 +2455,9 @@ class NeonPageserver(PgProtocol, LogUtils): def start( self, - extra_env_vars: Optional[dict[str, str]] = None, - timeout_in_seconds: Optional[int] = None, - ) -> NeonPageserver: + extra_env_vars: dict[str, str] | None = None, + timeout_in_seconds: int | None = None, + ) -> Self: """ Start the page server. `overrides` allows to add some config to this pageserver start. @@ -2484,7 +2492,7 @@ class NeonPageserver(PgProtocol, LogUtils): return self - def stop(self, immediate: bool = False) -> NeonPageserver: + def stop(self, immediate: bool = False) -> Self: """ Stop the page server. Returns self. 
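# The start()/stop()/__enter__ signatures in the hunks above now return typing.Self
# (Python 3.11+, or typing_extensions on older interpreters) instead of the concrete
# class name, so fluent calls on a subclass keep the subclass type — relevant here
# because NeonProxiedStorageController overrides NeonStorageController and
# NeonPageserver is used via `with ... as`. Illustrative sketch only:
from __future__ import annotations

from typing import Self


class Service:
    def __init__(self) -> None:
        self.running = False

    def start(self) -> Self:  # resolves to the caller's class, not Service
        self.running = True
        return self

    def stop(self) -> Self:
        self.running = False
        return self


class ProxiedService(Service):
    def instances(self) -> int:
        return 1


# With `-> Self` this chain type-checks: start() on a ProxiedService is a ProxiedService,
# so .instances() is visible to mypy. With `-> Service` it would not be.
assert ProxiedService().start().instances() == 1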
@@ -2497,7 +2505,7 @@ class NeonPageserver(PgProtocol, LogUtils): def restart( self, immediate: bool = False, - timeout_in_seconds: Optional[int] = None, + timeout_in_seconds: int | None = None, ): """ High level wrapper for restart: restarts the process, and waits for @@ -2532,14 +2540,14 @@ class NeonPageserver(PgProtocol, LogUtils): wait_until(20, 0.5, complete) - def __enter__(self) -> NeonPageserver: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop(immediate=True) @@ -2548,7 +2556,7 @@ class NeonPageserver(PgProtocol, LogUtils): pytest.skip("pageserver was built without 'testing' feature") def http_client( - self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + self, auth_token: str | None = None, retries: Retry | None = None ) -> PageserverHttpClient: return PageserverHttpClient( port=self.service_port.http, @@ -2585,7 +2593,7 @@ class NeonPageserver(PgProtocol, LogUtils): self, tenant_id: TenantId, config: None | dict[str, Any] = None, - generation: Optional[int] = None, + generation: int | None = None, override_storage_controller_generation: bool = False, ): """ @@ -2619,7 +2627,7 @@ class NeonPageserver(PgProtocol, LogUtils): return client.tenant_location_conf(tenant_id, config, **kwargs) def read_tenant_location_conf( - self, tenant_shard_id: Union[TenantId, TenantShardId] + self, tenant_shard_id: TenantId | TenantShardId ) -> dict[str, Any]: path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") @@ -2634,9 +2642,9 @@ class NeonPageserver(PgProtocol, LogUtils): def tenant_create( self, tenant_id: TenantId, - conf: Optional[dict[str, Any]] = None, - auth_token: Optional[str] = None, - generation: Optional[int] = None, + conf: dict[str, Any] | None = None, + auth_token: str | None = None, + generation: int | None = None, ) -> TenantId: if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) @@ -2656,7 +2664,7 @@ class NeonPageserver(PgProtocol, LogUtils): return tenant_id def list_layers( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ) -> list[Path]: """ Inspect local storage on a pageserver to discover which layer files are present. 
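# Several "class X(str, Enum)" enums in this patch (TimelineArchivalState,
# PageserverAvailability, PageserverSchedulingPolicy, StorageControllerLeadershipStatus,
# NodeKind) become enum.StrEnum, which the removed TODO comments were waiting on
# (Python 3.11+). Behavioural sketch, illustrative only:
from enum import Enum, StrEnum


class OldKind(str, Enum):
    PAGESERVER = "pageserver"


class NewKind(StrEnum):
    PAGESERVER = "pageserver"


# Both compare equal to the raw string...
assert OldKind.PAGESERVER == "pageserver" and NewKind.PAGESERVER == "pageserver"
# ...but only StrEnum stringifies to the value, which matters when members are
# interpolated into URLs, CLI arguments or JSON bodies.
assert str(NewKind.PAGESERVER) == "pageserver"
assert str(OldKind.PAGESERVER) == "OldKind.PAGESERVER"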
@@ -2749,7 +2757,7 @@ class PgBin: if "/" not in str(command[0]): command[0] = str(self.pg_bin_path / command[0]) - def _build_env(self, env_add: Optional[Env]) -> Env: + def _build_env(self, env_add: Env | None) -> Env: if env_add is None: return self.env env = self.env.copy() @@ -2766,8 +2774,8 @@ class PgBin: def run_nonblocking( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[Union[str, Path]] = None, + env: Env | None = None, + cwd: str | Path | None = None, ) -> subprocess.Popen[Any]: """ Run one of the postgres binaries, not waiting for it to finish @@ -2790,8 +2798,8 @@ class PgBin: def run( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[Union[str, Path]] = None, + env: Env | None = None, + cwd: str | Path | None = None, ) -> None: """ Run one of the postgres binaries, waiting for it to finish @@ -2813,8 +2821,8 @@ class PgBin: def run_capture( self, command: list[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, + env: Env | None = None, + cwd: str | None = None, with_command_header=True, **popen_kwargs: Any, ) -> str: @@ -2941,7 +2949,7 @@ class VanillaPostgres(PgProtocol): conf_file.write("\n".join(hba) + "\n") conf_file.write(data) - def start(self, log_path: Optional[str] = None): + def start(self, log_path: str | None = None): assert not self.running self.running = True @@ -2960,14 +2968,14 @@ class VanillaPostgres(PgProtocol): """Return size of pgdatadir subdirectory in bytes.""" return get_dir_size(self.pgdatadir / subdir) - def __enter__(self) -> VanillaPostgres: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self.running: self.stop() @@ -3009,14 +3017,14 @@ class RemotePostgres(PgProtocol): # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception("cannot get size of a Postgres instance") - def __enter__(self) -> RemotePostgres: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): # do nothing pass @@ -3092,7 +3100,7 @@ class PSQL: self.path = full_path self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" - async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: + async def run(self, query: str | None = None) -> asyncio.subprocess.Process: run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] if query is not None: run_args += ["--command", query] @@ -3138,7 +3146,7 @@ class NeonProxy(PgProtocol): """All auth backends must inherit from this class""" @property - def default_conn_url(self) -> Optional[str]: + def default_conn_url(self) -> str | None: return None @abc.abstractmethod @@ -3155,7 +3163,7 @@ class NeonProxy(PgProtocol): ] class Console(AuthBackend): - def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None): + def __init__(self, endpoint: str, fixed_rate_limit: int | None = None): self.endpoint = endpoint self.fixed_rate_limit = fixed_rate_limit @@ -3183,7 +3191,7 @@ class NeonProxy(PgProtocol): pg_conn_url: str @property - def default_conn_url(self) -> Optional[str]: + def default_conn_url(self) -> str | None: return 
self.pg_conn_url def extra_args(self) -> list[str]: @@ -3202,8 +3210,8 @@ class NeonProxy(PgProtocol): mgmt_port: int, external_http_port: int, auth_backend: NeonProxy.AuthBackend, - metric_collection_endpoint: Optional[str] = None, - metric_collection_interval: Optional[str] = None, + metric_collection_endpoint: str | None = None, + metric_collection_interval: str | None = None, ): host = "127.0.0.1" domain = "proxy.localtest.me" # resolves to 127.0.0.1 @@ -3221,9 +3229,9 @@ class NeonProxy(PgProtocol): self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval self.http_timeout_seconds = 15 - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None - def start(self) -> NeonProxy: + def start(self) -> Self: assert self._popen is None # generate key of it doesn't exist @@ -3351,14 +3359,14 @@ class NeonProxy(PgProtocol): log.info(f"SUCCESS, found auth url: {line}") return line - def __enter__(self) -> NeonProxy: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): if self._popen is not None: self._popen.terminate() @@ -3439,9 +3447,9 @@ class NeonAuthBroker: self.mgmt_port = mgmt_port self.auth_backend = auth_backend self.http_timeout_seconds = 15 - self._popen: Optional[subprocess.Popen[bytes]] = None + self._popen: subprocess.Popen[bytes] | None = None - def start(self) -> NeonAuthBroker: + def start(self) -> Self: assert self._popen is None # generate key of it doesn't exist @@ -3510,14 +3518,14 @@ class NeonAuthBroker: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") return request_result.text - def __enter__(self) -> NeonAuthBroker: + def __enter__(self) -> Self: return self def __exit__( self, - _exc_type: Optional[type[BaseException]], - _exc_value: Optional[BaseException], - _traceback: Optional[TracebackType], + _exc_type: type[BaseException] | None, + _exc_value: BaseException | None, + _traceback: TracebackType | None, ): if self._popen is not None: self._popen.terminate() @@ -3673,9 +3681,9 @@ class Endpoint(PgProtocol, LogUtils): ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.branch_name: Optional[str] = None # dubious - self.endpoint_id: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[Path] = None # Path to computenode PGDATA + self.branch_name: str | None = None # dubious + self.endpoint_id: str | None = None # dubious, see asserts below + self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port self.http_port = http_port @@ -3692,7 +3700,7 @@ class Endpoint(PgProtocol, LogUtils): self._running = threading.Semaphore(0) def http_client( - self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( port=self.http_port, @@ -3701,13 +3709,13 @@ class Endpoint(PgProtocol, LogUtils): def create( self, branch_name: str, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - config_lines: Optional[list[str]] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = 
None, + config_lines: list[str] | None = None, + pageserver_id: int | None = None, allow_multiple: bool = False, - ) -> Endpoint: + ) -> Self: """ Create a new Postgres endpoint. Returns self. @@ -3736,24 +3744,57 @@ class Endpoint(PgProtocol, LogUtils): self.pgdata_dir = self.env.repo_dir / path self.logfile = self.endpoint_path() / "compute.log" - config_lines = config_lines or [] - # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. config_lines = ["max_replication_write_lag=15MB"] + config_lines + # Delete file cache if it exists (and we're recreating the endpoint) + if USE_LFC: + if (lfc_path := Path(self.lfc_path())).exists(): + lfc_path.unlink() + else: + lfc_path.parent.mkdir(parents=True, exist_ok=True) + for line in config_lines: + if ( + line.find("neon.max_file_cache_size") > -1 + or line.find("neon.file_cache_size_limit") > -1 + ): + m = re.search(r"=\s*(\S+)", line) + assert m is not None, f"malformed config line {line}" + size = m.group(1) + assert size_to_bytes(size) >= size_to_bytes( + "1MB" + ), "LFC size cannot be set less than 1MB" + # shared_buffers = 512kB to make postgres use LFC intensively + # neon.max_file_cache_size and neon.file_cache size limit are + # set to 1MB because small LFC is better for testing (helps to find more problems) + config_lines = [ + "shared_buffers = 512kB", + f"neon.file_cache_path = '{self.lfc_path()}'", + "neon.max_file_cache_size = 1MB", + "neon.file_cache_size_limit = 1MB", + ] + config_lines + else: + for line in config_lines: + assert ( + line.find("neon.max_file_cache_size") == -1 + ), "Setting LFC parameters is not allowed when LFC is disabled" + assert ( + line.find("neon.file_cache_size_limit") == -1 + ), "Setting LFC parameters is not allowed when LFC is disabled" + self.config(config_lines) return self def start( self, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[list[int]] = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, + safekeepers: list[int] | None = None, allow_multiple: bool = False, - basebackup_request_tries: Optional[int] = None, - ) -> Endpoint: + basebackup_request_tries: int | None = None, + ) -> Self: """ Start the Postgres instance. Returns self. @@ -3775,6 +3816,9 @@ class Endpoint(PgProtocol, LogUtils): basebackup_request_tries=basebackup_request_tries, ) self._running.release(1) + self.log_config_value("shared_buffers") + self.log_config_value("neon.max_file_cache_size") + self.log_config_value("neon.file_cache_size_limit") return self @@ -3800,7 +3844,11 @@ class Endpoint(PgProtocol, LogUtils): """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" return self.endpoint_path() / "postgresql.conf" - def config(self, lines: list[str]) -> Endpoint: + def lfc_path(self) -> Path: + """Path to the lfc file""" + return self.endpoint_path() / "file_cache" / "file.cache" + + def config(self, lines: list[str]) -> Self: """ Add lines to postgresql.conf. Lines should be an array of valid postgresql.conf rows. @@ -3828,9 +3876,7 @@ class Endpoint(PgProtocol, LogUtils): def is_running(self): return self._running._value > 0 - def reconfigure( - self, pageserver_id: Optional[int] = None, safekeepers: Optional[list[int]] = None - ): + def reconfigure(self, pageserver_id: int | None = None, safekeepers: list[int] | None = None): assert self.endpoint_id is not None # If `safekeepers` is not None, they are remember them as active and use # in the following commands. 
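# The Endpoint.create() hunk above validates user-supplied LFC sizes by pulling the value
# out of lines such as "neon.file_cache_size_limit = 1MB" with re.search(r"=\s*(\S+)", line)
# and comparing it via size_to_bytes() from fixtures.utils. That helper's body is not part
# of this diff; `_size_to_bytes` below is a hypothetical stand-in (its unit table is an
# assumption) used only to illustrate the check:
import re

_UNITS = {"B": 1, "kB": 1024, "MB": 1024**2, "GB": 1024**3}


def _size_to_bytes(size: str) -> int:
    m = re.fullmatch(r"(\d+)\s*([kMG]?B)?", size)
    assert m is not None, f"malformed size {size!r}"
    return int(m.group(1)) * _UNITS[m.group(2) or "B"]


def _validate_lfc_line(line: str) -> None:
    if "neon.max_file_cache_size" in line or "neon.file_cache_size_limit" in line:
        m = re.search(r"=\s*(\S+)", line)
        assert m is not None, f"malformed config line {line}"
        size_ok = _size_to_bytes(m.group(1)) >= _size_to_bytes("1MB")
        assert size_ok, "LFC size cannot be set below 1MB"


_validate_lfc_line("neon.file_cache_size_limit = 2MB")  # passes
# _validate_lfc_line("neon.file_cache_size_limit = 512kB")  # would raise AssertionError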
@@ -3877,8 +3923,8 @@ class Endpoint(PgProtocol, LogUtils): def stop( self, mode: str = "fast", - sks_wait_walreceiver_gone: Optional[tuple[list[Safekeeper], TimelineId]] = None, - ) -> Endpoint: + sks_wait_walreceiver_gone: tuple[list[Safekeeper], TimelineId] | None = None, + ) -> Self: """ Stop the Postgres instance if it's running. @@ -3912,7 +3958,7 @@ class Endpoint(PgProtocol, LogUtils): return self - def stop_and_destroy(self, mode: str = "immediate") -> Endpoint: + def stop_and_destroy(self, mode: str = "immediate") -> Self: """ Stop the Postgres instance, then destroy the endpoint. Returns self. @@ -3931,15 +3977,15 @@ class Endpoint(PgProtocol, LogUtils): def create_start( self, branch_name: str, - endpoint_id: Optional[str] = None, + endpoint_id: str | None = None, hot_standby: bool = False, - lsn: Optional[Lsn] = None, - config_lines: Optional[list[str]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, + lsn: Lsn | None = None, + config_lines: list[str] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, allow_multiple: bool = False, - basebackup_request_tries: Optional[int] = None, - ) -> Endpoint: + basebackup_request_tries: int | None = None, + ) -> Self: """ Create an endpoint, apply config, and start Postgres. Returns self. @@ -3962,14 +4008,14 @@ class Endpoint(PgProtocol, LogUtils): return self - def __enter__(self) -> Endpoint: + def __enter__(self) -> Self: return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop() @@ -3980,16 +4026,46 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 - def clear_shared_buffers(self, cursor: Optional[Any] = None): + def clear_buffers(self, cursor: Any | None = None): """ Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' - - Might also clear LFC. 
+ It clears LFC as well by setting neon.file_cache_size_limit to 0 and then returning it to the previous value, + if LFC is enabled """ if cursor is not None: cursor.execute("select clear_buffer_cache()") + if not USE_LFC: + return + cursor.execute("SHOW neon.file_cache_size_limit") + res = cursor.fetchone() + assert res, "Cannot get neon.file_cache_size_limit" + file_cache_size_limit = res[0] + if file_cache_size_limit == 0: + return + cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit=0") + cursor.execute("SELECT pg_reload_conf()") + cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'") + cursor.execute("SELECT pg_reload_conf()") else: self.safe_psql("select clear_buffer_cache()") + if not USE_LFC: + return + file_cache_size_limit = self.safe_psql_scalar( + "SHOW neon.file_cache_size_limit", log_query=False + ) + if file_cache_size_limit == 0: + return + self.safe_psql("ALTER SYSTEM SET neon.file_cache_size_limit=0") + self.safe_psql("SELECT pg_reload_conf()") + self.safe_psql(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'") + self.safe_psql("SELECT pg_reload_conf()") + + def log_config_value(self, param): + """ + Writes the config value param to log + """ + res = self.safe_psql_scalar(f"SHOW {param}", log_query=False) + log.info("%s = %s", param, res) class EndpointFactory: @@ -4003,14 +4079,14 @@ class EndpointFactory: def create_start( self, branch_name: str, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, + endpoint_id: str | None = None, + tenant_id: TenantId | None = None, + lsn: Lsn | None = None, hot_standby: bool = False, - config_lines: Optional[list[str]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - basebackup_request_tries: Optional[int] = None, + config_lines: list[str] | None = None, + remote_ext_config: str | None = None, + pageserver_id: int | None = None, + basebackup_request_tries: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4035,12 +4111,12 @@ class EndpointFactory: def create( self, branch_name: str, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, + endpoint_id: str | None = None, + tenant_id: TenantId | None = None, + lsn: Lsn | None = None, hot_standby: bool = False, - config_lines: Optional[list[str]] = None, - pageserver_id: Optional[int] = None, + config_lines: list[str] | None = None, + pageserver_id: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4063,7 +4139,7 @@ class EndpointFactory: pageserver_id=pageserver_id, ) - def stop_all(self, fail_on_error=True) -> EndpointFactory: + def stop_all(self, fail_on_error=True) -> Self: exception = None for ep in self.endpoints: try: @@ -4078,7 +4154,7 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None + self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4094,7 +4170,7 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None + self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4132,7 +4208,7 @@ class Safekeeper(LogUtils): port: SafekeeperPort, id: int, running: bool = False, - extra_opts: 
Optional[list[str]] = None, + extra_opts: list[str] | None = None, ): self.env = env self.port = port @@ -4158,8 +4234,8 @@ class Safekeeper(LogUtils): self.extra_opts = extra_opts def start( - self, extra_opts: Optional[list[str]] = None, timeout_in_seconds: Optional[int] = None - ) -> Safekeeper: + self, extra_opts: list[str] | None = None, timeout_in_seconds: int | None = None + ) -> Self: if extra_opts is None: # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. extra_opts = self.extra_opts @@ -4194,7 +4270,7 @@ class Safekeeper(LogUtils): break # success return self - def stop(self, immediate: bool = False) -> Safekeeper: + def stop(self, immediate: bool = False) -> Self: self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -4238,7 +4314,7 @@ class Safekeeper(LogUtils): return res def http_client( - self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True + self, auth_token: str | None = None, gen_sk_wide_token: bool = True ) -> SafekeeperHttpClient: """ When auth_token is None but gen_sk_wide is True creates safekeeper wide @@ -4371,14 +4447,14 @@ class NeonBroker(LogUtils): def start( self, - timeout_in_seconds: Optional[int] = None, - ): + timeout_in_seconds: int | None = None, + ) -> Self: assert not self.running self.env.neon_cli.storage_broker_start(timeout_in_seconds) self.running = True return self - def stop(self): + def stop(self) -> Self: if self.running: self.env.neon_cli.storage_broker_stop() self.running = False @@ -4394,8 +4470,7 @@ class NeonBroker(LogUtils): assert_no_errors(self.logfile, "storage_controller", []) -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class NodeKind(str, Enum): +class NodeKind(StrEnum): PAGESERVER = "pageserver" SAFEKEEPER = "safekeeper" @@ -4406,7 +4481,7 @@ class StorageScrubber: self.log_dir = log_dir def scrubber_cli( - self, args: list[str], timeout, extra_env: Optional[dict[str, str]] = None + self, args: list[str], timeout, extra_env: dict[str, str] | None = None ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -4469,8 +4544,8 @@ class StorageScrubber: self, post_to_storage_controller: bool = False, node_kind: NodeKind = NodeKind.PAGESERVER, - timeline_lsns: Optional[list[dict[str, Any]]] = None, - extra_env: Optional[dict[str, str]] = None, + timeline_lsns: list[dict[str, Any]] | None = None, + extra_env: dict[str, str] | None = None, ) -> tuple[bool, Any]: """ Returns the health status and the metadata summary. 
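# The clear_buffers() change earlier in this patch also empties the LFC (when USE_LFC is
# set) by dropping neon.file_cache_size_limit to 0 and restoring it, reloading the config
# around each change. A condensed restatement of the cursor-based path; it assumes a
# DB-API cursor on a compute where the neon extension is active:
from typing import Any


def drop_and_restore_lfc(cursor: Any) -> None:
    cursor.execute("SHOW neon.file_cache_size_limit")
    row = cursor.fetchone()
    assert row, "cannot read neon.file_cache_size_limit"
    previous = row[0]
    if previous == 0:
        return  # LFC already disabled, nothing to flush
    # Shrinking the limit to 0 discards the cached pages (which is what the fixture
    # relies on to clear the cache) ...
    cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit = 0")
    cursor.execute("SELECT pg_reload_conf()")
    # ... and restoring the old limit re-enables the LFC, now empty.
    cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit = '{previous}'")
    cursor.execute("SELECT pg_reload_conf()")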
@@ -4504,8 +4579,8 @@ class StorageScrubber: def pageserver_physical_gc( self, min_age_secs: int, - tenant_ids: Optional[list[TenantId]] = None, - mode: Optional[str] = None, + tenant_ids: list[TenantId] | None = None, + mode: str | None = None, ): args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] @@ -4619,7 +4694,7 @@ def check_restored_datadir_content( test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, - ignored_files: Optional[list[str]] = None, + ignored_files: list[str] | None = None, ): pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -4721,7 +4796,7 @@ def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> L def tenant_get_shards( - env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None + env: NeonEnv, tenant_id: TenantId, pageserver_id: int | None = None ) -> list[tuple[TenantShardId, NeonPageserver]]: """ Helper for when you want to talk to one or more pageservers, and the @@ -4784,8 +4859,8 @@ def wait_for_last_flush_lsn( endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, - auth_token: Optional[str] = None, + pageserver_id: int | None = None, + auth_token: str | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" @@ -4814,7 +4889,7 @@ def flush_ep_to_pageserver( ep: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, + pageserver_id: int | None = None, ) -> Lsn: """ Stop endpoint and wait until all committed WAL reaches the pageserver @@ -4857,7 +4932,7 @@ def wait_for_wal_insert_lsn( endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, - pageserver_id: Optional[int] = None, + pageserver_id: int | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) @@ -4878,7 +4953,7 @@ def fork_at_current_lsn( endpoint: Endpoint, new_branch_name: str, ancestor_branch_name: str, - tenant_id: Optional[TenantId] = None, + tenant_id: TenantId | None = None, ) -> TimelineId: """ Create new branch at the last LSN of an existing branch. 
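# tenant_get_shards() above resolves a tenant into (TenantShardId, NeonPageserver) pairs
# via the storage controller, so helpers can fan per-shard HTTP calls out to the right
# pageservers — which is exactly how last_flush_lsn_upload() in the next hunk drives
# per-shard checkpoints with the new wait_until_uploaded keyword. Usage sketch only;
# env, tenant_id and timeline_id are assumed to come from the usual fixtures:
from fixtures.common_types import TenantId, TimelineId
from fixtures.neon_fixtures import NeonEnv, tenant_get_shards


def checkpoint_all_shards(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId) -> None:
    for tenant_shard_id, pageserver in tenant_get_shards(env, tenant_id):
        ps_http = pageserver.http_client()
        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True)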
@@ -4951,8 +5026,9 @@ def last_flush_lsn_upload( endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId, - pageserver_id: Optional[int] = None, - auth_token: Optional[str] = None, + pageserver_id: int | None = None, + auth_token: str | None = None, + wait_until_uploaded: bool = True, ) -> Lsn: """ Wait for pageserver to catch to the latest flush LSN of given endpoint, @@ -4966,7 +5042,9 @@ def last_flush_lsn_upload( for tenant_shard_id, pageserver in shards: ps_http = pageserver.http_client(auth_token=auth_token) wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) - ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_checkpoint( + tenant_shard_id, timeline_id, wait_until_uploaded=wait_until_uploaded + ) return last_flush_lsn @@ -4987,10 +5065,11 @@ def generate_uploads_and_deletions( env: NeonEnv, *, init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, + tenant_id: TenantId | None = None, + timeline_id: TimelineId | None = None, + data: str | None = None, pageserver: NeonPageserver, + wait_until_uploaded: bool = True, ): """ Using the environment's default tenant + timeline, generate a load pattern @@ -5013,7 +5092,12 @@ def generate_uploads_and_deletions( if init: endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + env, + endpoint, + tenant_id, + timeline_id, + pageserver_id=pageserver.id, + wait_until_uploaded=wait_until_uploaded, ) def churn(data): @@ -5036,7 +5120,12 @@ def generate_uploads_and_deletions( # in a state where there are "future layers" in remote storage that will generate deletions # after a restart. last_flush_lsn_upload( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + env, + endpoint, + tenant_id, + timeline_id, + pageserver_id=pageserver.id, + wait_until_uploaded=wait_until_uploaded, ) # Compaction should generate some GC-elegible layers @@ -5052,4 +5141,4 @@ def generate_uploads_and_deletions( # background ingest, no more uploads pending, and therefore no non-determinism # in subsequent actions like pageserver restarts. 
flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) - ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=wait_until_uploaded) diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py index 2319701e0b..0e068db593 100644 --- a/test_runner/fixtures/pageserver/common_types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -2,7 +2,7 @@ from __future__ import annotations import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @@ -46,7 +46,7 @@ class DeltaLayerName: return ret -LayerName = Union[ImageLayerName, DeltaLayerName] +LayerName = ImageLayerName | DeltaLayerName class InvalidFileName(Exception): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 01583757fa..4cf3ece396 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,24 +1,32 @@ from __future__ import annotations +import dataclasses +import json +import random +import string import time from collections import defaultdict from dataclasses import dataclass from datetime import datetime -from typing import TYPE_CHECKING, Any +from typing import Any import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId +from fixtures.common_types import ( + Id, + Lsn, + TenantId, + TenantShardId, + TimelineArchivalState, + TimelineId, +) from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion from fixtures.utils import Fn -if TYPE_CHECKING: - from typing import Optional, Union - class PageserverApiException(Exception): def __init__(self, message, status_code: int): @@ -27,6 +35,69 @@ class PageserverApiException(Exception): self.status_code = status_code +@dataclass +class ImportPgdataIdemptencyKey: + key: str + + @staticmethod + def random() -> ImportPgdataIdemptencyKey: + return ImportPgdataIdemptencyKey( + "".join(random.choices(string.ascii_letters + string.digits, k=20)) + ) + + +@dataclass +class LocalFs: + path: str + + +@dataclass +class AwsS3: + region: str + bucket: str + key: str + + +@dataclass +class ImportPgdataLocation: + LocalFs: None | LocalFs = None + AwsS3: None | AwsS3 = None + + +@dataclass +class TimelineCreateRequestModeImportPgdata: + location: ImportPgdataLocation + idempotency_key: ImportPgdataIdemptencyKey + + +@dataclass +class TimelineCreateRequestMode: + Branch: None | dict[str, Any] = None + Bootstrap: None | dict[str, Any] = None + ImportPgdata: None | TimelineCreateRequestModeImportPgdata = None + + +@dataclass +class TimelineCreateRequest: + new_timeline_id: TimelineId + mode: TimelineCreateRequestMode + + def to_json(self) -> str: + class EnhancedJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o) and not isinstance(o, type): + return dataclasses.asdict(o) + elif isinstance(o, Id): + return o.id.hex() + return super().default(o) + + # mode is flattened + this = dataclasses.asdict(self) + mode = this.pop("mode") + this.update(mode) + return json.dumps(self, cls=EnhancedJSONEncoder) + + class TimelineCreate406(PageserverApiException): def __init__(self, 
res: requests.Response): assert res.status_code == 406 @@ -43,7 +114,7 @@ class TimelineCreate409(PageserverApiException): class InMemoryLayerInfo: kind: str lsn_start: str - lsn_end: Optional[str] + lsn_end: str | None @classmethod def from_json(cls, d: dict[str, Any]) -> InMemoryLayerInfo: @@ -60,10 +131,10 @@ class HistoricLayerInfo: layer_file_name: str layer_file_size: int lsn_start: str - lsn_end: Optional[str] + lsn_end: str | None remote: bool # None for image layers, true if pageserver thinks this is an L0 delta layer - l0: Optional[bool] + l0: bool | None visible: bool @classmethod @@ -180,8 +251,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self, port: int, is_testing_enabled_or_skip: Fn, - auth_token: Optional[str] = None, - retries: Optional[Retry] = None, + auth_token: str | None = None, + retries: Retry | None = None, ): super().__init__() self.port = port @@ -278,7 +349,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_attach( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, generation: int, config: None | dict[str, Any] = None, ): @@ -305,7 +376,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): }, ) - def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): + def tenant_reset(self, tenant_id: TenantId | TenantShardId, drop_cache: bool): params = {} if drop_cache: params["drop_cache"] = "true" @@ -315,10 +386,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_location_conf( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, location_conf: dict[str, Any], flush_ms=None, - lazy: Optional[bool] = None, + lazy: bool | None = None, ): body = location_conf.copy() @@ -346,20 +417,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json["tenant_shards"], list) return res_json - def tenant_get_location(self, tenant_id: TenantShardId): + def tenant_get_location(self, tenant_id: TenantId | TenantShardId): res = self.get( f"http://localhost:{self.port}/v1/location_config/{tenant_id}", ) self.verbose_error(res) return res.json() - def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_delete(self, tenant_id: TenantId | TenantShardId): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) return res def tenant_status( - self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + self, tenant_id: TenantId | TenantShardId, activate: bool = False ) -> dict[Any, Any]: """ :activate: hint the server not to accelerate activation of this tenant in response @@ -378,17 +449,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig: + def tenant_config(self, tenant_id: TenantId | TenantShardId) -> TenantConfig: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") self.verbose_error(res) return TenantConfig.from_json(res.json()) - def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_heatmap_upload(self, tenant_id: TenantId | TenantShardId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) def tenant_secondary_download( - self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + self, tenant_id: TenantId 
| TenantShardId, wait_ms: int | None = None ) -> tuple[int, dict[Any, Any]]: url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" if wait_ms is not None: @@ -397,13 +468,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) - def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_secondary_status(self, tenant_id: TenantId | TenantShardId): url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" res = self.get(url) self.verbose_error(res) return res.json() - def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): + def set_tenant_config(self, tenant_id: TenantId | TenantShardId, config: dict[str, Any]): """ Only use this via storage_controller.pageserver_api(). @@ -420,8 +491,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def patch_tenant_config_client_side( self, tenant_id: TenantId, - inserts: Optional[dict[str, Any]] = None, - removes: Optional[list[str]] = None, + inserts: dict[str, Any] | None = None, + removes: list[str] | None = None, ): """ Only use this via storage_controller.pageserver_api(). @@ -436,11 +507,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): del current[key] self.set_tenant_config(tenant_id, current) - def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int: + def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] def tenant_size_and_modelinputs( - self, tenant_id: Union[TenantId, TenantShardId] + self, tenant_id: TenantId | TenantShardId ) -> tuple[int, dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. 
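Review note: the hunk above introduces a typed request model (TimelineCreateRequest and its mode variants) for the new pgdata-import path, together with a custom JSON encoder and a "# mode is flattened" comment describing the wire format. A minimal construction sketch, using only the classes added in this hunk; the local path is a placeholder and TimelineId.generate() is assumed to behave as in the other fixtures:

    # Sketch: build an ImportPgdata timeline-create request and serialize it.
    # Per the "# mode is flattened" comment, the mode variant keys (Branch /
    # Bootstrap / ImportPgdata) are meant to sit next to new_timeline_id in the
    # resulting JSON rather than under a nested "mode" object.
    req = TimelineCreateRequest(
        new_timeline_id=TimelineId.generate(),
        mode=TimelineCreateRequestMode(
            ImportPgdata=TimelineCreateRequestModeImportPgdata(
                location=ImportPgdataLocation(LocalFs=LocalFs(path="/some/pgdata/dir")),
                idempotency_key=ImportPgdataIdemptencyKey.random(),
            )
        ),
    )
    body = req.to_json()

test_import_pgdata.py later in this diff imports ImportPgdataIdemptencyKey for the same flow.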
@@ -456,7 +527,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(inputs, dict) return (size, inputs) - def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str: + def tenant_size_debug(self, tenant_id: TenantId | TenantShardId) -> str: """ Returns the tenant size debug info, as an HTML string """ @@ -468,10 +539,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_time_travel_remote_storage( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timestamp: datetime, done_if_after: datetime, - shard_counts: Optional[list[int]] = None, + shard_counts: list[int] | None = None, ): """ Issues a request to perform time travel operations on the remote storage @@ -490,7 +561,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_list( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, ) -> list[dict[str, Any]]: @@ -510,7 +581,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_and_offloaded_list( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, ) -> TimelinesInfoAndOffloaded: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded", @@ -523,11 +594,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_create( self, pg_version: PgVersion, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, new_timeline_id: TimelineId, - ancestor_timeline_id: Optional[TimelineId] = None, - ancestor_start_lsn: Optional[Lsn] = None, - existing_initdb_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: TimelineId | None = None, + ancestor_start_lsn: Lsn | None = None, + existing_initdb_timeline_id: TimelineId | None = None, **kwargs, ) -> dict[Any, Any]: body: dict[str, Any] = { @@ -558,7 +629,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_detail( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, @@ -584,7 +655,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_delete( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, **kwargs ): """ Note that deletion is not instant, it is scheduled and performed mostly in the background. 
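Review note: together with the first hunk of this section, wait_until_uploaded is now threaded through from the caller instead of being hard-coded to True when checkpointing. Illustrative usage only (env, tenant_id and timeline_id stand for the usual fixtures):

    ps_http = env.pageserver.http_client()
    # Wait until the checkpointed data has also been uploaded to remote storage:
    ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
    # Or return as soon as the local checkpoint is done (the default):
    ps_http.timeline_checkpoint(tenant_id, timeline_id)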
@@ -600,9 +671,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_gc( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - gc_horizon: Optional[int], + gc_horizon: int | None, ) -> dict[str, Any]: """ Unlike most handlers, this will wait for the layers to be actually @@ -624,16 +695,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + def timeline_block_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc", ) log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) - def timeline_unblock_gc( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def timeline_unblock_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc", ) @@ -642,7 +711,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_offload( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ): self.is_testing_enabled_or_skip() @@ -658,14 +727,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_compact( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, wait_until_uploaded=False, enhanced_gc_bottom_most_compaction=False, - body: Optional[dict[str, Any]] = None, + body: dict[str, Any] | None = None, ): self.is_testing_enabled_or_skip() query = {} @@ -692,7 +761,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res_json is None def timeline_preserve_initdb_archive( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ): log.info( f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}" @@ -704,7 +773,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_archival_config( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, state: TimelineArchivalState, ): @@ -720,7 +789,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_get_lsn_by_timestamp( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, timestamp: datetime, with_lease: bool = False, @@ -739,7 +808,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_lsn_lease( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): data = { "lsn": str(lsn), @@ -755,7 +824,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res_json def timeline_get_timestamp_of_lsn( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn ): log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, 
timeline {timeline_id}") res = self.get( @@ -765,9 +834,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json - def timeline_layer_map_info( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def timeline_layer_map_info(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", @@ -778,13 +845,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_checkpoint( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, wait_until_uploaded=False, - compact: Optional[bool] = None, + compact: bool | None = None, **kwargs, ): self.is_testing_enabled_or_skip() @@ -801,7 +868,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): if compact is not None: query["compact"] = "true" if compact else "false" - log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + log.info( + f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}" + ) res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", params=query, @@ -814,7 +883,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_spawn_download_remote_layers( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, max_concurrent_downloads: int, ) -> dict[str, Any]: @@ -833,7 +902,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_poll_download_remote_layers_status( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, spawn_response: dict[str, Any], poll_state=None, @@ -855,7 +924,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def timeline_download_remote_layers( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, max_concurrent_downloads: int, errors_ok=False, @@ -905,7 +974,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[int]: + ) -> int | None: metrics = [ "pageserver_remote_timeline_client_calls_started_total", "pageserver_remote_timeline_client_calls_finished_total", @@ -929,7 +998,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def layer_map_info( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ) -> LayerMapInfo: res = self.get( @@ -939,7 +1008,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return LayerMapInfo.from_json(res.json()) def timeline_layer_scan_disposable_keys( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ) -> ScanDisposableKeysResponse: res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys", @@ -949,7 +1018,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return 
ScanDisposableKeysResponse.from_json(res.json()) def download_layer( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ): res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", @@ -958,9 +1027,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res.status_code == 200 - def download_all_layers( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ): + def download_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: if not layer.remote: @@ -969,9 +1036,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def detach_ancestor( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - batch_size: Optional[int] = None, + batch_size: int | None = None, **kwargs, ) -> set[TimelineId]: params = {} @@ -987,7 +1054,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return set(map(TimelineId, json["reparented_timelines"])) def evict_layer( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str ): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", @@ -996,7 +1063,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert res.status_code in (200, 304) - def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): + def evict_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) @@ -1009,7 +1076,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return res.json() - def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]): + def tenant_break(self, tenant_id: TenantId | TenantShardId): res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") self.verbose_error(res) @@ -1058,7 +1125,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def perf_info( self, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, ): self.is_testing_enabled_or_skip() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 37b4246d40..b6d19af84c 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -13,7 +13,8 @@ from fixtures.neon_fixtures import ( from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind if TYPE_CHECKING: - from typing import Any, Callable + from collections.abc import Callable + from typing import Any def single_timeline( diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index ac7497ee6c..46700e3fe3 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -17,14 +17,14 @@ from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: - from typing 
import Any, Optional, Union + from typing import Any def assert_tenant_state( pageserver_http: PageserverHttpClient, tenant: TenantId, expected_state: str, - message: Optional[str] = None, + message: str | None = None, ) -> None: tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") @@ -33,7 +33,7 @@ def assert_tenant_state( def remote_consistent_lsn( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, ) -> Lsn: detail = pageserver_http.timeline_detail(tenant, timeline) @@ -51,7 +51,7 @@ def remote_consistent_lsn( def wait_for_upload( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, ): @@ -138,7 +138,7 @@ def wait_until_all_tenants_state( def wait_until_timeline_state( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, expected_state: str, iterations: int, @@ -188,7 +188,7 @@ def wait_until_tenant_active( def last_record_lsn( pageserver_http_client: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -200,7 +200,7 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http: PageserverHttpClient, - tenant: Union[TenantId, TenantShardId], + tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, ) -> Lsn: @@ -267,10 +267,10 @@ def wait_for_upload_queue_empty( def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, iterations: int, - interval: Optional[float] = None, + interval: float | None = None, ): if interval is None: interval = 0.25 @@ -292,10 +292,10 @@ def wait_timeline_detail_404( def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, iterations: int = 20, - interval: Optional[float] = None, + interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) @@ -304,9 +304,9 @@ def timeline_delete_wait_completed( # remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_empty( - remote_storage: Optional[RemoteStorage], - prefix: Optional[str] = None, - allowed_postfix: Optional[str] = None, + remote_storage: RemoteStorage | None, + prefix: str | None = None, + allowed_postfix: str | None = None, delimiter: str = "/", ) -> None: assert remote_storage is not None @@ -348,8 +348,8 @@ def assert_prefix_empty( # remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_not_empty( - remote_storage: Optional[RemoteStorage], - prefix: Optional[str] = None, + remote_storage: RemoteStorage | None, + prefix: str | None = None, delimiter: str = "/", ): assert remote_storage is not None @@ -358,7 +358,7 @@ def assert_prefix_not_empty( def list_prefix( - remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/" + remote: RemoteStorage, prefix: str | None = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. 
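Review note: most hunks in these fixture modules are mechanical typing modernizations: Optional[X] becomes X | None, Union[A, B] becomes A | B, and Callable now comes from collections.abc. Elsewhere in the diff the same move brings in enum.StrEnum and typing.Self, both of which require Python 3.11 or newer. A small sketch of the target style (the names below are illustrative, not taken from the fixtures):

    from __future__ import annotations

    import time
    from collections.abc import Callable


    def wait_for(check: Callable[[], bool], timeout: float | None = None) -> bool:
        # Poll check() until it succeeds or the optional timeout expires.
        deadline = None if timeout is None else time.monotonic() + timeout
        while not check():
            if deadline is not None and time.monotonic() > deadline:
                return False
            time.sleep(0.1)
        return True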
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 1131bf090f..f57c0f801f 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -11,7 +11,7 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any """ @@ -20,31 +20,31 @@ Dynamically parametrize tests by different parameters @pytest.fixture(scope="function", autouse=True) -def pg_version() -> Optional[PgVersion]: +def pg_version() -> PgVersion | None: return None @pytest.fixture(scope="function", autouse=True) -def build_type() -> Optional[str]: +def build_type() -> str | None: return None @pytest.fixture(scope="session", autouse=True) -def platform() -> Optional[str]: +def platform() -> str | None: return None @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_engine() -> Optional[str]: +def pageserver_virtual_file_io_engine() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") @pytest.fixture(scope="function", autouse=True) -def pageserver_virtual_file_io_mode() -> Optional[str]: +def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") -def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: +def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: return None @@ -54,7 +54,7 @@ def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict @pytest.fixture(scope="function", autouse=True) -def pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: +def pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: return get_pageserver_default_tenant_config_compaction_algorithm() @@ -66,6 +66,7 @@ def pytest_generate_tests(metafunc: Metafunc): metafunc.parametrize("build_type", build_types) + pg_versions: list[PgVersion] if (v := os.getenv("DEFAULT_PG_VERSION")) is None: pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] else: @@ -115,5 +116,6 @@ def pytest_runtest_makereport(*args, **kwargs): }.get(os.uname().machine, "UNKNOWN") arch = os.getenv("RUNNER_ARCH", uname_m) allure.dynamic.parameter("__arch", arch) + allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false") yield diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 60221573eb..1c71abea19 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -18,7 +18,6 @@ from fixtures.utils import allure_attach_from_dir if TYPE_CHECKING: from collections.abc import Iterator - from typing import Optional BASE_DIR = Path(__file__).parents[2] @@ -26,9 +25,7 @@ COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" -def get_test_dir( - request: FixtureRequest, top_output_dir: Path, prefix: Optional[str] = None -) -> Path: +def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path: """Compute the path to a working directory for an individual test.""" test_name = request.node.name test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" @@ -112,7 +109,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_neon_binpath() -> Iterator[Optional[Path]]: +def 
compatibility_neon_binpath() -> Iterator[Path | None]: if os.getenv("REMOTE_ENV"): return comp_binpath = None @@ -133,7 +130,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]: +def compatibility_pg_distrib_dir() -> Iterator[Path | None]: compat_distrib_dir = None if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): compat_distrib_dir = Path(env_compat_postgres_bin).resolve() @@ -197,7 +194,7 @@ class FileAndThreadLock: def __init__(self, path: Path): self.path = path self.thread_lock = threading.Lock() - self.fd: Optional[int] = None + self.fd: int | None = None def __enter__(self): self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) @@ -208,9 +205,9 @@ class FileAndThreadLock: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ): assert self.fd is not None assert self.thread_lock.locked() # ... by us @@ -263,9 +260,9 @@ class SnapshotDir: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ): self._lock.__exit__(exc_type, exc_value, exc_traceback) @@ -277,7 +274,7 @@ def shared_snapshot_dir(top_output_dir: Path, ident: str) -> SnapshotDir: @pytest.fixture(scope="function") -def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path | None: """ Idempotently create a test's overlayfs mount state directory. If the functionality isn't enabled via env var, returns None. diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 798db1e8d9..46423e8c76 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,22 +1,16 @@ from __future__ import annotations -import enum -from typing import TYPE_CHECKING +from enum import StrEnum from typing_extensions import override -if TYPE_CHECKING: - from typing import Optional - - """ This fixture is used to determine which version of Postgres to use for tests. """ # Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument -# TODO: use enum.StrEnum for Python >= 3.11 -class PgVersion(str, enum.Enum): +class PgVersion(StrEnum): V14 = "14" V15 = "15" V16 = "16" @@ -34,7 +28,6 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: return f"'{self.value}'" - # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums @override def __str__(self) -> str: return self.value @@ -47,16 +40,18 @@ class PgVersion(str, enum.Enum): @classmethod @override - def _missing_(cls, value: object) -> Optional[PgVersion]: - known_values = {v.value for _, v in cls.__members__.items()} + def _missing_(cls, value: object) -> PgVersion | None: + if not isinstance(value, str): + return None - # Allow passing version as a string with "v" prefix (e.g. "v14") - if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values: - return cls(value[1:]) - # Allow passing version as an int (e.g. 
15 or 150002, both will be converted to PgVersion.V15) - elif isinstance(value, int) and str(value)[:2] in known_values: - return cls(str(value)[:2]) + known_values = set(cls.__members__.values()) + + # Allow passing version as v-prefixed string (e.g. "v14") + if value.lower().startswith("v") and (v := value[1:]) in known_values: + return cls(v) + + # Allow passing version as an int (i.e. both "15" and "150002" matches PgVersion.V15) + if value.isdigit() and (v := value[:2]) in known_values: + return cls(v) - # Make mypy happy - # See https://github.com/python/mypy/issues/3974 return None diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py index df0eb2a809..6a829a9399 100644 --- a/test_runner/fixtures/port_distributor.py +++ b/test_runner/fixtures/port_distributor.py @@ -3,13 +3,9 @@ from __future__ import annotations import re import socket from contextlib import closing -from typing import TYPE_CHECKING from fixtures.log_helper import log -if TYPE_CHECKING: - from typing import Union - def can_bind(host: str, port: int) -> bool: """ @@ -49,17 +45,19 @@ class PortDistributor: "port range configured for test is exhausted, consider enlarging the range" ) - def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: + def replace_with_new_port(self, value: int | str) -> int | str: """ Returns a new port for a port number in a string (like "localhost:1234") or int. Replacements are memorised, so a substitution for the same port is always the same. """ - # TODO: replace with structural pattern matching for Python >= 3.10 - if isinstance(value, int): - return self._replace_port_int(value) - - return self._replace_port_str(value) + match value: + case int(): + return self._replace_port_int(value) + case str(): + return self._replace_port_str(value) + case _: + raise TypeError(f"Unsupported type {type(value)}, should be int | str") def _replace_port_int(self, value: int) -> int: known_port = self.port_map.get(value) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index c630ea98b4..4e1e8a884f 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -6,8 +6,9 @@ import json import os import re from dataclasses import dataclass +from enum import StrEnum from pathlib import Path -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import boto3 import toml @@ -20,7 +21,7 @@ from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" @@ -28,7 +29,7 @@ TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @enum.unique -class RemoteStorageUser(str, enum.Enum): +class RemoteStorageUser(StrEnum): """ Instead of using strings for the users, use a more strict enum. 
""" @@ -77,21 +78,19 @@ class MockS3Server: class LocalFsStorage: root: Path - def tenant_path(self, tenant_id: Union[TenantId, TenantShardId]) -> Path: + def tenant_path(self, tenant_id: TenantId | TenantShardId) -> Path: return self.root / "tenants" / str(tenant_id) - def timeline_path( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ) -> Path: + def timeline_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path: return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) def timeline_latest_generation( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ) -> Optional[int]: + self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId + ) -> int | None: timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id)) index_parts = [f for f in timeline_files if f.startswith("index_part")] - def parse_gen(filename: str) -> Optional[int]: + def parse_gen(filename: str) -> int | None: log.info(f"parsing index_part '{filename}'") parts = filename.split("-") if len(parts) == 2: @@ -104,9 +103,7 @@ class LocalFsStorage: raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}") return generations[-1] - def index_path( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ) -> Path: + def index_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path: latest_gen = self.timeline_latest_generation(tenant_id, timeline_id) if latest_gen is None: filename = TIMELINE_INDEX_PART_FILE_NAME @@ -120,7 +117,7 @@ class LocalFsStorage: tenant_id: TenantId, timeline_id: TimelineId, local_name: str, - generation: Optional[int] = None, + generation: int | None = None, ): if generation is None: generation = self.timeline_latest_generation(tenant_id, timeline_id) @@ -130,9 +127,7 @@ class LocalFsStorage: filename = f"{local_name}-{generation:08x}" return self.timeline_path(tenant_id, timeline_id) / filename - def index_content( - self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId - ) -> Any: + def index_content(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Any: with self.index_path(tenant_id, timeline_id).open("r") as f: return json.load(f) @@ -164,17 +159,17 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: Optional[str] - secret_key: Optional[str] - aws_profile: Optional[str] + access_key: str | None + secret_key: str | None + aws_profile: str | None prefix_in_bucket: str client: S3Client cleanup: bool """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool - endpoint: Optional[str] = None + endpoint: str | None = None """formatting deserialized with humantime crate, for example "1s".""" - custom_timeout: Optional[str] = None + custom_timeout: str | None = None def access_env_vars(self) -> dict[str, str]: if self.aws_profile is not None: @@ -272,12 +267,10 @@ class S3Storage: def tenants_path(self) -> str: return f"{self.prefix_in_bucket}/tenants" - def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str: + def tenant_path(self, tenant_id: TenantShardId | TenantId) -> str: return f"{self.tenants_path()}/{tenant_id}" - def timeline_path( - self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId - ) -> str: + def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" def get_latest_index_key(self, index_keys: list[str]) -> str: @@ 
-315,11 +308,11 @@ class S3Storage: assert self.real is False -RemoteStorage = Union[LocalFsStorage, S3Storage] +RemoteStorage = LocalFsStorage | S3Storage @enum.unique -class RemoteStorageKind(str, enum.Enum): +class RemoteStorageKind(StrEnum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" @@ -331,8 +324,8 @@ class RemoteStorageKind(str, enum.Enum): run_id: str, test_name: str, user: RemoteStorageUser, - bucket_name: Optional[str] = None, - bucket_region: Optional[str] = None, + bucket_name: str | None = None, + bucket_region: str | None = None, ) -> RemoteStorage: if self == RemoteStorageKind.LOCAL_FS: return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user)) diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 5d9a3bd149..094188c0b5 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -13,7 +13,7 @@ from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.utils import wait_until if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any # Walreceiver as returned by sk's timeline status endpoint. @@ -72,7 +72,7 @@ class TermBumpResponse: class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + def __init__(self, port: int, auth_token: str | None = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token @@ -98,7 +98,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): if not self.is_testing_enabled: pytest.skip("safekeeper was built without 'testing' feature") - def configure_failpoints(self, config_strings: Union[tuple[str, str], list[tuple[str, str]]]): + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]): self.is_testing_enabled_or_skip() if isinstance(config_strings, tuple): @@ -195,7 +195,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json - def debug_dump(self, params: Optional[dict[str, str]] = None) -> dict[str, Any]: + def debug_dump(self, params: dict[str, str] | None = None) -> dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) res.raise_for_status() @@ -204,7 +204,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def debug_dump_timeline( - self, timeline_id: TimelineId, params: Optional[dict[str, str]] = None + self, timeline_id: TimelineId, params: dict[str, str] | None = None ) -> Any: params = params or {} params["timeline_id"] = str(timeline_id) @@ -285,7 +285,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): self, tenant_id: TenantId, timeline_id: TimelineId, - term: Optional[int], + term: int | None, ) -> TermBumpResponse: body = {} if term is not None: diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py index c174358ef5..be95a98ff9 100644 --- a/test_runner/fixtures/storage_controller_proxy.py +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -13,14 +13,14 @@ from werkzeug.wrappers.response import Response from fixtures.log_helper import log if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any class StorageControllerProxy: def __init__(self, server: HTTPServer): self.server: HTTPServer = server self.listen: str = 
f"http://{server.host}:{server.port}" - self.routing_to: Optional[str] = None + self.routing_to: str | None = None def route_to(self, storage_controller_api: str): self.routing_to = storage_controller_api diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index bb45385ea6..04e98fe494 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -8,10 +8,10 @@ import subprocess import tarfile import threading import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from hashlib import sha256 from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar from urllib.parse import urlencode import allure @@ -29,7 +29,7 @@ from fixtures.pg_version import PgVersion if TYPE_CHECKING: from collections.abc import Iterable - from typing import IO, Optional + from typing import IO from fixtures.common_types import TimelineId from fixtures.neon_fixtures import PgBin @@ -57,6 +57,10 @@ VERSIONS_COMBINATIONS = ( ) # fmt: on +# If the environment variable USE_LFC is set and its value is "false", then LFC is disabled for tests. +# If it is not set or set to a value not equal to "false", LFC is enabled by default. +USE_LFC = os.environ.get("USE_LFC") != "false" + def subprocess_capture( capture_dir: Path, @@ -66,10 +70,10 @@ def subprocess_capture( echo_stderr: bool = False, echo_stdout: bool = False, capture_stdout: bool = False, - timeout: Optional[float] = None, + timeout: float | None = None, with_command_header: bool = True, **popen_kwargs: Any, -) -> tuple[str, Optional[str], int]: +) -> tuple[str, str | None, int]: """Run a process and bifurcate its output to files and the `log` logger stderr and stdout are always captured in files. They are also optionally @@ -536,7 +540,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str """ started_at = time.time() - def hash_extracted(reader: Optional[IO[bytes]]) -> bytes: + def hash_extracted(reader: IO[bytes] | None) -> bytes: assert reader is not None digest = sha256(usedforsecurity=False) while True: @@ -563,7 +567,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str mismatching: set[str] = set() - for left_tuple, right_tuple in zip(left_list, right_list): + for left_tuple, right_tuple in zip(left_list, right_list, strict=False): left_path, left_hash = left_tuple right_path, right_hash = right_tuple assert ( @@ -595,7 +599,7 @@ class PropagatingThread(threading.Thread): self.exc = e @override - def join(self, timeout: Optional[float] = None) -> Any: + def join(self, timeout: float | None = None) -> Any: super().join(timeout) if self.exc: raise self.exc @@ -653,6 +657,23 @@ def allpairs_versions(): return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} +def size_to_bytes(hr_size: str) -> int: + """ + Gets human-readable size from postgresql.conf (e.g. 
512kB, 10MB) + returns size in bytes + """ + units = {"B": 1, "kB": 1024, "MB": 1024**2, "GB": 1024**3, "TB": 1024**4, "PB": 1024**5} + match = re.search(r"^\'?(\d+)\s*([kMGTP]?B)?\'?$", hr_size) + assert match is not None, f'"{hr_size}" is not a well-formatted human-readable size' + number, unit = match.groups() + + if unit: + amp = units[unit] + else: + amp = 8192 + return int(number) * amp + + def skip_on_postgres(version: PgVersion, reason: str): return pytest.mark.skipif( PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, @@ -674,6 +695,13 @@ def run_only_on_default_postgres(reason: str): ) +def run_only_on_postgres(versions: Iterable[PgVersion], reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) not in versions, + reason=reason, + ) + + def skip_in_debug_build(reason: str): return pytest.mark.skipif( os.getenv("BUILD_TYPE", "debug") == "debug", diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index e869c43185..1b8c9fef44 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -15,7 +15,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex # to ensure we don't do that: this enables running lots of Workloads in parallel safely. @@ -36,8 +36,8 @@ class Workload: env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId, - branch_name: Optional[str] = None, - endpoint_opts: Optional[dict[str, Any]] = None, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, ): self.env = env self.tenant_id = tenant_id @@ -50,10 +50,10 @@ class Workload: self.expect_rows = 0 self.churn_cursor = 0 - self._endpoint: Optional[Endpoint] = None + self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} - def reconfigure(self): + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller """ @@ -61,7 +61,7 @@ class Workload: with ENDPOINT_LOCK: self._endpoint.reconfigure() - def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: + def endpoint(self, pageserver_id: int | None = None) -> Endpoint: # We may be running alongside other Workloads for different tenants. Full TTID is # obnoxiously long for use here, but a cut-down version is still unique enough for tests. 
endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" @@ -94,16 +94,17 @@ class Workload: def __del__(self): self.stop() - def init(self, pageserver_id: Optional[int] = None): + def init(self, pageserver_id: int | None = None, allow_recreate=False): endpoint = self.endpoint(pageserver_id) - + if allow_recreate: + endpoint.safe_psql(f"DROP TABLE IF EXISTS {self.table};") endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") last_flush_lsn_upload( self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n: int, pageserver_id: Optional[int] = None, upload: bool = True): + def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -125,7 +126,7 @@ class Workload: return False def churn_rows( - self, n: int, pageserver_id: Optional[int] = None, upload: bool = True, ingest: bool = True + self, n: int, pageserver_id: int | None = None, upload: bool = True, ingest: bool = True ): assert self.expect_rows >= n @@ -190,9 +191,9 @@ class Workload: else: log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") - def validate(self, pageserver_id: Optional[int] = None): + def validate(self, pageserver_id: int | None = None): endpoint = self.endpoint(pageserver_id) - endpoint.clear_shared_buffers() + endpoint.clear_buffers() result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 227319c425..bcc3db69f0 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -16,7 +16,8 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_until_all_tenants_state if TYPE_CHECKING: - from typing import Any, Callable, Optional + from collections.abc import Callable + from typing import Any def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): @@ -46,7 +47,7 @@ def setup_pageserver_with_tenants( name: str, n_tenants: int, setup: Callable[[NeonEnv], tuple[TenantId, TimelineId, dict[str, Any]]], - timeout_in_seconds: Optional[int] = None, + timeout_in_seconds: int | None = None, ) -> NeonEnv: """ Utility function to set up a pageserver with a given number of identical tenants. diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 36090dcad7..680eb62b39 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -56,7 +56,7 @@ def test_bulk_insert(neon_with_baseline: PgCompare): def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() - pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"]) + pg_version = PgVersion(str(client.timeline_detail(env.tenant, env.timeline)["pg_version"])) # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. 
diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index d571fab6b5..0e56fdc96f 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,7 +2,7 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import Optional, final +from typing import final from fixtures.compare_fixtures import PgCompare from typing_extensions import override @@ -13,7 +13,7 @@ class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows self.rownum = 0 - self.linebuf: Optional[bytes] = None + self.linebuf: bytes | None = None self.ptr = 0 @override diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index d56f6dce09..38b04b9114 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -18,7 +18,7 @@ from fixtures.neon_api import connection_parameters_to_env from fixtures.pg_version import PgVersion if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonAPI @@ -247,7 +247,7 @@ def test_replication_start_stop( ], env=master_env, ) - replica_pgbench: list[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + replica_pgbench: list[subprocess.Popen[Any] | None] = [None] * num_replicas # Use the bits of iconfig to tell us which configuration we are on. For example # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index dc051483f8..142bd3d669 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,7 +4,7 @@ import concurrent.futures import random import time from collections import defaultdict -from enum import Enum +from enum import StrEnum import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId @@ -139,7 +139,7 @@ def test_storage_controller_many_tenants( tenant_timelines_count = 100 # These lists are maintained for use with rng.choice - tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_with_timelines = list(rng.sample(list(tenants.keys()), tenant_timelines_count)) tenants_without_timelines = list( tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines ) @@ -171,7 +171,7 @@ def test_storage_controller_many_tenants( # start timing on test nodes if we aren't a bit careful. 
create_concurrency = 16 - class Operation(str, Enum): + class Operation(StrEnum): TIMELINE_OPS = "timeline_ops" SHARD_MIGRATE = "shard_migrate" TENANT_PASSTHROUGH = "tenant_passthrough" diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 576a4f0467..c6d795ce4d 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -17,7 +17,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix if TYPE_CHECKING: - from typing import Any, Callable + from collections.abc import Callable + from typing import Any @pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7d19ba3b5d..5744c445f6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,7 +2,6 @@ from __future__ import annotations from collections.abc import Generator from dataclasses import dataclass -from typing import Optional import pytest from fixtures.common_types import TenantId @@ -105,7 +104,7 @@ def test_null_config(negative_env: NegativeTests): @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: str | None): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py index 57d5b2d8b3..2db16d9f64 100644 --- a/test_runner/regress/test_combocid.py +++ b/test_runner/regress/test_combocid.py @@ -5,12 +5,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start( - "main", - config_lines=[ - "shared_buffers='1MB'", - ], - ) + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -36,7 +31,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): # Clear the cache, so that we exercise reconstructing the pages # from WAL - endpoint.clear_shared_buffers() + endpoint.clear_buffers() # Check that the cursor opened earlier still works. If the # combocids are not restored correctly, it won't. @@ -65,12 +60,7 @@ def test_combocid_lock(neon_env_builder: NeonEnvBuilder): def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start( - "main", - config_lines=[ - "shared_buffers='1MB'", - ], - ) + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -98,7 +88,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): cur.execute("delete from t") # Clear the cache, so that we exercise reconstructing the pages # from WAL - endpoint.clear_shared_buffers() + endpoint.clear_buffers() # Check that the cursor opened earlier still works. If the # combocids are not restored correctly, it won't. 
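Review note: the combocid tests above (and workload.py and test_hot_standby.py elsewhere in this diff) switch from endpoint.clear_shared_buffers() to endpoint.clear_buffers() and drop the tiny shared_buffers='1MB' overrides. The apparent intent is that a single helper now clears whatever caches are in play, including the local file cache when USE_LFC is enabled; the helper itself lives in neon_fixtures.py, which is outside this excerpt, so its implementation is not shown here. The test-side pattern stays the same:

    # Illustrative only: write, drop cached pages, then re-read so the pages are
    # reconstructed through the pageserver instead of being served from cache.
    cur.execute("insert into t values (1, 'x')")
    endpoint.clear_buffers()  # previously endpoint.clear_shared_buffers()
    cur.execute("select count(*) from t")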
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 48950a5a50..f71e05924a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,9 +1,8 @@ from __future__ import annotations -import enum import json import time -from typing import TYPE_CHECKING +from enum import StrEnum import pytest from fixtures.log_helper import log @@ -15,10 +14,6 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - AGGRESIVE_COMPACTION_TENANT_CONF = { # Disable gc and compaction. The test runs compaction manually. "gc_period": "0s", @@ -172,7 +167,7 @@ LARGE_STRIPES = 32768 def test_sharding_compaction( neon_env_builder: NeonEnvBuilder, stripe_size: int, - shard_count: Optional[int], + shard_count: int | None, gc_compaction: bool, ): """ @@ -277,7 +272,7 @@ def test_sharding_compaction( ) -class CompactionAlgorithm(str, enum.Enum): +class CompactionAlgorithm(StrEnum): LEGACY = "legacy" TIERED = "tiered" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 96ba3dd5a4..ba7305148f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,7 +7,6 @@ import subprocess import tempfile from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING import fixtures.utils import pytest @@ -28,10 +27,6 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. # - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. @@ -385,7 +380,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r def dump_differs( - first: Path, second: Path, output: Path, allowed_diffs: Optional[list[str]] = None + first: Path, second: Path, output: Path, allowed_diffs: list[str] | None = None ) -> bool: """ Runs diff(1) command on two SQL dumps and write the output to the given output file. 
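Review note: the CompactionAlgorithm change above is one of several enums in this diff moving from class X(str, enum.Enum) to enum.StrEnum (RemoteStorageUser, RemoteStorageKind, EvictionOrder, SqlExporterProcess, Operation, PgVersion). The motivation is the Python 3.11 change to how mixed-in enums format, which the old PgVersion worked around with an explicit __str__ override that this diff removes. A small sketch of the difference:

    from enum import Enum, StrEnum

    class Old(str, Enum):
        LEGACY = "legacy"

    class New(StrEnum):
        LEGACY = "legacy"

    f"{Old.LEGACY}"  # "Old.LEGACY" on Python >= 3.11 (it was "legacy" before 3.11)
    f"{New.LEGACY}"  # "legacy" -- StrEnum keeps value semantics for str() and format()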
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index c5e3034591..1b15c5f15e 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,6 +3,7 @@ from __future__ import annotations import enum import os import shutil +from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING, cast @@ -16,7 +17,7 @@ from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR if TYPE_CHECKING: from types import TracebackType - from typing import Optional, TypedDict, Union + from typing import Self, TypedDict from fixtures.neon_fixtures import NeonEnv from fixtures.pg_version import PgVersion @@ -26,15 +27,15 @@ if TYPE_CHECKING: metric_name: str type: str help: str - key_labels: Optional[list[str]] - values: Optional[list[str]] - query: Optional[str] - query_ref: Optional[str] + key_labels: list[str] | None + values: list[str] | None + query: str | None + query_ref: str | None class Collector(TypedDict): collector_name: str metrics: list[Metric] - queries: Optional[list[Query]] + queries: list[Query] | None class Query(TypedDict): query_name: str @@ -53,12 +54,12 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: if not rel: raise RuntimeError("Empty filename") - full_path: Optional[str] = None + full_path: str | None = None if os.path.isabs(rel): full_path = rel else: for p in (dir, *JSONNET_PATH): - assert isinstance(p, (str, Path)), "for mypy" + assert isinstance(p, str | Path), "for mypy" full_path = os.path.join(p, rel) assert isinstance(full_path, str), "for mypy" @@ -82,9 +83,9 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: def jsonnet_evaluate_file( - jsonnet_file: Union[str, Path], - ext_vars: Optional[Union[str, dict[str, str]]] = None, - tla_vars: Optional[Union[str, dict[str, str]]] = None, + jsonnet_file: str | Path, + ext_vars: str | dict[str, str] | None = None, + tla_vars: str | dict[str, str] | None = None, ) -> str: return cast( "str", @@ -102,7 +103,7 @@ def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str: def evaluate_config( - jsonnet_file: Path, collector_name: str, collector_file: Union[str, Path], connstr: str + jsonnet_file: Path, collector_name: str, collector_file: str | Path, connstr: str ) -> str: return jsonnet_evaluate_file( jsonnet_file, @@ -115,7 +116,7 @@ def evaluate_config( @enum.unique -class SqlExporterProcess(str, enum.Enum): +class SqlExporterProcess(StrEnum): COMPUTE = "compute" AUTOSCALING = "autoscaling" @@ -184,16 +185,16 @@ class SqlExporterRunner: def stop(self) -> None: raise NotImplementedError() - def __enter__(self) -> SqlExporterRunner: + def __enter__(self) -> Self: self.start() return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.stop() @@ -241,8 +242,7 @@ if SQL_EXPORTER is None: self.with_volume_mapping(str(config_file), container_config_file, "z") self.with_volume_mapping(str(collector_file), container_collector_file, "z") - @override - def start(self) -> SqlExporterContainer: + def start(self) -> Self: super().start() log.info("Waiting for sql_exporter to be ready") diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index e517e83e6f..1c5554c379 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ 
b/test_runner/regress/test_ddl_forwarding.py @@ -13,7 +13,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any, Self def handle_db(dbs, roles, operation): @@ -91,15 +91,15 @@ class DdlForwardingContext: lambda request: ddl_forward_handler(request, self.dbs, self.roles, self) ) - def __enter__(self): + def __enter__(self) -> Self: self.pg.start() return self def __exit__( self, - exc_type: Optional[type[BaseException]], - exc: Optional[BaseException], - tb: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, ): self.pg.stop() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index c8d3b2ff3e..1807511008 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -5,6 +5,7 @@ import time from collections import Counter from collections.abc import Iterable from dataclasses import dataclass +from enum import StrEnum from typing import TYPE_CHECKING import pytest @@ -80,7 +81,7 @@ def test_min_resident_size_override_handling( @enum.unique -class EvictionOrder(str, enum.Enum): +class EvictionOrder(StrEnum): RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 2128bd93dd..382556fd7e 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -2,10 +2,13 @@ from __future__ import annotations from pathlib import Path +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): env = neon_simple_env @@ -16,8 +19,6 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers='1MB'", - f"neon.file_cache_path='{cache_dir}/file.cache'", "neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'", ], diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index a906e7a243..0b1ac11c16 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -170,7 +170,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): # re-execute the query, it will make GetPage # requests. This does not clear the last-written LSN cache # so we still remember the LSNs of the pages. 
- secondary.clear_shared_buffers(cursor=s_cur) + secondary.clear_buffers(cursor=s_cur) if pause_apply: s_cur.execute("SELECT pg_wal_replay_pause()") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py new file mode 100644 index 0000000000..29229b73c1 --- /dev/null +++ b/test_runner/regress/test_import_pgdata.py @@ -0,0 +1,307 @@ +import json +import re +import time +from enum import Enum + +import psycopg2 +import psycopg2.errors +import pytest +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres +from fixtures.pageserver.http import ( + ImportPgdataIdemptencyKey, + PageserverApiException, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import run_only_on_postgres +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +num_rows = 1000 + + +class RelBlockSize(Enum): + ONE_STRIPE_SIZE = 1 + TWO_STRPES_PER_SHARD = 2 + MULTIPLE_RELATION_SEGMENTS = 3 + + +smoke_params = [ + # unsharded (the stripe size needs to be given for rel block size calculations) + *[(None, 1024, s) for s in RelBlockSize], + # many shards, small stripe size to speed up test + *[(8, 1024, s) for s in RelBlockSize], +] + + +@run_only_on_postgres( + [PgVersion.V14, PgVersion.V15, PgVersion.V16], + "newer control file catalog version and struct format isn't supported", +) +@pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params) +def test_pgdata_import_smoke( + vanilla_pg: VanillaPostgres, + neon_env_builder: NeonEnvBuilder, + shard_count: int | None, + stripe_size: int, + rel_block_size: RelBlockSize, + make_httpserver: HTTPServer, +): + # + # Setup fake control plane for import progress + # + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" + } + ) + env.pageserver.stop() + env.pageserver.start() + + # + # Put data in vanilla pg + # + + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + + log.info("create relblock data") + if rel_block_size == RelBlockSize.ONE_STRIPE_SIZE: + target_relblock_size = stripe_size * 8192 + elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: + target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 + elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + else: + raise ValueError + + # fillfactor so we don't need to produce that much data + # 900 byte per row is > 10% => 1 row per page + vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""") + + nrows = 0 + while True: + relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") + log.info( + f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + ) + 
if relblock_size >= target_relblock_size: + break + addrows = int((target_relblock_size - relblock_size) // 8192) + assert addrows >= 1, "forward progress" + vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + nrows += addrows + expect_nrows = nrows + expect_sum = ( + (nrows) * (nrows + 1) // 2 + ) # https://stackoverflow.com/questions/43901484/sum-of-the-integers-from-1-to-n + + def validate_vanilla_equivalence(ep): + # TODO: would be nicer to just compare pgdump + assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ + (expect_nrows, expect_sum) + ] + + validate_vanilla_equivalence(vanilla_pg) + + vanilla_pg.stop() + + # + # We have a Postgres data directory now. + # Make a localfs remote storage that looks like how after `fast_import` ran. + # TODO: actually exercise fast_import here + # TODO: test s3 remote storage + # + importbucket = neon_env_builder.repo_dir / "importbucket" + importbucket.mkdir() + # what cplane writes before scheduling fast_import + specpath = importbucket / "spec.json" + specpath.write_text(json.dumps({"branch_id": "somebranch", "project_id": "someproject"})) + # what fast_import writes + vanilla_pg.pgdatadir.rename(importbucket / "pgdata") + statusdir = importbucket / "status" + statusdir.mkdir() + (statusdir / "pgdata").write_text(json.dumps({"done": True})) + + # + # Do the import + # + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=shard_count, shard_stripe_size=stripe_size + ) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket.absolute())}}, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + continue + else: + raise + + shard_status_file = statusdir / f"shard-{shard_id.shard_index}" + if state == "Active": + shard_status_file_contents = ( + shard_status_file.read_text() + ) # Active state implies import is done + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(1) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + # + # Get some timeline details for later. 
+ # + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_http = shard_zero_ps.http_client() + shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) + disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) + _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) + remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) + # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` + assert remote_consistent_lsn_visible == disk_consistent_lsn + assert initdb_lsn == latest_gc_cutoff_lsn + assert disk_consistent_lsn == initdb_lsn + 8 + assert last_record_lsn == disk_consistent_lsn + # TODO: assert these values are the same everywhere + + # + # Validate the resulting remote storage state. + # + + # + # Validate the imported data + # + + ro_endpoint = env.endpoints.create_start( + branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + ) + + validate_vanilla_equivalence(ro_endpoint) + + # ensure the import survives restarts + ro_endpoint.stop() + env.pageserver.stop(immediate=True) + env.pageserver.start() + ro_endpoint.start() + validate_vanilla_equivalence(ro_endpoint) + + # + # validate the layer files in each shard only have the shard-specific data + # (the implementation would be functional but not efficient without this characteristic) + # + + shards = env.storage_controller.locate(tenant_id) + for shard in shards: + shard_ps = env.get_pageserver(shard["node_id"]) + result = shard_ps.timeline_scan_no_disposable_keys(shard["shard_id"], timeline_id) + assert result.tally.disposable_count == 0 + assert ( + result.tally.not_disposable_count > 0 + ), "sanity check, each shard should have some data" + + # + # validate that we can write + # + rw_endpoint = env.endpoints.create_start( + branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + ) + rw_endpoint.safe_psql("create table othertable(values text)") + rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + + # TODO: consider using `class Workload` here + # to do compaction and whatnot? + + # + # validate that we can branch (important use case) + # + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable") + + # ... 
at the initdb lsn + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable") diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 2916748925..9c9bc5b519 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -2,16 +2,12 @@ from __future__ import annotations from collections.abc import Iterable from dataclasses import dataclass -from typing import TYPE_CHECKING from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo from fixtures.utils import human_bytes, skip_in_debug_build -if TYPE_CHECKING: - from typing import Union - @skip_in_debug_build("debug run is unnecessarily slow") def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @@ -109,14 +105,12 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): @dataclass class Histogram: - buckets: list[Union[int, float]] + buckets: list[int | float] counts: list[int] sums: list[int] -def histogram_historic_layers( - infos: LayerMapInfo, minimum_sizes: list[Union[int, float]] -) -> Histogram: +def histogram_historic_layers(infos: LayerMapInfo, minimum_sizes: list[int | float]) -> Histogram: def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: log.info( f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" @@ -128,7 +122,7 @@ def histogram_historic_layers( return histogram(sizes, minimum_sizes) -def histogram(sizes: Iterable[int], minimum_sizes: list[Union[int, float]]) -> Histogram: +def histogram(sizes: Iterable[int], minimum_sizes: list[int | float]) -> Histogram: assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) buckets = list(enumerate(minimum_sizes)) counts = [0 for _ in buckets] diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 54ce7c8340..04ccec5875 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -99,11 +99,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv): res = client.metrics() info("Metrics: %s", res) m = parse_metrics(res) - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 2 - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 @@ -116,7 +120,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): try: res = client.metrics() timeout = -1 - if len(parse_metrics(res).query_all("installed_extensions")) < 4: + if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: 
# Assume that not all metrics that are collected yet time.sleep(1) timeout -= 1 @@ -128,17 +132,21 @@ def test_installed_extensions(neon_simple_env: NeonEnv): continue assert ( - len(parse_metrics(res).query_all("installed_extensions")) >= 4 + len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 ), "Not all metrics are collected" info("After restart metrics: %s", res) m = parse_metrics(res) - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 - neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + neon_m = m.query_all( + "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 309e0f3015..761ec7568f 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -2,6 +2,7 @@ from __future__ import annotations import time +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver @@ -19,7 +20,11 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until -def test_issue_5878(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "attach_mode", + ["default_generation", "same_generation"], +) +def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): """ Regression test for issue https://github.com/neondatabase/neon/issues/5878 . 
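Stepping back to `test_pgdata_import_smoke` a few hunks above: it fills table `t` with the integers 1..nrows (inserted in `generate_series` batches) and later asserts `sum(data::bigint)` against the closed form n(n+1)/2. A standalone check of that bookkeeping; `expected_sum` here is an illustrative helper, not code from the patch:

def expected_sum(nrows: int) -> int:
    # Sum of 1..nrows, which is what the test expects from
    # `select sum(data::bigint) from t` once the insert loop finishes.
    return nrows * (nrows + 1) // 2


assert expected_sum(1000) == sum(range(1, 1001)) == 500500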
@@ -168,11 +173,32 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() env.pageserver.tenant_detach(tenant_id) - failpoint_name = "before-delete-layer-pausable" + failpoint_deletion_queue = "deletion-queue-before-execute-pause" - ps_http.configure_failpoints((failpoint_name, "pause")) - env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) - generation_after_reattach = get_generation_number() + ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) + + if attach_mode == "default_generation": + env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides) + elif attach_mode == "same_generation": + # Attach with the same generation number -- this is possible with timeline offload and detach ancestor + env.pageserver.tenant_attach( + tenant_id, + tenant_conf.tenant_specific_overrides, + generation=generation_before_detach, + # We want to avoid the generation bump and don't want to talk with the storcon + override_storage_controller_generation=False, + ) + else: + raise AssertionError(f"Unknown attach_mode: {attach_mode}") + + # Get it from pageserver API instead of storcon API b/c we might not have attached using the storcon + # API if attach_mode == "same_generation" + tenant_location = env.pageserver.http_client().tenant_get_location(tenant_id) + generation_after_reattach = tenant_location["generation"] + + if attach_mode == "same_generation": + # The generation number should be the same as before the detach + assert generation_before_detach == generation_after_reattach wait_until_tenant_active(ps_http, tenant_id) # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue. @@ -182,15 +208,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): wait_until(10, 0.5, future_layer_is_gone_from_index_part) - # NB: the layer file is unlinked index part now, but, because we made the delete - # operation stuck, the layer file itself is still in the remote_storage - wait_until( - 10, - 0.5, - lambda: env.pageserver.assert_log_contains( - f".*{tenant_id}.*at failpoint.*{failpoint_name}" - ), - ) + # We already make deletion stuck here, but we don't necessarily hit the failpoint + # because deletions are batched. future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) @@ -224,11 +243,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): break time.sleep(1) - # Window has passed, unstuck the delete, let upload queue drain. + # Window has passed, unstuck the delete, let deletion queue drain; the upload queue should + # have drained because we put these layer deletion operations into the deletion queue and + # have consumed the operation from the upload queue. log.info("unstuck the DELETE") - ps_http.configure_failpoints(("before-delete-layer-pausable", "off")) - + ps_http.configure_failpoints((failpoint_deletion_queue, "off")) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + env.pageserver.http_client().deletion_queue_flush(True) # Examine the resulting S3 state. log.info("integrity-check the remote storage") @@ -247,3 +268,12 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): final_stat = future_layer_path.stat() log.info(f"future layer path: {future_layer_path}") assert final_stat.st_mtime != pre_stat.st_mtime + + # Ensure no weird errors in the end... 
+ wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + + if attach_mode == "same_generation": + # we should have detected a race upload and deferred it + env.pageserver.assert_log_contains( + "waiting for deletion queue flush to complete before uploading layer" + ) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 3083128d87..377b0fb4d4 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import random import re import subprocess @@ -10,20 +9,24 @@ import time import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): """ Test resizing the Local File Cache """ env = neon_simple_env + cache_dir = env.repo_dir / "file_cache" + cache_dir.mkdir(exist_ok=True) + env.create_branch("test_lfc_resize") endpoint = env.endpoints.create_start( "main", config_lines=[ - "neon.file_cache_path='file.cache'", - "neon.max_file_cache_size=512MB", - "neon.file_cache_size_limit=512MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", ], ) n_resize = 10 @@ -63,8 +66,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): cur.execute("select pg_reload_conf()") nretries = 10 while True: - lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" - lfc_file_size = os.path.getsize(lfc_file_path) + lfc_file_path = endpoint.lfc_path() + lfc_file_size = lfc_file_path.stat().st_size res = subprocess.run( ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True ) diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 36dfec969f..17068849d4 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -3,11 +3,13 @@ from __future__ import annotations import time from pathlib import Path +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import query_scalar +from fixtures.utils import USE_LFC, query_scalar +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): env = neon_simple_env @@ -18,8 +20,6 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers='1MB'", - f"neon.file_cache_path='{cache_dir}/file.cache'", "neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'", ], @@ -72,9 +72,10 @@ WITH (fillfactor='100'); # verify working set size after some index access of a few select pages only blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") - assert blocks < 10 + assert blocks < 12 +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index fbf018a167..94c630ffcf 100644 --- a/test_runner/regress/test_local_file_cache.py +++ 
b/test_runner/regress/test_local_file_cache.py @@ -6,10 +6,12 @@ import random import threading import time +import pytest from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.utils import query_scalar +from fixtures.utils import USE_LFC, query_scalar +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() @@ -19,8 +21,6 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start( "main", config_lines=[ - "shared_buffers='1MB'", - f"neon.file_cache_path='{cache_dir}/file.cache'", "neon.max_file_cache_size='64MB'", "neon.file_cache_size_limit='10MB'", ], diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index df83ca1c44..ba471b7147 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -12,7 +12,7 @@ from fixtures.neon_fixtures import ( logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import wait_until +from fixtures.utils import USE_LFC, wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import ( @@ -576,7 +576,15 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van # We want all data to fit into shared_buffers because later we stop # safekeeper and insert more; this shouldn't cause page requests as they # will be stuck. - sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"]) + sub = env.endpoints.create( + "subscriber", + config_lines=[ + "neon.max_file_cache_size = 32MB", + "neon.file_cache_size_limit = 32MB", + ] + if USE_LFC + else [], + ) sub.start() with vanilla_pg.cursor() as pcur: diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 8b41d0cb1c..7f0b541128 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -3,7 +3,7 @@ from __future__ import annotations import re import time from concurrent.futures import ThreadPoolExecutor -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta import pytest from fixtures.common_types import Lsn @@ -207,7 +207,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): for i in range(1000): cur.execute("INSERT INTO foo VALUES(%s)", (i,)) # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) + after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=UTC) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) time.sleep(0.02) @@ -273,11 +273,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): ) log.info("result: %s, after_ts: %s", result, after_timestamp) - # TODO use fromisoformat once we have Python 3.11+ - # which has https://github.com/python/cpython/pull/92177 - timestamp = datetime.strptime(result, "%Y-%m-%dT%H:%M:%S.%f000Z").replace( - tzinfo=timezone.utc - ) + timestamp = datetime.fromisoformat(result).replace(tzinfo=UTC) assert timestamp < after_timestamp, "after_timestamp after timestamp" if i > 1: before_timestamp = tbl[i - step_size][1] diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py index f69c1112c7..e2bde8be6f 100644 --- a/test_runner/regress/test_oid_overflow.py +++ b/test_runner/regress/test_oid_overflow.py @@ 
-39,7 +39,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder): oid = cur.fetchall()[0][0] log.info(f"t2.relfilenode={oid}") - endpoint.clear_shared_buffers(cursor=cur) + endpoint.clear_buffers(cursor=cur) cur.execute("SELECT x from t1") assert cur.fetchone() == (1,) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py index 5eaba78331..f0f12290cc 100644 --- a/test_runner/regress/test_ondemand_slru_download.py +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -13,7 +11,7 @@ from fixtures.utils import query_scalar # Test on-demand download of the pg_xact SLRUs # @pytest.mark.parametrize("shard_count", [None, 4]) -def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -79,7 +77,7 @@ def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count @pytest.mark.parametrize("shard_count", [None, 4]) -def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index d1b70b9ee6..05e81b82e0 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, @@ -82,7 +80,7 @@ def expect_updated_msg_lsn( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, - prev_msg_lsn: Optional[Lsn], + prev_msg_lsn: Lsn | None, ) -> Lsn: timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4f59efb8b3..6ba5753420 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -11,11 +11,10 @@ of the pageserver are: from __future__ import annotations -import enum import os import re import time -from typing import TYPE_CHECKING +from enum import StrEnum import pytest from fixtures.common_types import TenantId, TimelineId @@ -41,10 +40,6 @@ from fixtures.remote_storage import ( from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload -if TYPE_CHECKING: - from typing import Optional - - # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. 
TENANT_CONF = { @@ -65,7 +60,7 @@ TENANT_CONF = { def read_all( - env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None + env: NeonEnv, tenant_id: TenantId | None = None, timeline_id: TimelineId | None = None ): if tenant_id is None: tenant_id = env.initial_tenant @@ -286,12 +281,12 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert get_deletion_queue_unexpected_errors(ps_http) == 0 -class KeepAttachment(str, enum.Enum): +class KeepAttachment(StrEnum): KEEP = "keep" LOSE = "lose" -class ValidateBefore(str, enum.Enum): +class ValidateBefore(StrEnum): VALIDATE = "validate" NO_VALIDATE = "no-validate" @@ -464,7 +459,11 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env.pageserver.start() # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) + # Because it is in emergency mode, it will not attempt to validate deletions required by the initial barrier, and therefore + # other files cannot be uploaded b/c it's waiting for the initial barrier to be validated. + generate_uploads_and_deletions( + env, init=False, pageserver=env.pageserver, wait_until_uploaded=False + ) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 200a323a3a..f6a7bfa1ad 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio import time -from typing import TYPE_CHECKING import psutil import pytest @@ -17,17 +16,13 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.utils import skip_in_debug_build, wait_until -if TYPE_CHECKING: - from typing import Optional - - TIMELINE_COUNT = 10 ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None + env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None ) -> Lsn: if offset is None: offset = 0 @@ -136,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -144,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. 
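The `# type: ignore` comments dropped from the `wait_until(10, 1, lambda: ...)` calls in the surrounding hunks, together with a call site just below that keeps the return value (`dirty_after_write = wait_until(...)`), suggest the fixture helper is now typed generically over the callable's result. A minimal stand-in with that shape, written under the assumption that `wait_until(number, interval, func)` simply retries `func` until it stops raising; the real implementation lives in `fixtures.utils` and may differ:

import time
from collections.abc import Callable
from typing import TypeVar

T = TypeVar("T")


def wait_until(number: int, interval: float, func: Callable[[], T]) -> T:
    # Call func up to `number` times, sleeping `interval` seconds between
    # attempts, and return its result as soon as it stops raising.
    last_exc: Exception | None = None
    for _ in range(number):
        try:
            return func()
        except Exception as e:
            last_exc = e
            time.sleep(interval)
    raise RuntimeError(f"timed out after {number} attempts") from last_exc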
@@ -185,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -198,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. @@ -215,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -317,4 +312,4 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index fb6050689c..4bf5705517 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -2,7 +2,6 @@ from __future__ import annotations import random from contextlib import closing -from typing import Optional import pytest from fixtures.log_helper import log @@ -156,7 +155,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.timeout(540) @pytest.mark.parametrize("shard_count", [None, 4]) @skip_in_debug_build("times out in debug builds") -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | None): # same rationale as with the immediate stop; we might leave orphan layers behind. 
neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 12134048e6..a264f4d3c9 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -23,7 +23,7 @@ from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response if TYPE_CHECKING: - from typing import Any, Optional, Union + from typing import Any # A tenant configuration that is convenient for generating uploads and deletions @@ -199,7 +199,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) else: - secondary_conf: Optional[dict[str, Any]] = None + secondary_conf: dict[str, Any] | None = None if mode == "Secondary": secondary_conf = {"warm": rng.choice([True, False])} @@ -469,7 +469,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): def list_elegible_layers( - pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + pageserver, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId ) -> list[Path]: """ The subset of layer filenames that are elegible for secondary download: at time of writing this @@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6a5e388c53..2877f14e0e 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -21,8 +21,6 @@ from fixtures.remote_storage import s3_storage from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: - from typing import Optional - from fixtures.neon_fixtures import PgBin from pytest import CaptureFixture @@ -48,7 +46,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End data properly. """ - ignored_files: Optional[list[str]] = None + ignored_files: list[str] | None = None # Neon handles unlogged relations in a special manner. During a # basebackup, we ship the init fork as the main fork. 
This presents a @@ -131,7 +129,7 @@ def test_pg_regress( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "regression" @@ -205,7 +203,7 @@ def test_isolation( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "isolation_regression" @@ -274,7 +272,7 @@ def test_sql_regress( capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, - shard_count: Optional[int], + shard_count: int | None, ): DBNAME = "regression" diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index e59d46e352..5a01d90d85 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -13,7 +13,7 @@ import requests from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres if TYPE_CHECKING: - from typing import Any, Optional + from typing import Any GET_CONNECTION_PID_QUERY = "SELECT pid FROM pg_stat_activity WHERE state = 'active'" @@ -228,7 +228,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): def test_sql_over_http(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") - def q(sql: str, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, params: list[Any] | None = None) -> Any: params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( @@ -291,7 +291,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): ) ) - def q(sql: str, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, params: list[Any] | None = None) -> Any: params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" response = requests.post( @@ -310,7 +310,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") - def q(sql: str, raw_text: bool, array_mode: bool, params: Optional[list[Any]] = None) -> Any: + def q(sql: str, raw_text: bool, array_mode: bool, params: list[Any] | None = None) -> Any: params = params or [] connstr = ( f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" @@ -346,7 +346,7 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") def qq( - queries: list[tuple[str, Optional[list[Any]]]], + queries: list[tuple[str, list[Any] | None]], read_only: bool = False, deferrable: bool = False, ) -> Any: diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 471a3b406a..70a7a675df 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -54,7 +54,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - endpoint.clear_shared_buffers(cursor=c) + endpoint.clear_buffers(cursor=c) cache_entries = query_scalar( c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 826136d5f9..70d558ac5a 100644 --- a/test_runner/regress/test_readonly_node.py +++ 
b/test_runner/regress/test_readonly_node.py @@ -1,7 +1,6 @@ from __future__ import annotations import time -from typing import Union import pytest from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId @@ -122,7 +121,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754") def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): """ Test static endpoint is protected from GC by acquiring and renewing lsn leases. @@ -175,7 +173,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): def get_layers_protected_by_lease( ps_http: PageserverHttpClient, - tenant_id: Union[TenantId, TenantShardId], + tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lease_lsn: Lsn, ) -> set[str]: @@ -232,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): return offset # Insert some records on main branch - with env.endpoints.create_start("main") as ep_main: + with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main: with ep_main.cursor() as cur: cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") lsn = Lsn(0) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 79b5ebe39a..137e75f784 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -5,7 +5,6 @@ import queue import shutil import threading import time -from typing import TYPE_CHECKING import pytest from fixtures.common_types import Lsn, TenantId, TimelineId @@ -37,9 +36,6 @@ from fixtures.utils import ( ) from requests import ReadTimeout -if TYPE_CHECKING: - from typing import Optional - # # Tests that a piece of data is backed up and restored correctly: @@ -452,7 +448,7 @@ def test_remote_timeline_client_calls_started_metric( for (file_kind, op_kind), observations in calls_started.items(): log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}") assert all( - x < y for x, y in zip(observations, observations[1:]) + x < y for x, y in zip(observations, observations[1:], strict=False) ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): @@ -731,7 +727,7 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # sleep a bit to force the upload task go into exponential backoff time.sleep(1) - q: queue.Queue[Optional[PageserverApiException]] = queue.Queue() + q: queue.Queue[PageserverApiException | None] = queue.Queue() barrier = threading.Barrier(2) def create_in_background(): diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 7a9e6d62b2..8764da3c2f 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -1,7 +1,7 @@ from __future__ import annotations import time -from datetime import datetime, timezone +from datetime import UTC, datetime from fixtures.common_types import Lsn from fixtures.log_helper import log @@ -77,7 +77,7 @@ def test_tenant_s3_restore( # These sleeps are important because they fend off differences in clocks between us and S3 time.sleep(4) - ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_before_deletion = datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) assert ( @@ -104,7 +104,7 @@ def test_tenant_s3_restore( ) time.sleep(4) - ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + ts_after_deletion = 
datetime.now(tz=UTC).replace(tzinfo=None) time.sleep(4) ps_http.tenant_time_travel_remote_storage( diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 3194fe6ec4..411574bd86 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -3,7 +3,7 @@ from __future__ import annotations import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Any +from typing import Any import pytest import requests @@ -27,9 +27,6 @@ from typing_extensions import override from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response -if TYPE_CHECKING: - from typing import Optional, Union - def test_sharding_smoke( neon_env_builder: NeonEnvBuilder, @@ -189,7 +186,7 @@ def test_sharding_split_unsharded( ], ) def test_sharding_split_compaction( - neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str + neon_env_builder: NeonEnvBuilder, failpoint: str | None, build_type: str ): """ Test that after a split, we clean up parent layer data in the child shards via compaction. @@ -782,7 +779,7 @@ def test_sharding_split_stripe_size( tenant_id = env.initial_tenant assert len(notifications) == 1 - expect: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], @@ -798,7 +795,7 @@ def test_sharding_split_stripe_size( # Check that we ended up with the stripe size that we expected, both on the pageserver # and in the notifications to compute assert len(notifications) == 2 - expect_after: dict[str, Union[list[dict[str, int]], str, None, int]] = { + expect_after: dict[str, list[dict[str, int]] | str | None | int] = { "tenant_id": str(env.initial_tenant), "stripe_size": new_stripe_size, "shards": [ @@ -1046,7 +1043,7 @@ def test_sharding_ingest_gaps( class Failure: - pageserver_id: Optional[int] + pageserver_id: int | None def apply(self, env: NeonEnv): raise NotImplementedError() @@ -1370,7 +1367,7 @@ def test_sharding_split_failures( assert attached_count == initial_shard_count - def assert_split_done(exclude_ps_id: Optional[int] = None) -> None: + def assert_split_done(exclude_ps_id: int | None = None) -> None: secondary_count = 0 attached_count = 0 for ps in env.pageservers: @@ -1408,7 +1405,7 @@ def test_sharding_split_failures( # e.g. 
while waiting for a storage controller to re-attach a parent shard if we failed
     # inside the pageserver and the storage controller responds by detaching children and attaching
     # parents concurrently (https://github.com/neondatabase/neon/issues/7148)
-    wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False))  # type: ignore
+    wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False))
     workload.validate()
diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py
index 402f27b384..2a26fef59a 100644
--- a/test_runner/regress/test_sni_router.py
+++ b/test_runner/regress/test_sni_router.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 import socket
 import subprocess
 from pathlib import Path
-from types import TracebackType
 from typing import TYPE_CHECKING

 import backoff
@@ -12,7 +11,8 @@ from fixtures.neon_fixtures import PgProtocol, VanillaPostgres
 from fixtures.port_distributor import PortDistributor

 if TYPE_CHECKING:
-    from typing import Optional
+    from types import TracebackType
+    from typing import Self


 def generate_tls_cert(cn, certout, keyout):
@@ -55,10 +55,10 @@ class PgSniRouter(PgProtocol):
         self.destination = destination
         self.tls_cert = tls_cert
         self.tls_key = tls_key
-        self._popen: Optional[subprocess.Popen[bytes]] = None
+        self._popen: subprocess.Popen[bytes] | None = None
         self.test_output_dir = test_output_dir

-    def start(self) -> PgSniRouter:
+    def start(self) -> Self:
         assert self._popen is None
         args = [
             str(self.neon_binpath / "pg_sni_router"),
@@ -91,14 +91,14 @@
         if self._popen:
             self._popen.wait(timeout=2)

-    def __enter__(self) -> PgSniRouter:
+    def __enter__(self) -> Self:
         return self

     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         if self._popen is not None:
             self._popen.terminate()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2c3d79b18a..13bc54a114 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -5,7 +5,7 @@ import json
 import threading
 import time
 from collections import defaultdict
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from enum import Enum
 from typing import TYPE_CHECKING

@@ -56,7 +56,7 @@ from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response

 if TYPE_CHECKING:
-    from typing import Any, Optional, Union
+    from typing import Any


 def get_node_shard_counts(env: NeonEnv, tenant_ids):
@@ -593,7 +593,7 @@ def test_storage_controller_compute_hook(

     # Initial notification from tenant creation
     assert len(notifications) == 1
-    expect: dict[str, Union[list[dict[str, int]], str, None, int]] = {
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
@@ -708,7 +708,7 @@ def test_storage_controller_stuck_compute_hook(

     # Initial notification from tenant creation
     assert len(notifications) == 1
-    expect: dict[str, Union[list[dict[str, int]], str, None, int]] = {
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
@@ -1048,7 +1048,7 @@ def test_storage_controller_s3_time_travel_recovery(
     )
     time.sleep(4)

-    ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    ts_before_disaster = datetime.now(tz=UTC).replace(tzinfo=None)
     time.sleep(4)

     # Simulate a "disaster": delete some random files from remote storage for one of the shards
@@ -1072,7 +1072,7 @@ def test_storage_controller_s3_time_travel_recovery(
             pass
     time.sleep(4)

-    ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    ts_after_disaster = datetime.now(tz=UTC).replace(tzinfo=None)
     time.sleep(4)

     # Do time travel recovery
@@ -2274,7 +2274,7 @@ def test_storage_controller_node_deletion(
 @pytest.mark.parametrize("shard_count", [None, 2])
 def test_storage_controller_metadata_health(
     neon_env_builder: NeonEnvBuilder,
-    shard_count: Optional[int],
+    shard_count: int | None,
 ):
     """
     Create three tenants A, B, C.
@@ -2494,14 +2494,14 @@ def start_env(env: NeonEnv, storage_controller_port: int):
         for pageserver in env.pageservers:
             futs.append(
                 executor.submit(
-                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
                 )
             )

         for safekeeper in env.safekeepers:
             futs.append(
                 executor.submit(
-                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
                 )
             )

diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 11ad2173ae..3991bd7061 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -6,7 +6,6 @@ import shutil
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
-from typing import TYPE_CHECKING

 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -20,12 +19,9 @@ from fixtures.remote_storage import S3Storage, s3_storage
 from fixtures.utils import wait_until
 from fixtures.workload import Workload

-if TYPE_CHECKING:
-    from typing import Optional
-

 @pytest.mark.parametrize("shard_count", [None, 4])
-def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     """
     Test the `tenant-snapshot` subcommand, which grabs data from remote storage

@@ -131,7 +127,7 @@ def drop_local_state(env: NeonEnv, tenant_id: TenantId):


 @pytest.mark.parametrize("shard_count", [None, 4])
-def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.num_pageservers = 2

@@ -179,9 +175,7 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt


 @pytest.mark.parametrize("shard_count", [None, 2])
-def test_scrubber_physical_gc_ancestors(
-    neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
-):
+def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.num_pageservers = 2

@@ -499,7 +493,7 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):

 @pytest.mark.parametrize("shard_count", [None, 4])
 def test_scrubber_scan_pageserver_metadata(
-    neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
+    neon_env_builder: NeonEnvBuilder, shard_count: int | None
 ):
     """
     Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect.
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 59c14b3263..8d7ca7bc4e 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,11 +1,10 @@
 from __future__ import annotations

 import asyncio
-import enum
 import random
 import time
+from enum import StrEnum
 from threading import Thread
-from typing import TYPE_CHECKING

 import asyncpg
 import pytest
@@ -28,10 +27,6 @@ from fixtures.remote_storage import (
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample

-if TYPE_CHECKING:
-    from typing import Optional
-
-
 # In tests that overlap endpoint activity with tenant attach/detach, there are
 # a variety of warnings that the page service may emit when it cannot acquire
 # an active tenant to serve a request
@@ -57,7 +52,7 @@ def do_gc_target(
         log.info("gc http thread returning")


-class ReattachMode(str, enum.Enum):
+class ReattachMode(StrEnum):
     REATTACH_EXPLICIT = "explicit"
     REATTACH_RESET = "reset"
     REATTACH_RESET_DROP = "reset_drop"
@@ -498,7 +493,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         r".* Changing Active tenant to Broken state, reason: broken from test"
     )

-    def only_int(samples: list[Sample]) -> Optional[int]:
+    def only_int(samples: list[Sample]) -> int | None:
         if len(samples) == 1:
             return int(samples[0].value)
         assert len(samples) == 0
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index fc9adb14c9..bf6120aa0a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -28,7 +28,7 @@ from fixtures.utils import (
 )

 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any


 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -78,7 +78,7 @@ def populate_branch(
     tenant_id: TenantId,
     ps_http: PageserverHttpClient,
     create_table: bool,
-    expected_sum: Optional[int],
+    expected_sum: int | None,
 ) -> tuple[TimelineId, Lsn]:
     # insert some data
     with pg_cur(endpoint) as cur:
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 0650f12cd1..bc2e048f69 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -4,7 +4,6 @@ import json
 import random
 import threading
 import time
-from typing import Optional

 import pytest
 import requests
@@ -661,7 +660,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
     ],
 )
 def test_timeline_retain_lsn(
-    neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: Optional[str]
+    neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: str | None
 ):
     """
     Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index ef0eb05612..9c7e851ba8 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -5,6 +5,7 @@ import enum
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
+from enum import StrEnum
 from queue import Empty, Queue
 from threading import Barrier

@@ -22,7 +23,8 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.utils import assert_pageserver_backups_equal, wait_until
+from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until
+from fixtures.workload import Workload
 from requests import ReadTimeout


@@ -36,7 +38,7 @@ def layer_name(info: HistoricLayerInfo) -> str:


 @enum.unique
-class Branchpoint(str, enum.Enum):
+class Branchpoint(StrEnum):
     """
     Have branches at these Lsns possibly relative to L0 layer boundary.
     """
@@ -414,7 +416,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn

     assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None

-    ep.clear_shared_buffers()
+    ep.clear_buffers()
     assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
     assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
     ep.stop()
@@ -1549,6 +1551,57 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
     env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)


+@skip_in_debug_build("only run with release build")
+def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBuilder):
+    SMOKE_CONF = {
+        # Run both gc and gc-compaction.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon
+        "pitr_interval": "0s",
+        "gc_horizon": f"{1024 ** 2}",
+        "lsn_lease_length": "0s",
+        # Small checkpoint distance to create many layers
+        "checkpoint_distance": 1024**2,
+        # Compact small layers
+        "compaction_target_size": 1024**2,
+        "image_creation_threshold": 2,
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 10000
+    churn_rounds = 50
+
+    ps_http = env.pageserver.http_client()
+
+    workload_parent = Workload(env, tenant_id, timeline_id)
+    workload_parent.init(env.pageserver.id)
+    log.info("Writing initial data ...")
+    workload_parent.write_rows(row_count, env.pageserver.id)
+    branch_id = env.create_branch("child")
+    workload_child = Workload(env, tenant_id, branch_id, branch_name="child")
+    workload_child.init(env.pageserver.id, allow_recreate=True)
+    log.info("Writing initial data on child...")
+    workload_child.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        if i % 10 == 0:
+            log.info(f"Running churn round {i}/{churn_rounds} ...")
+
+        workload_parent.churn_rows(row_count, env.pageserver.id)
+        workload_child.churn_rows(row_count, env.pageserver.id)
+
+    ps_http.detach_ancestor(tenant_id, branch_id)
+
+    log.info("Validating at workload end ...")
+    workload_parent.validate(env.pageserver.id)
+    workload_child.validate(env.pageserver.id)
+
+
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index c19c78e251..5a5ca3290a 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING

 import pytest
 from fixtures.log_helper import log
@@ -14,9 +13,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.utils import wait_timeline_detail_404

-if TYPE_CHECKING:
-    from typing import Optional
-

 @pytest.mark.parametrize("sharded", [True, False])
 def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool):
@@ -89,7 +85,7 @@ def wait_for_another_gc_round():
 @dataclass
 class ScrollableLog:
     pageserver: NeonPageserver
-    offset: Optional[LogCursor]
+    offset: LogCursor | None

     def assert_log_contains(self, what: str):
         msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 85c6d17142..4528bc6180 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -7,7 +7,6 @@ import time
 from collections import defaultdict
 from contextlib import closing
 from pathlib import Path
-from typing import Optional

 import psycopg2.errors
 import psycopg2.extras
@@ -668,7 +667,7 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
 class TimelinePhysicalSizeValues:
     api_current_physical: int
     prometheus_resident_physical: float
-    prometheus_remote_physical: Optional[float] = None
+    prometheus_remote_physical: float | None = None
     python_timelinedir_layerfiles_physical: int
     layer_map_file_size_sum: int

diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index d4c2ca7e07..f93fc6bd8b 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -63,7 +63,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):

     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
-    endpoint.clear_shared_buffers(cursor=cur)
+    endpoint.clear_buffers(cursor=cur)

     # Check that an index-only scan doesn't see the deleted row. If the
     # clearing of the VM bit was not replayed correctly, this would incorrectly
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 6eaaa3c37f..8fa33b81a9 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -61,7 +61,7 @@ from fixtures.utils import (
 )

 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any, Self


 def wait_lsn_force_checkpoint(
@@ -189,7 +189,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
             m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
             m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))

-        for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
+        for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns, strict=False):
             # Invariant. May be < when transaction is in progress.
             assert (
                 commit_lsn <= flush_lsn
@@ -224,7 +224,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
         def __init__(self) -> None:
             super().__init__(daemon=True)
             self.should_stop = threading.Event()
-            self.exception: Optional[BaseException] = None
+            self.exception: BaseException | None = None

         def run(self) -> None:
             try:
@@ -521,7 +521,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
     # Shut down subsequently each of safekeepers and fill a segment while sk is
     # down; ensure segment gets offloaded by others.
     offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")]
-    for victim, seg_end in zip(env.safekeepers, offloaded_seg_end):
+    for victim, seg_end in zip(env.safekeepers, offloaded_seg_end, strict=False):
         victim.stop()
         # roughly fills one segment
         cur.execute("insert into t select generate_series(1,250000), 'payload'")
@@ -666,7 +666,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):

     # recreate timeline on pageserver from scratch
     ps_http.timeline_create(
-        pg_version=PgVersion(pg_version),
+        pg_version=PgVersion(str(pg_version)),
         tenant_id=tenant_id,
         new_timeline_id=timeline_id,
     )
@@ -1177,14 +1177,14 @@ def cmp_sk_wal(sks: list[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
     # report/understand if WALs are different due to that.
     statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
     term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses]
-    for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
+    for tfl, sk in zip(term_flush_lsns[1:], sks[1:], strict=False):
         assert (
             term_flush_lsns[0] == tfl
         ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"

     # check that WALs are identic.
     segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
-    for cmp_segs, sk in zip(segs[1:], sks[1:]):
+    for cmp_segs, sk in zip(segs[1:], sks[1:], strict=False):
         assert (
             segs[0] == cmp_segs
         ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}"
@@ -1455,12 +1455,12 @@ class SafekeeperEnv:
         self.pg_bin = pg_bin
         self.num_safekeepers = num_safekeepers
         self.bin_safekeeper = str(neon_binpath / "safekeeper")
-        self.safekeepers: Optional[list[subprocess.CompletedProcess[Any]]] = None
-        self.postgres: Optional[ProposerPostgres] = None
-        self.tenant_id: Optional[TenantId] = None
-        self.timeline_id: Optional[TimelineId] = None
+        self.safekeepers: list[subprocess.CompletedProcess[Any]] | None = None
+        self.postgres: ProposerPostgres | None = None
+        self.tenant_id: TenantId | None = None
+        self.timeline_id: TimelineId | None = None

-    def init(self) -> SafekeeperEnv:
+    def init(self) -> Self:
         assert self.postgres is None, "postgres is already initialized"
         assert self.safekeepers is None, "safekeepers are already initialized"

@@ -1541,7 +1541,7 @@ class SafekeeperEnv:
             log.info(f"Killing safekeeper with pid {pid}")
             os.kill(pid, signal.SIGKILL)

-    def __enter__(self):
+    def __enter__(self) -> Self:
         return self

     def __exit__(self, exc_type, exc_value, traceback):
@@ -2446,7 +2446,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
         # generate some data to commit WAL on safekeepers
         endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
         # clear the buffers
-        endpoint.clear_shared_buffers()
+        endpoint.clear_buffers()
         # read data to fetch pages from pageserver
         endpoint.safe_psql("select sum(i) from t")

diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index d3e989afa8..18408b0619 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -5,7 +5,6 @@ import random
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING

 import asyncpg
 import pytest
@@ -16,10 +15,6 @@ from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.utils import skip_in_debug_build

-if TYPE_CHECKING:
-    from typing import Optional
-
-
 log = getLogger("root.safekeeper_async")


@@ -261,7 +256,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):


 def endpoint_create_start(
-    env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False
+    env: NeonEnv, branch: str, pgdir_name: str | None, allow_multiple: bool = False
 ):
     endpoint = Endpoint(
         env,
@@ -287,7 +282,7 @@ async def exec_compute_query(
     env: NeonEnv,
     branch: str,
     query: str,
-    pgdir_name: Optional[str] = None,
+    pgdir_name: str | None = None,
     allow_multiple: bool = False,
 ):
     with endpoint_create_start(
@@ -705,7 +700,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
     # invalid, to make them unavailable to the endpoint. We use
     # ports 10, 11 and 12 to simulate unavailable safekeepers.
     config = toml.load(test_output_dir / "repo" / "config")
-    for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk)):
+    for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk, strict=False)):
         if active:
             config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg
         else:
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index aeecd27b1f..284ae56be2 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit aeecd27b1f0775b606409d1cbb9c8aa9853a82af
+Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 544620db4c..aed79ee87b 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 544620db4ca6945be4f1f686a7fbd2cdfb0bf96f
+Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 3cc152ae2d..f5cfc6fa89 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 3cc152ae2d17b19679c7102486bdb94677705c02
+Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index e5d795a1a0..3c15b6565f 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit e5d795a1a0c25da907176d37c905badab70e00c0
+Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index a13ef29e45..4dae88e73d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
     "v17": [
         "17.2",
-        "e5d795a1a0c25da907176d37c905badab70e00c0"
+        "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f"
     ],
     "v16": [
         "16.6",
-        "3cc152ae2d17b19679c7102486bdb94677705c02"
+        "f5cfc6fa898544050e821ac688adafece1ac3cff"
     ],
     "v15": [
         "15.10",
-        "544620db4ca6945be4f1f686a7fbd2cdfb0bf96f"
+        "aed79ee87b94779cc52ec13e3b74eba6ada93f05"
     ],
     "v14": [
         "14.15",
-        "aeecd27b1f0775b606409d1cbb9c8aa9853a82af"
+        "284ae56be2397fd3eaf20777fa220b2d0ad968f5"
     ]
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 53d3a7364b..a73d9d6352 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -19,7 +19,8 @@ ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 axum = { version = "0.7", features = ["ws"] }
 axum-core = { version = "0.4", default-features = false, features = ["tracing"] }
-base64 = { version = "0.21", features = ["alloc"] }
+base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] }
+base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
 camino = { version = "1", default-features = false, features = ["serde1"] }
@@ -52,6 +53,7 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
+nix = { version = "0.26" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }