diff --git a/.cargo/config.toml b/.cargo/config.toml index 8fddaa2dd4..cc767a7f68 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -12,6 +12,11 @@ opt-level = 3 # Turn on a small amount of optimization in Development mode. opt-level = 1 +[build] +# This is only present for local builds, as it will be overridden +# by the RUSTDOCFLAGS env var in CI. +rustdocflags = ["-Arustdoc::private_intra_doc_links"] + [alias] build_testing = ["build", "--features", "testing"] neon = ["run", "--bin", "neon_local"] diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 54b69d6d48..a027de9464 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -105,7 +105,7 @@ runs: # Get previously uploaded data for this run ZSTD_NBTHREADS=0 - S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[].Key') + S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output '.Contents[]?.Key') if [ -z "$S3_FILEPATHS" ]; then # There's no previously uploaded data for this $GITHUB_RUN_ID exit 0 diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index dec1f47e47..ceb6f4aa90 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -150,6 +150,14 @@ runs: EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" fi + # We use pytest-split plugin to run benchmarks in parallel on different CI runners + if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then + mkdir -p $TEST_OUTPUT + poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json" + + EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" + fi + if [[ "${{ inputs.build_type }}" == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml new file mode 100644 index 0000000000..ac9e908c09 --- /dev/null +++ b/.github/workflows/approved-for-ci-run.yml @@ -0,0 +1,55 @@ +name: Handle `approved-for-ci-run` label +# This workflow helps to run CI pipeline for PRs made by external contributors (from forks). + +on: + pull_request: + types: + # Default types that triggers a workflow ([1]): + # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request + - opened + - synchronize + - reopened + # Types that we wand to handle in addition to keep labels tidy: + - closed + # Actual magic happens here: + - labeled + +env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + +jobs: + remove-label: + # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR. + # The PR should be reviewed and labelled manually again. 
+ + runs-on: [ ubuntu-latest ] + + if: | + contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) && + contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') + + steps: + - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" + + create-branch: + # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it. + + runs-on: [ ubuntu-latest ] + + if: | + github.event.action == 'labeled' && + contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') + + steps: + - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" + + - uses: actions/checkout@v3 + with: + ref: main + + - run: gh pr checkout "${PR_NUMBER}" + + - run: git checkout -b "ci-run/pr-${PR_NUMBER}" + + - run: git push --force origin "ci-run/pr-${PR_NUMBER}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cd4906579e..5f3e4f1145 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -5,6 +5,7 @@ on: branches: - main - release + - ci-run/pr-* pull_request: defaults: @@ -127,6 +128,11 @@ jobs: - name: Run cargo clippy (release) run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting if: ${{ !cancelled() }} @@ -155,7 +161,7 @@ jobs: build_type: [ debug, release ] env: BUILD_TYPE: ${{ matrix.build_type }} - GIT_VERSION: ${{ github.sha }} + GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} steps: - name: Fix git ownership @@ -174,6 +180,27 @@ jobs: submodules: true fetch-depth: 1 + - name: Check Postgres submodules revision + shell: bash -euo pipefail {0} + run: | + # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally). 
+ # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 + + FAILED=false + for postgres in postgres-v14 postgres-v15; do + expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') + actual=$(git rev-parse "HEAD:vendor/${postgres}") + if [ "${expected}" != "${actual}" ]; then + echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'" + FAILED=true + fi + done + + if [ "${FAILED}" = "true" ]; then + echo >&2 "Please update vendors/revisions.json if these changes are intentional" + exit 1 + fi + - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT @@ -369,13 +396,11 @@ jobs: strategy: fail-fast: false matrix: + pytest_split_group: [ 1, 2, 3, 4 ] build_type: [ release ] steps: - name: Checkout uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 1 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -384,9 +409,11 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} + extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}" # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -614,7 +641,7 @@ jobs: /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . - --build-arg GIT_VERSION=${{ github.sha }} + --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -658,7 +685,7 @@ jobs: /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . - --build-arg GIT_VERSION=${{ github.sha }} + --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools @@ -715,7 +742,7 @@ jobs: /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . - --build-arg GIT_VERSION=${{ github.sha }} + --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com @@ -742,7 +769,7 @@ jobs: /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \ --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ --context . 
\ - --build-arg GIT_VERSION=${{ github.sha }} \ + --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \ --build-arg PG_VERSION=${{ matrix.version }} \ --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \ --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \ @@ -767,7 +794,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.11.1 + VM_BUILDER_VERSION: v0.13.1 steps: - name: Checkout diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 1196881541..a21ddb0414 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -3,7 +3,8 @@ name: Check neon with extra platform builds on: push: branches: - - main + - main + - ci-run/pr-* pull_request: defaults: diff --git a/Cargo.lock b/Cargo.lock index b163d4fe46..b5f6b3b328 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -158,6 +158,19 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "async-compression" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" +dependencies = [ + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -593,7 +606,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.6.2", "object", "rustc-demangle", ] @@ -882,9 +895,11 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "chrono", "clap", "compute_api", + "flate2", "futures", "hyper", "notify", @@ -1367,6 +1382,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flate2" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" +dependencies = [ + "crc32fast", + "miniz_oxide 0.7.1", +] + [[package]] name = "fnv" version = "1.0.7" @@ -2151,6 +2176,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + [[package]] name = "mio" version = "0.8.6" @@ -2345,9 +2379,9 @@ dependencies = [ [[package]] name = "opentelemetry" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f" dependencies = [ "opentelemetry_api", "opentelemetry_sdk", @@ -2355,9 +2389,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906" dependencies = [ "async-trait", "bytes", @@ -2368,9 +2402,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +checksum = 
"8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca" dependencies = [ "async-trait", "futures", @@ -2386,48 +2420,47 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c" dependencies = [ "futures", "futures-util", "opentelemetry", "prost", "tonic 0.8.3", - "tonic-build 0.8.4", ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5" dependencies = [ "opentelemetry", ] [[package]] name = "opentelemetry_api" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2" dependencies = [ "fnv", "futures-channel", "futures-util", "indexmap", - "js-sys", "once_cell", "pin-project-lite", "thiserror", + "urlencoding", ] [[package]] name = "opentelemetry_sdk" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1" dependencies = [ "async-trait", "crossbeam-channel", @@ -2482,6 +2515,7 @@ name = "pageserver" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "async-stream", "async-trait", "byteorder", @@ -2498,6 +2532,7 @@ dependencies = [ "enum-map", "enumset", "fail", + "flate2", "futures", "git-version", "hex", @@ -2901,9 +2936,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.58" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da" dependencies = [ "unicode-ident", ] @@ -3292,9 +3327,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1" +checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8" dependencies = [ "anyhow", "async-trait", @@ -3962,7 +3997,7 @@ dependencies = [ "tokio", "tokio-stream", "tonic 0.9.2", - "tonic-build 0.9.2", + "tonic-build", "tracing", "utils", "workspace_hack", @@ -4063,7 +4098,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" dependencies = [ "filetime", "libc", - "xattr", + "xattr 0.2.3", ] [[package]] @@ -4344,16 +4379,17 @@ dependencies = [ [[package]] name = "tokio-tar" -version = "0.3.0" -source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75" dependencies 
= [ "filetime", "futures-core", "libc", - "redox_syscall 0.2.16", + "redox_syscall 0.3.5", "tokio", "tokio-stream", - "xattr", + "xattr 1.0.0", ] [[package]] @@ -4480,19 +4516,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "tonic-build" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" -dependencies = [ - "prettyplease 0.1.25", - "proc-macro2", - "prost-build", - "quote", - "syn 1.0.109", -] - [[package]] name = "tonic-build" version = "0.9.2" @@ -4616,9 +4639,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600" dependencies = [ "once_cell", "opentelemetry", @@ -4817,6 +4840,7 @@ dependencies = [ "byteorder", "bytes", "chrono", + "const_format", "criterion", "futures", "heapless", @@ -5339,6 +5363,15 @@ dependencies = [ "libc", ] +[[package]] +name = "xattr" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a" +dependencies = [ + "libc", +] + [[package]] name = "xmlparser" version = "0.13.5" diff --git a/Cargo.toml b/Cargo.toml index f36e8f6569..01e2fe7cf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,8 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] anyhow = { version = "1.0", features = ["backtrace"] } +async-compression = { version = "0.4.0", features = ["tokio", "gzip"] } +flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "0.55", default-features = false, features=["rustls"] } @@ -82,9 +84,9 @@ notify = "5.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.18.0" -opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.10.0" +opentelemetry = "0.19.0" +opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.11.0" parking_lot = "0.12" pbkdf2 = "0.12.1" pin-project-lite = "0.2" @@ -93,7 +95,7 @@ prost = "0.11" rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } +reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] } reqwest-middleware = "0.2.0" reqwest-retry = "0.2.2" routerify = "3" @@ -122,13 +124,14 @@ tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" +tokio-tar = "0.3" tokio-util = { version = "0.7", features = ["io"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.18.0" +tracing-opentelemetry = "0.19.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } @@ -146,7 +149,6 @@ postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git postgres-protocol = { 
git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7208024d63..1b5db2af81 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -132,10 +132,20 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# packaged cmake is too old -RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ +RUN case "$(uname -m)" in \ + "x86_64") \ + export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ + ;; \ + "aarch64") \ + export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \ + ;; \ + *) \ + echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \ -q -O /tmp/cmake-install.sh \ - && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \ + && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \ && chmod u+x /tmp/cmake-install.sh \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 21226249cf..f8f8f729ce 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -6,8 +6,10 @@ license.workspace = true [dependencies] anyhow.workspace = true +async-compression.workspace = true chrono.workspace = true clap.workspace = true +flate2.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index aec4e49725..b33f4f05dd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,4 +1,5 @@ use std::fs; +use std::io::BufRead; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; @@ -15,6 +16,7 @@ use utils::lsn::Lsn; use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeMode, ComputeSpec}; +use utils::measured_stream::MeasuredReader; use crate::config; use crate::pg_helpers::*; @@ -140,14 +142,14 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> .cluster .roles .iter() - .map(|r| format!("'{}'", escape_literal(&r.name))) + .map(|r| escape_literal(&r.name)) .collect::>(); let dbs = spec .cluster .databases .iter() - .map(|db| format!("'{}'", escape_literal(&db.name))) + .map(|db| escape_literal(&db.name)) .collect::>(); let roles_decl = if roles.is_empty() { @@ -253,20 +255,52 @@ impl ComputeNode { let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { - Lsn(0) => 
format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), // First start of the compute - _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn), + // HACK We don't use compression on first start (Lsn(0)) because there's no API for it + Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), + _ => format!( + "basebackup {} {} {} --gzip", + spec.tenant_id, spec.timeline_id, lsn + ), }; + let copyreader = client.copy_out(basebackup_cmd.as_str())?; + let mut measured_reader = MeasuredReader::new(copyreader); + + // Check the magic number to see if it's a gzip or not. Even though + // we might explicitly ask for gzip, an old pageserver with no implementation + // of gzip compression might send us uncompressed data. After some time + // passes we can assume all pageservers know how to compress and we can + // delete this check. + // + // If the data is not gzip, it will be tar. It will not be mistakenly + // recognized as gzip because tar starts with an ascii encoding of a filename, + // and 0x1f and 0x8b are unlikely first characters for any filename. Moreover, + // we send the "global" directory first from the pageserver, so it definitely + // won't be recognized as gzip. + let mut bufreader = std::io::BufReader::new(&mut measured_reader); + let gzip = { + let peek = bufreader.fill_buf().unwrap(); + peek[0] == 0x1f && peek[1] == 0x8b + }; // Read the archive directly from the `CopyOutReader` // // Set `ignore_zeros` so that unpack() reads all the Copy data and // doesn't stop at the end-of-archive marker. Otherwise, if the server // sends an Error after finishing the tarball, we will not notice it. - let mut ar = tar::Archive::new(copyreader); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; + if gzip { + let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata)?; + } else { + let mut ar = tar::Archive::new(&mut bufreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata)?; + }; + // Report metrics + self.state.lock().unwrap().metrics.basebackup_bytes = + measured_reader.get_byte_count() as u64; self.state.lock().unwrap().metrics.basebackup_ms = Utc::now() .signed_duration_since(start_time) .to_std() @@ -549,6 +583,13 @@ impl ComputeNode { pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") ); + // Log metrics so that we can search for slow operations in logs + let metrics = { + let state = self.state.lock().unwrap(); + state.metrics.clone() + }; + info!(?metrics, "compute start finished"); + Ok(pg) } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 99346433d0..68b943eec8 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -47,30 +47,22 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; if let Some(s) = &spec.pageserver_connstring { - writeln!( - file, - "neon.pageserver_connstring='{}'", - escape_conf_value(s) - )?; + writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } if !spec.safekeeper_connstrings.is_empty() { writeln!( file, - "neon.safekeepers='{}'", + "neon.safekeepers={}", escape_conf_value(&spec.safekeeper_connstrings.join(",")) )?; } if let Some(s) = &spec.tenant_id { - writeln!( - file, - "neon.tenant_id='{}'", - escape_conf_value(&s.to_string()) - )?; + writeln!(file, "neon.tenant_id={}", escape_conf_value(&s.to_string()))?; } if let Some(s) = 
&spec.timeline_id { writeln!( file, - "neon.timeline_id='{}'", + "neon.timeline_id={}", escape_conf_value(&s.to_string()) )?; } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 6a78bffd1b..b94a97a126 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -16,15 +16,26 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Escape a string for including it in a SQL literal +/// Escape a string for including it in a SQL literal. Wrapping the result +/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use +/// SQL string literal, e.g. `'db'''` or `E'db\\'`. +/// See +/// for the original implementation. pub fn escape_literal(s: &str) -> String { - s.replace('\'', "''").replace('\\', "\\\\") + let res = s.replace('\'', "''").replace('\\', "\\\\"); + + if res.contains('\\') { + format!("E'{}'", res) + } else { + format!("'{}'", res) + } } -/// Escape a string so that it can be used in postgresql.conf. -/// Same as escape_literal, currently. +/// Escape a string so that it can be used in postgresql.conf. Wrapping the result +/// with `'{}'` is not required, as it returns a ready-to-use config string. pub fn escape_conf_value(s: &str) -> String { - s.replace('\'', "''").replace('\\', "\\\\") + let res = s.replace('\'', "''").replace('\\', "\\\\"); + format!("'{}'", res) } trait GenericOptionExt { @@ -37,7 +48,7 @@ impl GenericOptionExt for GenericOption { fn to_pg_option(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { - "string" => format!("{} '{}'", self.name, escape_literal(val)), + "string" => format!("{} {}", self.name, escape_literal(val)), _ => format!("{} {}", self.name, val), } } else { @@ -49,7 +60,7 @@ impl GenericOptionExt for GenericOption { fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { - "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), + "string" => format!("{} = {}", self.name, escape_conf_value(val)), _ => format!("{} = {}", self.name, val), } } else { diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 520696da00..575a5332a8 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -397,10 +397,44 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // We do not check either DB exists or not, // Postgres will take care of it for us "delete_db" => { - let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote()); + // In Postgres we can't drop a database if it is a template. + // So we need to unset the template flag first, but it could + // be a retry, so we could've already dropped the database. + // Check that database exists first to make it idempotent. + let unset_template_query: String = format!( + " + DO $$ + BEGIN + IF EXISTS( + SELECT 1 + FROM pg_catalog.pg_database + WHERE datname = {} + ) + THEN + ALTER DATABASE {} is_template false; + END IF; + END + $$;", + escape_literal(&op.name), + &op.name.pg_quote() + ); + // Use FORCE to drop database even if there are active connections. + // We run this from `cloud_admin`, so it should have enough privileges. + // NB: there could be other db states, which prevent us from dropping + // the database. For example, if db is used by any active subscription + // or replication slot. + // TODO: deal with it once we allow logical replication. 
Proper fix should + // involve returning an error code to the control plane, so it could + // figure out that this is a non-retryable error, return it to the user + // and fail operation permanently. + let drop_db_query: String = format!( + "DROP DATABASE IF EXISTS {} WITH (FORCE)", + &op.name.pg_quote() + ); warn!("deleting database '{}'", &op.name); - client.execute(query.as_str(), &[])?; + client.execute(unset_template_query.as_str(), &[])?; + client.execute(drop_db_query.as_str(), &[])?; } "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 265556d3b9..7d27d22a78 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -89,4 +89,12 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor assert_eq!(none_generic_options.find("missed_value"), None); assert_eq!(none_generic_options.find("invalid_value"), None); } + + #[test] + fn test_escape_literal() { + assert_eq!(escape_literal("test"), "'test'"); + assert_eq!(escape_literal("test'"), "'test'''"); + assert_eq!(escape_literal("test\\'"), "E'test\\\\'''"); + assert_eq!(escape_literal("test\\'\\'"), "E'test\\\\''\\\\'''"); + } } diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 00af1a1d53..64664d65ff 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -10,7 +10,7 @@ //! (non-Neon binaries don't necessarily follow our pidfile conventions). //! The pid stored in the file is later used to stop the service. //! -//! See [`lock_file`] module for more info. +//! See the [`lock_file`](utils::lock_file) module for more info. use std::ffi::OsStr; use std::io::Write; diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index ad19dfa204..8d40c7afc1 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -2,8 +2,9 @@ //! //! In the local test environment, the data for each safekeeper is stored in //! +//! ```text //! .neon/safekeepers/ -//! +//! ``` use anyhow::Context; use std::path::PathBuf; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index ab921d096f..ff373d7111 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -2,7 +2,9 @@ //! //! In the local test environment, the data for each endpoint is stored in //! +//! ```text //! .neon/endpoints/ +//! ``` //! //! Some basic information about the endpoint, like the tenant and timeline IDs, //! are stored in the `endpoint.json` file. The `endpoint.json` file is created @@ -22,7 +24,7 @@ //! //! Directory contents: //! -//! ```ignore +//! ```text //! .neon/endpoints/main/ //! compute.log - log output of `compute_ctl` and `postgres` //! endpoint.json - serialized `EndpointConf` struct diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 9e053ff1f1..d5e0fb112f 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -2,8 +2,9 @@ //! //! In the local test environment, the data for each safekeeper is stored in //! +//! ```text //! .neon/safekeepers/ -//! +//! 
``` use std::io::Write; use std::path::PathBuf; use std::process::Child; diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 4926dad932..9777d1fdd2 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -189,7 +189,7 @@ services: - "/bin/bash" - "-c" command: - - "until pg_isready -h compute -p 55433 ; do + - "until pg_isready -h compute -p 55433 -U cloud_admin ; do echo 'Waiting to start compute...' && sleep 1; done" depends_on: diff --git a/docs/docker.md b/docs/docker.md index 704044377f..9761cc4346 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -48,6 +48,7 @@ Creating docker-compose_storage_broker_1 ... done 2. connect compute node ``` $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass +$ chmod 600 ~/.pgpass $ psql -h localhost -p 55433 -U cloud_admin postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE diff --git a/docs/rfcs/024-user-mgmt.md b/docs/rfcs/024-user-mgmt.md new file mode 100644 index 0000000000..7203357328 --- /dev/null +++ b/docs/rfcs/024-user-mgmt.md @@ -0,0 +1,84 @@ +# Postgres user and database management + +(This supersedes the previous proposal that looked too complicated and desynchronization-prone) + +We've accumulated a bunch of problems with our approach to role and database management, namely: + +1. we don't allow role and database creation from Postgres, and users are complaining about that +2. fine-grained role management is not possible from either Postgres or the console + +Right now, we do store users and databases both in console and Postgres, and there are two main reasons for +that: + +* we want to be able to authenticate users in proxy against the console without Postgres' involvement. Otherwise, +malicious brute force attempts will wake up Postgres (expensive) and may exhaust the Postgres connections limit (denial of service). +* it is handy when we can render console UI without waking up compute (e.g., show database list) + +This RFC doesn't talk about giving root access to the database, which is blocked by a secure runtime setup. + +## Overview + +* Add a Postgres extension that sends an HTTP request each time a transaction that modifies users/databases is about to commit. +* Add a user management API to the internal console API. Also, the console should put a JWT token into the compute so that it can access the management API. + +## Postgres behavior + +The default user role (@username) should have `CREATE ROLE`, `CREATE DB`, and `BYPASSRLS` privileges. We expose the Postgres port +to the open internet, so we need to check password strength. Now the console generates strong passwords, so there is no risk of having dumb passwords. With user-provided passwords, such risks exist. + +Since we store passwords in the console we should also send the unencrypted password when a role is created/changed. Hence communication with the console must be encrypted. Postgres also supports creating roles using hashes; in that case, we will not be able to get a raw password. So I can see the following options here: + * roles created via SQL will *not* have raw passwords in the console + * roles created via SQL will have raw passwords in the console, except ones that were created using hashes + +I'm leaning towards the second option here as it is a bit more consistent -- if raw password storage is enabled then we store passwords in all cases where we can store them.
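The second option implies the extension can tell whether the password it sees in `CREATE ROLE ... PASSWORD '...'` is a raw password or an already-hashed verifier. A minimal sketch of such a check, assuming a hypothetical helper on the extension side (Postgres md5 hashes are `md5` followed by 32 hex digits, and SCRAM verifiers start with `SCRAM-SHA-256$`):

```rust
/// Hypothetical helper: return the password only if it can be stored as a raw
/// password in the console, i.e. it is not already an md5 or SCRAM-SHA-256 hash.
fn storable_raw_password(password: &str) -> Option<&str> {
    let is_md5 = password.len() == 35
        && password.starts_with("md5")
        && password[3..].bytes().all(|b| b.is_ascii_hexdigit());
    let is_scram = password.starts_with("SCRAM-SHA-256$");
    if is_md5 || is_scram {
        None // roles created with a hash get no raw password stored in the console
    } else {
        Some(password)
    }
}
```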
+ +To send data about roles and databases from Postgres to the console, we can create the following Postgres extension: + + * Intercept role/database changes in `ProcessUtility_hook`. Here we have access to the query statement with the raw password. The hook handler itself should not dial the console immediately but rather stash the info in some hashmap for later use. + * When the transaction is about to commit we send the collected role modifications to the console (all as one -- the console should either accept all or reject all, and hence the API shouldn't be REST-like). If the console request fails we can roll back the transaction. This way, if the transaction is committed, we know for sure that the console has this information. We can use `XACT_EVENT_PRE_COMMIT` and `XACT_EVENT_PARALLEL_PRE_COMMIT` for that. + * The extension should be mindful of the fact that it is possible to create and delete roles within the same transaction. + * We also need to track who the database owner is; some extra coding may be needed to get the current user when the database is created. + +## Console user management API + +The current public API has a REST API for role management. We need to have some analog for the internal API (called mgmt API in the console code). But unlike the public API, here we want to have an atomic way to create several roles/databases (in cases when several roles were created in the same transaction). So something like this may work: + +``` +curl -X PATCH /api/v1/roles_and_databases -d ' +[ + {"op":"create", "type":"role", "name": "kurt", "password":"lYgT3BlbkFJ2vBZrqv"}, + {"op":"drop", "type":"role", "name": "trout"}, + {"op":"alter", "type":"role", "name": "kilgore", "password":"3BlbkFJ2vB"}, + {"op":"create", "type":"database", "name": "db2", "owner": "eliot"} +] +' +``` + +It makes sense not to error out on duplicated create/delete operations (see failure modes). + +## Managing users from the console + +Now the console puts a spec file with the list of databases/roles and delta operations in all the compute pods. `compute_ctl` then picks up that file, stubbornly executes the deltas, and checks that the data in the spec file is the same as in Postgres. This way, if the user creates a role in the UI, we restart compute with a new spec file, and databases/roles are created during startup. So if Postgres sends an HTTP call each time a role is created, we need to break the recursion in that case. We can do that based on application_name or some GUC or user (local == no HTTP hook). + +Generally, we have several options when we are creating users via the console: + +1. restart compute with a new spec file, execute local SQL command; cut recursion in the extension +2. "push" spec files into running compute, execute local SQL command; cut recursion in the extension +3. "push" spec files into running compute, execute local SQL command; let extension create those roles in the console +4. avoid managing roles via spec files, send SQL commands to compute; let extension create those roles in the console + +The last option is the most straightforward one, but with the raw password storage opt-out, we will not have the password to establish an SQL connection. Also, we need a spec for provisioning purposes and to address potential desync (but that is quite unlikely). So I think the easiest approach would be: + +1. keep role management like it is now and cut the recursion in the extension when SQL is executed by compute_ctl +2. 
add "push" endpoint to the compute_ctl to avoid compute restart during the `apply_config` operation -- that can be done as a follow up to avoid increasing scope too much + +## Failure modes + +* during role creation via SQL role was created in the console but the connection was dropped before Postgres got acknowledgment or some error happened after acknowledgment (out of disk space, deadlock, etc): + + in that case, Postgres won't have a role that exists in the console. Compute restart will heal it (due to the spec file). Also if the console allows repeated creation/deletion user can repeat the transaction. + + +# Scalability + +On my laptop, I can create 4200 roles per second. That corresponds to 363 million roles per day. Since each role creation ends up in the console database we can add some limit to the number of roles (could be reasonably big to not run into it often -- like 1k or 10k). diff --git a/docs/tools.md b/docs/tools.md new file mode 100644 index 0000000000..1adef2be61 --- /dev/null +++ b/docs/tools.md @@ -0,0 +1,22 @@ +# Useful development tools + +This readme contains some hints on how to set up some optional development tools. + +## ccls + +[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup +to work well. There are different ways to do it but here's what works for me: +1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`) +2. Go to `vendor/postgres-v15` +3. Run `make clean && ./configure` +4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4` +5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent) +6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories + +With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well. + +Some additional tips for various IDEs: + +### Emacs + +To improve performance: `(setq lsp-lens-enable nil)` diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 80e5341216..6124c81f50 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -71,6 +71,7 @@ pub struct ComputeMetrics { pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, pub basebackup_ms: u64, + pub basebackup_bytes: u64, pub start_postgres_ms: u64, pub config_ms: u64, pub total_startup_ms: u64, diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs index 840f60f19b..e9a0a65570 100644 --- a/libs/metrics/src/metric_vec_duration.rs +++ b/libs/metrics/src/metric_vec_duration.rs @@ -1,4 +1,4 @@ -//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec. +//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. use std::{future::Future, time::Instant}; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index df5f5896a1..4c6529ffab 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -411,12 +411,16 @@ pub struct LayerResidenceEvent { pub reason: LayerResidenceEventReason, } -/// The reason for recording a given [`ResidenceEvent`]. +/// The reason for recording a given [`LayerResidenceEvent`]. 
#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum LayerResidenceEventReason { /// The layer map is being populated, e.g. during timeline load or attach. /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`]. /// We need to record such events because there is no persistent storage for the events. + /// + // https://github.com/rust-lang/rust/issues/74481 + /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html + /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote LayerLoad, /// We just created the layer (e.g., freeze_and_flush or compaction). /// Such layers are always [`LayerResidenceStatus::Resident`]. diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 12693379f5..c98ad259bf 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -60,8 +60,9 @@ impl Ord for RelTag { /// Display RelTag in the same format that's used in most PostgreSQL debug messages: /// +/// ```text /// //[_fsm|_vm|_init] -/// +/// ``` impl fmt::Display for RelTag { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(forkname) = forknumber_to_name(self.forknum) { diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index 1dc9f367ff..aa0e625b47 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -49,14 +49,16 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { } } -/// /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple. /// /// Formats: +/// +/// ```text /// /// _ /// . /// _. +/// ``` /// /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources. /// diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 3cdca45009..c12898a05c 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -5,11 +5,11 @@ //! It is similar to what tokio_util::codec::Framed with appropriate codec //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used //! separately without using split from futures::stream::StreamExt (which -//! allocates box[1] in polling internally). tokio::io::split is used for splitting +//! allocates a [Box] in polling internally). tokio::io::split is used for splitting //! instead. Plus we customize error messages more than a single type for all io //! calls. //! -//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 +//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107 use bytes::{Buf, BytesMut}; use std::{ future::Future, @@ -117,7 +117,7 @@ impl Framed { impl Framed { /// Split into owned read and write parts. 
Beware of potential issues with /// using halves in different tasks on TLS stream: - /// https://github.com/tokio-rs/tls/issues/40 + /// pub fn split(self) -> (FramedReader, FramedWriter) { let (read_half, write_half) = tokio::io::split(self.stream); let reader = FramedReader { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 8e361b757c..5c5e8a9559 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -934,6 +934,15 @@ impl<'a> BeMessage<'a> { } } +fn terminate_code(code: &[u8; 5]) -> [u8; 6] { + let mut terminated = [0; 6]; + for (i, &elem) in code.iter().enumerate() { + terminated[i] = elem; + } + + terminated +} + #[cfg(test)] mod tests { use super::*; @@ -965,12 +974,3 @@ mod tests { assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]); } } - -fn terminate_code(code: &[u8; 5]) -> [u8; 6] { - let mut terminated = [0; 6]; - for (i, &elem) in code.iter().enumerate() { - terminated[i] = elem; - } - - terminated -} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 0e9c237e1e..92ef793a34 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50; pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services -/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html +/// /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests -/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ +/// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. -/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax +/// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; @@ -50,6 +50,12 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct RemotePath(PathBuf); +impl std::fmt::Display for RemotePath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0.display()) + } +} + impl RemotePath { pub fn new(relative_path: &Path) -> anyhow::Result { anyhow::ensure!( diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index ca5fbd5de5..f1095ad8b8 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -7,6 +7,7 @@ use std::{ borrow::Cow, future::Future, + io::ErrorKind, path::{Path, PathBuf}, pin::Pin, }; @@ -150,10 +151,7 @@ impl RemoteStorage for LocalFs { let mut files = vec![]; let mut directory_queue = vec![full_path.clone()]; - while !directory_queue.is_empty() { - let cur_folder = directory_queue - .pop() - .expect("queue cannot be empty: we just checked"); + while let Some(cur_folder) = directory_queue.pop() { let mut entries = fs::read_dir(cur_folder.clone()).await?; while let Some(entry) = entries.next_entry().await? 
{ let file_name: PathBuf = entry.file_name().into(); @@ -343,18 +341,14 @@ impl RemoteStorage for LocalFs { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); - if !file_path.exists() { + match fs::remove_file(&file_path).await { + Ok(()) => Ok(()), + // The file doesn't exist. This shouldn't yield an error to mirror S3's behaviour. // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful. - return Ok(()); + Err(e) if e.kind() == ErrorKind::NotFound => Ok(()), + Err(e) => Err(anyhow::anyhow!(e)), } - - if !file_path.is_file() { - anyhow::bail!("{file_path:?} is not a file"); - } - Ok(fs::remove_file(file_path) - .await - .map_err(|e| anyhow::anyhow!(e))?) } async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index 093b053675..f05997ee65 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; // 2. D+C+a+b // 3. D+A+B -/// [`Segment`] which has had it's size calculated. +/// `Segment` which has had its size calculated. #[derive(Clone, Debug)] struct SegmentSize { method: SegmentMethod, diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index 3f80f49de1..f5ab267ff3 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -33,7 +33,7 @@ pub enum OtelName<'a> { /// directly into HTTP servers. However, I couldn't find one for Hyper, /// so I had to write our own. OpenTelemetry website has a registry of /// instrumentation libraries at: -/// https://opentelemetry.io/registry/?language=rust&component=instrumentation +/// /// If a Hyper crate appears, consider switching to that. pub async fn tracing_handler( req: Request, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 87b0082356..e7c8323c1d 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,8 @@ pq_proto.workspace = true metrics.workspace = true workspace_hack.workspace = true +const_format.workspace = true + [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 9c153033cb..70e682cb76 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -14,7 +14,7 @@ pub async fn json_request Deserialize<'de>>( .map_err(ApiError::BadRequest) } -/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282 +/// Will be removed as part of pub async fn json_request_or_empty_body Deserialize<'de>>( request: &mut Request, ) -> Result, ApiError> { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 69d3a1b9f2..3bcb092ba7 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -109,10 +109,16 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper; /// * building in docker (either in CI or locally) /// /// One thing to note is that .git is not available in docker (and it is bad to include it there). -/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required. -/// It takes variable from build process env and puts it to the rustc env. 
And then we can retrieve it here by using env! macro. -/// Git version received from environment variable used as a fallback in git_version invocation. -/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. +/// When building locally, the `git_version` is used to query .git. When building on CI and docker, +/// we don't build the actual PR branch commits, but always a "phantom" would be merge commit to +/// the target branch -- the actual PR commit from which we build from is supplied as GIT_VERSION +/// environment variable. +/// +/// We ended up with this compromise between phantom would be merge commits vs. pull request branch +/// heads due to old logs becoming more reliable (github could gc the phantom merge commit +/// anytime) in #4641. +/// +/// To avoid running buildscript every recompilation, we use rerun-if-env-changed option. /// So the build script will be run only when GIT_VERSION envvar has changed. /// /// Why not to use buildscript to get git commit sha directly without procmacro from different crate? @@ -124,25 +130,36 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper; /// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. /// /// ############################################################################################# -/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details. -/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036 +/// TODO this macro is not the way the library is intended to be used, see for details. +/// We use `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. #[macro_export] macro_rules! project_git_version { ($const_identifier:ident) => { - const $const_identifier: &str = git_version::git_version!( - prefix = "git:", - fallback = concat!( - "git-env:", - env!("GIT_VERSION", "Missing GIT_VERSION envvar") - ), - args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha - ); + // this should try GIT_VERSION first only then git_version::git_version! + const $const_identifier: &::core::primitive::str = { + const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! { + prefix = "", + fallback = "unknown", + args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha + }; + + const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("GIT_VERSION") { + ::core::option::Option::Some(x) => ["git-env:", x], + ::core::option::Option::None => ["git:", __COMMIT_FROM_GIT], + }; + + $crate::__const_format::concatcp!(__ARG[0], __ARG[1]) + }; }; } +/// Re-export for `project_git_version` macro +#[doc(hidden)] +pub use const_format as __const_format; + /// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime. #[macro_export] macro_rules! const_assert { diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index adbf47eb7a..ca8295040c 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,9 +1,10 @@ //! A module to create and read lock files. //! //! 
File locking is done using [`fcntl::flock`] exclusive locks. -//! The only consumer of this module is currently [`pid_file`]. -//! See the module-level comment there for potential pitfalls -//! with lock files that are used to store PIDs (pidfiles). +//! The only consumer of this module is currently +//! [`pid_file`](crate::pid_file). See the module-level comment +//! there for potential pitfalls with lock files that are used +//! to store PIDs (pidfiles). use std::{ fs, @@ -81,7 +82,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result TracingPanicHookGuard { diff --git a/libs/utils/src/measured_stream.rs b/libs/utils/src/measured_stream.rs index c37d686a1d..c82fc13109 100644 --- a/libs/utils/src/measured_stream.rs +++ b/libs/utils/src/measured_stream.rs @@ -1,4 +1,5 @@ use pin_project_lite::pin_project; +use std::io::Read; use std::pin::Pin; use std::{io, task}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; @@ -75,3 +76,34 @@ impl AsyncWrite for MeasuredStream { + inner: R, + byte_count: usize, +} + +impl MeasuredReader { + pub fn new(reader: R) -> Self { + Self { + inner: reader, + byte_count: 0, + } + } + + pub fn get_byte_count(&self) -> usize { + self.byte_count + } +} + +impl Read for MeasuredReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let result = self.inner.read(buf); + if let Ok(n_bytes) = result { + self.byte_count += n_bytes + } + result + } +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 70cf4a1ce9..014887392e 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -23,9 +23,9 @@ pub enum SeqWaitError { /// Monotonically increasing value /// -/// It is handy to store some other fields under the same mutex in SeqWait +/// It is handy to store some other fields under the same mutex in `SeqWait` /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with -/// any type that can expose counter. is the type of exposed counter. +/// any type that can expose counter. `V` is the type of exposed counter. pub trait MonotonicCounter { /// Bump counter value and check that it goes forward /// N.B.: new_val is an actual new value, not a difference. @@ -90,7 +90,7 @@ impl Eq for Waiter {} /// [`wait_for`]: SeqWait::wait_for /// [`advance`]: SeqWait::advance /// -/// means Storage, is type of counter that this storage exposes. +/// `S` means Storage, `V` is type of counter that this storage exposes. /// pub struct SeqWait where diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index b9f7986442..926bfc3188 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -1,8 +1,15 @@ //! Assert that the current [`tracing::Span`] has a given set of fields. //! +//! Can only produce meaningful positive results when tracing has been configured as in example. +//! Absence of `tracing_error::ErrorLayer` is not detected yet. +//! +//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing +//! is completly unconfigured. +//! //! # Usage //! -//! ``` +//! ```rust +//! # fn main() { //! use tracing_subscriber::prelude::*; //! let registry = tracing_subscriber::registry() //! .with(tracing_error::ErrorLayer::default()); @@ -20,23 +27,18 @@ //! //! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; //! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); -//! match check_fields_present([&extractor]) { -//! Ok(()) => {}, -//! 
Err(missing) => { -//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::>()); -//! } +//! if let Err(missing) = check_fields_present!([&extractor]) { +//! // if you copypaste this to a custom assert method, remember to add #[track_caller] +//! // to get the "user" code location for the panic. +//! panic!("Missing fields: {missing:?}"); //! } +//! # } //! ``` //! -//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering +//! Recommended reading: //! -use std::{ - collections::HashSet, - fmt::{self}, - hash::{Hash, Hasher}, -}; - +#[derive(Debug)] pub enum ExtractionResult { Present, Absent, @@ -71,51 +73,105 @@ impl Extractor for MultiNameExtractor { } } -struct MemoryIdentity<'a>(&'a dyn Extractor); - -impl<'a> MemoryIdentity<'a> { - fn as_ptr(&self) -> *const () { - self.0 as *const _ as *const () - } -} -impl<'a> PartialEq for MemoryIdentity<'a> { - fn eq(&self, other: &Self) -> bool { - self.as_ptr() == other.as_ptr() - } -} -impl<'a> Eq for MemoryIdentity<'a> {} -impl<'a> Hash for MemoryIdentity<'a> { - fn hash(&self, state: &mut H) { - self.as_ptr().hash(state); - } -} -impl<'a> fmt::Debug for MemoryIdentity<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) - } -} - -/// The extractor names passed as keys to [`new`]. -pub fn check_fields_present( +/// Checks that the given extractors are satisfied with the current span hierarchy. +/// +/// This should not be called directly, but used through [`check_fields_present`] which allows +/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default. +#[doc(hidden)] +pub fn check_fields_present0( must_be_present: [&dyn Extractor; L], -) -> Result<(), Vec<&dyn Extractor>> { - let mut missing: HashSet = - HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r))); +) -> Result> { + let mut missing = must_be_present.into_iter().collect::>(); let trace = tracing_error::SpanTrace::capture(); trace.with_spans(|md, _formatted_fields| { - missing.retain(|extractor| match extractor.0.extract(md.fields()) { + // when trying to understand the inner workings of how does the matching work, note that + // this closure might be called zero times if the span is disabled. normally it is called + // once per span hierarchy level. + missing.retain(|extractor| match extractor.extract(md.fields()) { ExtractionResult::Present => false, ExtractionResult::Absent => true, }); - !missing.is_empty() // continue walking up until we've found all missing + + // continue walking up until we've found all missing + !missing.is_empty() }); if missing.is_empty() { - Ok(()) + Ok(Summary::FoundEverything) + } else if !tracing_subscriber_configured() { + Ok(Summary::Unconfigured) } else { - Err(missing.into_iter().map(|mi| mi.0).collect()) + // we can still hit here if a tracing subscriber has been configured but the ErrorLayer is + // missing, which can be annoying. for this case, we could probably use + // SpanTrace::status(). + // + // another way to end up here is with RUST_LOG=pageserver=off while configuring the + // logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid. + // this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`. + Err(missing) } } +/// Checks that the given extractors are satisfied with the current span hierarchy. 
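Not part of this diff: the updated doc example above reminds callers to add `#[track_caller]` to any custom assert helper built on top of the new macro. A minimal hedged sketch of such a helper, assuming the `utils` crate re-exports shown in these hunks; the extractor name and the field name are placeholders.

```rust
// Hypothetical helper; relies on the `check_fields_present` re-export and the
// `MultiNameExtractor`/`Extractor` items from utils::tracing_span_assert.
use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};

#[track_caller] // so the panic points at the caller, not at this helper
pub fn assert_current_span_has_tenant_id() {
    // Field name is illustrative; use whatever your spans actually record.
    let extractor = MultiNameExtractor::new("TenantId", ["tenant_id"]);
    let extractors: [&dyn Extractor; 1] = [&extractor];
    if let Err(missing) = check_fields_present!(extractors) {
        // In #[cfg(test)] builds an unconfigured subscriber already passes inside the
        // macro, so reaching this branch means the fields really are missing.
        panic!("current span hierarchy is missing fields: {missing:?}");
    }
}
```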
+/// +/// The macro is the preferred way of checking if fields exist while passing checks if a test does +/// not have tracing configured. +/// +/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present. +/// However we can game a module namespaced macro for `use` purposes by re-exporting the +/// #[macro_export] exported name with an alias (below). +#[doc(hidden)] +#[macro_export] +macro_rules! __check_fields_present { + ($extractors:expr) => {{ + { + use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor}; + + match check_fields_present0($extractors) { + Ok(FoundEverything) => Ok(()), + Ok(Unconfigured) if cfg!(test) => { + // allow unconfigured in tests + Ok(()) + }, + Ok(Unconfigured) => { + panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer") + }, + Err(missing) => Err(missing) + } + } + }} +} + +pub use crate::__check_fields_present as check_fields_present; + +/// Explanation for why the check was deemed ok. +/// +/// Mainly useful for testing, or configuring per-crate behaviour as in with +/// [`check_fields_present`]. +#[derive(Debug)] +pub enum Summary { + /// All extractors were found. + /// + /// Should only happen when tracing is properly configured. + FoundEverything, + + /// Tracing has not been configured at all. This is ok for tests running without tracing set + /// up. + Unconfigured, +} + +fn tracing_subscriber_configured() -> bool { + let mut noop_configured = false; + tracing::dispatcher::get_default(|d| { + // it is possible that this closure will not be invoked, but the current implementation + // always invokes it + noop_configured = d + .downcast_ref::() + .is_some(); + }); + + !noop_configured +} + #[cfg(test)] mod tests { @@ -123,6 +179,36 @@ mod tests { use super::*; + use std::{ + collections::HashSet, + fmt::{self}, + hash::{Hash, Hasher}, + }; + + struct MemoryIdentity<'a>(&'a dyn Extractor); + + impl<'a> MemoryIdentity<'a> { + fn as_ptr(&self) -> *const () { + self.0 as *const _ as *const () + } + } + impl<'a> PartialEq for MemoryIdentity<'a> { + fn eq(&self, other: &Self) -> bool { + self.as_ptr() == other.as_ptr() + } + } + impl<'a> Eq for MemoryIdentity<'a> {} + impl<'a> Hash for MemoryIdentity<'a> { + fn hash(&self, state: &mut H) { + self.as_ptr().hash(state); + } + } + impl<'a> fmt::Debug for MemoryIdentity<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + } + } + struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, tenant_extractor: MultiNameExtractor<2>, @@ -159,7 +245,8 @@ mod tests { let setup = setup_current_thread(); let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); let _guard = span.enter(); - check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]); + assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] @@ -167,8 +254,8 @@ mod tests { let setup = setup_current_thread(); let span = tracing::info_span!("root", timeline_id = "timeline-1"); let _guard = span.enter(); - let missing = - check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err(); + let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]) + .unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); 
} @@ -185,7 +272,8 @@ mod tests { let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); let _guard = span.enter(); - check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap(); + let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]); + assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] @@ -198,7 +286,7 @@ mod tests { let span = tracing::info_span!("child", timeline_id = "timeline-1"); let _guard = span.enter(); - let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } @@ -207,7 +295,8 @@ mod tests { let setup = setup_current_thread(); let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1"); let _guard = span.enter(); - check_fields_present([&setup.tenant_extractor]).unwrap(); + let res = check_fields_present0([&setup.tenant_extractor]); + assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] @@ -223,7 +312,8 @@ mod tests { let span = tracing::info_span!("grandchild", timeline_id = "timeline-1"); let _guard = span.enter(); - check_fields_present([&setup.tenant_extractor]).unwrap(); + let res = check_fields_present0([&setup.tenant_extractor]); + assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); } #[test] @@ -231,7 +321,7 @@ mod tests { let setup = setup_current_thread(); let span = tracing::info_span!("root", timeline_id = "timeline-1"); let _guard = span.enter(); - let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } @@ -245,43 +335,107 @@ mod tests { let span = tracing::info_span!("child", timeline_id = "timeline-1"); let _guard = span.enter(); - let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err(); + let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err(); assert_missing(missing, vec![&setup.tenant_extractor]); } #[test] - fn tracing_error_subscriber_not_set_up() { + fn tracing_error_subscriber_not_set_up_straight_line() { // no setup - let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); let extractor = MultiNameExtractor::new("E", ["e"]); - let missing = check_fields_present([&extractor]).unwrap_err(); - assert_missing(missing, vec![&extractor]); + let res = check_fields_present0([&extractor]); + assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); + + // similarly for a not found key + let extractor = MultiNameExtractor::new("F", ["foobar"]); + let res = check_fields_present0([&extractor]); + assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } #[test] - #[should_panic] - fn panics_if_tracing_error_subscriber_has_wrong_filter() { + fn tracing_error_subscriber_not_set_up_with_instrument() { + // no setup + + // demo a case where span entering is used to establish a parent child connection, but + // when we re-enter the subspan SpanTrace::with_spans iterates over nothing. 
+ let span = tracing::info_span!("foo", e = "some value"); + let _guard = span.enter(); + + let subspan = tracing::info_span!("bar", f = "foobar"); + drop(_guard); + + // normally this would work, but without any tracing-subscriber configured, both + // check_field_present find nothing + let _guard = subspan.enter(); + let extractors: [&dyn Extractor; 2] = [ + &MultiNameExtractor::new("E", ["e"]), + &MultiNameExtractor::new("F", ["f"]), + ]; + + let res = check_fields_present0(extractors); + assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); + + // similarly for a not found key + let extractor = MultiNameExtractor::new("G", ["g"]); + let res = check_fields_present0([&extractor]); + assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); + } + + #[test] + fn tracing_subscriber_configured() { + // this will fail if any utils::logging::init callers appear, but let's hope they do not + // appear. + assert!(!super::tracing_subscriber_configured()); + + let _g = setup_current_thread(); + + assert!(super::tracing_subscriber_configured()); + } + + #[test] + fn not_found_when_disabled_by_filter() { let r = tracing_subscriber::registry().with({ - tracing_error::ErrorLayer::default().with_filter( - tracing_subscriber::filter::dynamic_filter_fn(|md, _| { - if md.is_span() && *md.level() == tracing::Level::INFO { - return false; - } - true - }), - ) + tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn( + |md| !(md.is_span() && *md.level() == tracing::Level::INFO), + )) }); let _guard = tracing::subscriber::set_default(r); + // this test is a rather tricky one, it has a number of possible outcomes depending on the + // execution order when executed with other tests even if no test sets the global default + // subscriber. + let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractor = MultiNameExtractor::new("E", ["e"]); - let missing = check_fields_present([&extractor]).unwrap_err(); - assert_missing(missing, vec![&extractor]); + let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])]; + + if span.is_disabled() { + // the tests are running single threaded, or we got lucky and no other tests subscriber + // was got to register their per-CALLSITE::META interest between `set_default` and + // creation of the span, thus the filter got to apply and registered interest of Never, + // so the span was never created. + // + // as the span is disabled, no keys were recorded to it, leading check_fields_present0 + // to find an error. + + let missing = check_fields_present0(extractors).unwrap_err(); + assert_missing(missing, vec![extractors[0]]); + } else { + // when the span is enabled, it is because some other test is running at the same time, + // and that tests registry has filters which are interested in our above span. + // + // because the span is now enabled, all keys will be found for it. the + // tracing_error::SpanTrace does not consider layer filters during the span hierarchy + // walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in + // this test-induced issue. 
+ + let res = check_fields_present0(extractors); + assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}"); + } } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index ea81544cbe..9381ed0bfa 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,7 @@ testing = ["fail/failpoints"] [dependencies] anyhow.workspace = true +async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true byteorder.workspace = true @@ -24,6 +25,7 @@ consumption_metrics.workspace = true crc32c.workspace = true crossbeam-utils.workspace = true either.workspace = true +flate2.workspace = true fail.workspace = true futures.workspace = true git-version.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 03bb7a5bfd..f7a5832844 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,8 +1,8 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName}; -use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc}; +use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::PersistentLayerDesc; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; @@ -28,13 +28,13 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { for fname in filenames { let fname = fname.unwrap(); let fname = LayerFileName::from_str(&fname).unwrap(); - let layer = LayerDescriptor::from(fname); + let layer = PersistentLayerDesc::from(fname); let lsn_range = layer.get_lsn_range(); min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(layer.layer_desc().clone()); + updates.insert_historic(layer); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -210,15 +210,15 @@ fn bench_sequential(c: &mut Criterion) { for i in 0..100_000 { let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = LayerDescriptor::from(PersistentLayerDesc::new_img( + let layer = PersistentLayerDesc::new_img( TenantId::generate(), TimelineId::generate(), zero.add(10 * i32)..zero.add(10 * i32 + 1), Lsn(i), false, 0, - )); - updates.insert_historic(layer.layer_desc().clone()); + ); + updates.insert_historic(layer); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index bfde5ba054..568078808f 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -7,10 +7,10 @@ //! - The y axis represents LSN, growing upwards. //! //! Coordinates in both axis are compressed for better readability. -//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb) +//! (see ) //! //! Example use: -//! ``` +//! ```bash //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ //! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg //! $ firefox out.svg @@ -20,7 +20,7 @@ //! or from pageserver log files. //! //! TODO Consider shipping this as a grafana panel plugin: -//! https://grafana.com/tutorials/build-a-panel-plugin/ +//! 
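The module doc above notes that both axes are compressed (coordinate compression) before drawing. As a standalone illustration of that technique, not code from this repository, here is a minimal sketch that maps widely spread `u64` coordinates onto a dense `0..n` index space:

```rust
use std::collections::BTreeSet;

/// Map each distinct coordinate to its rank, so sparse key/LSN values become
/// consecutive integers that are easier to lay out in an SVG.
fn compress(coords: &[u64]) -> Vec<usize> {
    // Sorted, de-duplicated list of all values that actually occur.
    let distinct: Vec<u64> = coords.iter().copied().collect::<BTreeSet<_>>().into_iter().collect();
    coords
        .iter()
        .map(|c| distinct.binary_search(c).expect("value came from the same input"))
        .collect()
}

fn main() {
    let lsns = [10_000_u64, 20, 10_000, 999_999, 20];
    assert_eq!(compress(&lsns), vec![1, 0, 1, 2, 0]);
    println!("compressed: {:?}", compress(&lsns));
}
```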
use anyhow::Result; use pageserver::repository::Key; use std::cmp::Ordering; @@ -117,7 +117,8 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; - let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -128,7 +129,7 @@ pub fn main() -> Result<()> { num_images += 1; lsn_diff = 0.3; lsn_offset = -lsn_diff / 2.0; - margin = 0.05; + ymargin = 0.05; fill = Fill::Color(rgb(0, 0, 0)); } Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), @@ -137,10 +138,10 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - key_start as f32 + stretch * margin, - stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)), - key_diff as f32 - stretch * 2.0 * margin, - stretch * (lsn_diff - 2.0 * margin) + key_start as f32 + stretch * xmargin, + stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + key_diff as f32 - stretch * 2.0 * xmargin, + stretch * (lsn_diff - 2.0 * ymargin) ) .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index c666fc785c..d2dc759835 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -19,12 +19,6 @@ use tokio::io; use tokio::io::AsyncWrite; use tracing::*; -/// NB: This relies on a modified version of tokio_tar that does *not* write the -/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped -/// without explicitly calling 'finish' or 'into_inner'! -/// -/// See https://github.com/neondatabase/tokio-tar/pull/1 -/// use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2046d27b1e..4c6df469aa 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -171,11 +171,13 @@ pub struct PageServerConf { pub log_format: LogFormat, - /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. + /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`. /// See the comment in `eviction_task` for details. + /// + /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore, // How often to collect metrics and send them to the metrics endpoint. @@ -570,21 +572,21 @@ impl PageServerConf { .join(TENANT_ATTACHING_MARKER_FILENAME) } - pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf { - self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME) + pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) } /// Points to a place in pageserver's local directory, /// where certain tenant's tenantconf file should be located. 
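Several hunks in `config.rs`, `context.rs`, `metrics.rs`, and `task_mgr.rs` rewrite doc links so rustdoc stops emitting broken intra-doc-link warnings. As a quick reminder of the two styles being used, here is a small self-contained illustration (not repository code, item names are invented):

```rust
pub mod tenant {
    pub struct Tenant;
    impl Tenant {
        pub fn gather_size_inputs(&self) {}
    }
}

/// Inline style: the link target is spelled out next to the link text.
///
/// See [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs).
pub struct InlineStyle;

/// Reference style: the text stays short and the target is defined once below,
/// which is the form the hunks above switch to for repeated links.
///
/// See [`Tenant::gather_size_inputs`] for details.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
pub struct ReferenceStyle;
```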
- pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { - self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) + pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME) } pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } - pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf { + pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf { self.timelines_path(tenant_id).join(timeline_id.to_string()) } @@ -594,7 +596,7 @@ impl PageServerConf { timeline_id: TimelineId, ) -> PathBuf { path_with_suffix_extension( - self.timeline_path(&timeline_id, &tenant_id), + self.timeline_path(&tenant_id, &timeline_id), TIMELINE_UNINIT_MARK_SUFFIX, ) } @@ -617,8 +619,8 @@ impl PageServerConf { /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. - pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { - self.timeline_path(&timeline_id, &tenant_id) + pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf { + self.timeline_path(tenant_id, timeline_id) .join(METADATA_FILE_NAME) } @@ -993,6 +995,8 @@ impl ConfigurableSemaphore { /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will /// behave like [`futures::future::pending`], just waiting until new permits are added. + /// + /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs pub fn new(initial_permits: NonZeroUsize) -> Self { ConfigurableSemaphore { initial_permits, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f7592afc12..df44300fce 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -234,14 +234,18 @@ pub async fn collect_metrics_iteration( // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one let tenant_synthetic_size = tenant.get_cached_synthetic_size(); - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: SYNTHETIC_STORAGE_SIZE, - }, - tenant_synthetic_size, - )); + + if tenant_synthetic_size != 0 { + // only send non-zeroes because otherwise these show up as errors in logs + current_metrics.push(( + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: SYNTHETIC_STORAGE_SIZE, + }, + tenant_synthetic_size, + )); + } } // Filter metrics, unless we want to send all metrics, including cached ones. diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index f53b7736ab..a1a5c30ae9 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -179,6 +179,9 @@ impl RequestContext { /// a context and you are unwilling to change all callers to provide one. /// /// Before we add cancellation, we should get rid of this method. 
+ /// + /// [`attached_child`]: Self::attached_child + /// [`detached_child`]: Self::detached_child pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { Self::new(task_kind, download_behavior) } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 61cbd5066f..b2ca9ab0bb 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -110,7 +110,6 @@ pub fn launch_disk_usage_global_eviction_task( disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel) .await; - info!("disk usage based eviction task finishing"); Ok(()) }, ); @@ -126,13 +125,16 @@ async fn disk_usage_eviction_task( tenants_dir: &Path, cancel: CancellationToken, ) { + scopeguard::defer! { + info!("disk usage based eviction task finishing"); + }; + use crate::tenant::tasks::random_init_delay; { if random_init_delay(task_config.period, &cancel) .await .is_err() { - info!("shutting down"); return; } } @@ -167,7 +169,6 @@ async fn disk_usage_eviction_task( tokio::select! { _ = tokio::time::sleep_until(sleep_until) => {}, _ = cancel.cancelled() => { - info!("shutting down"); break } } @@ -304,7 +305,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { debug!( - "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", + "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}", i + 1, candidates.len(), candidate.layer.file_size(), @@ -314,7 +315,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( partition, candidate.layer.get_tenant_id(), candidate.layer.get_timeline_id(), - candidate.layer.filename().file_name(), + candidate.layer, ); } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 58dcbb2aac..08fb917fb6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -346,7 +346,7 @@ async fn timeline_create_handler( Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await } @@ -381,7 +381,7 @@ async fn timeline_list_handler( } Ok::, ApiError>(response_data) } - .instrument(info_span!("timeline_list", tenant = %tenant_id)) + .instrument(info_span!("timeline_list", %tenant_id)) .await?; json_response(StatusCode::OK, response_data) @@ -418,7 +418,7 @@ async fn timeline_detail_handler( Ok::<_, ApiError>(timeline_info) } - .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) + .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id)) .await?; json_response(StatusCode::OK, timeline_info) @@ -479,7 +479,7 @@ async fn tenant_attach_handler( remote_storage.clone(), &ctx, ) - .instrument(info_span!("tenant_attach", tenant = %tenant_id)) + .instrument(info_span!("tenant_attach", %tenant_id)) .await?; } else { return Err(ApiError::BadRequest(anyhow!( @@ -501,7 +501,7 @@ async fn timeline_delete_handler( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); mgr::delete_timeline(tenant_id, timeline_id, &ctx) - 
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id)) + .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id)) .await?; // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404. @@ -519,7 +519,7 @@ async fn tenant_detach_handler( let state = get_state(&request); let conf = state.conf; mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false)) - .instrument(info_span!("tenant_detach", tenant = %tenant_id)) + .instrument(info_span!("tenant_detach", %tenant_id)) .await?; json_response(StatusCode::OK, ()) @@ -542,7 +542,7 @@ async fn tenant_load_handler( state.remote_storage.clone(), &ctx, ) - .instrument(info_span!("load", tenant = %tenant_id)) + .instrument(info_span!("load", %tenant_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -558,7 +558,7 @@ async fn tenant_ignore_handler( let state = get_state(&request); let conf = state.conf; mgr::ignore_tenant(conf, tenant_id) - .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) + .instrument(info_span!("ignore_tenant", %tenant_id)) .await?; json_response(StatusCode::OK, ()) @@ -611,7 +611,7 @@ async fn tenant_status( attachment_status: state.attachment_status(), }) } - .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) + .instrument(info_span!("tenant_status_handler", %tenant_id)) .await?; json_response(StatusCode::OK, tenant_info) @@ -850,7 +850,7 @@ async fn tenant_create_handler( state.remote_storage.clone(), &ctx, ) - .instrument(info_span!("tenant_create", tenant = ?target_tenant_id)) + .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id)) .await?; // We created the tenant. Existing API semantics are that the tenant @@ -912,7 +912,7 @@ async fn update_tenant_config_handler( let state = get_state(&request); mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) - .instrument(info_span!("tenant_config", tenant = ?tenant_id)) + .instrument(info_span!("tenant_config", %tenant_id)) .await?; json_response(StatusCode::OK, ()) @@ -1143,7 +1143,7 @@ async fn disk_usage_eviction_run( let Some(storage) = state.remote_storage.clone() else { return Err(ApiError::InternalServerError(anyhow::anyhow!( "remote storage not configured, cannot run eviction iteration" - ))) + ))); }; let state = state.disk_usage_eviction_state.clone(); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index db5bccdbba..ee8dfba69a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,9 +1,9 @@ use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, - register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, - Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, - UIntGauge, UIntGaugeVec, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, + register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter, + IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::models::TenantState; @@ -203,6 +203,49 @@ pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { }, }); +pub struct PageCacheSizeMetrics { + pub max_bytes: UIntGauge, + + pub current_bytes_ephemeral: UIntGauge, + pub current_bytes_immutable: UIntGauge, + pub current_bytes_materialized_page: UIntGauge, +} + 
+static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_page_cache_size_current_bytes", + "Current size of the page cache in bytes, by key kind", + &["key_kind"] + ) + .expect("failed to define a metric") +}); + +pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheSizeMetrics { + max_bytes: { + register_uint_gauge!( + "pageserver_page_cache_size_max_bytes", + "Maximum size of the page cache in bytes" + ) + .expect("failed to define a metric") + }, + + current_bytes_ephemeral: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["ephemeral"]) + .unwrap() + }, + current_bytes_immutable: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + current_bytes_materialized_page: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, +}); + static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_wait_lsn_seconds", @@ -342,7 +385,7 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. +/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] pub struct EvictionsWithLowResidenceDuration { data_source: &'static str, @@ -498,6 +541,17 @@ pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +// keep in sync with control plane Go code so that we can validate +// compute's basebackup_ms metric with our perspective in the context of SLI/SLO. +static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { + // Go code uses milliseconds. Variable is called `computeStartupBuckets` + [ + 5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000, + 1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000, + ] + .map(|ms| (ms as f64) / 1000.0) +}); + pub struct BasebackupQueryTime(HistogramVec); pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { BasebackupQueryTime({ @@ -505,7 +559,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { "pageserver_basebackup_query_seconds", "Histogram of basebackup queries durations, by result type", &["result"], - CRITICAL_OP_BUCKETS.into(), + COMPUTE_STARTUP_BUCKETS.to_vec(), ) .expect("failed to define a metric") }) @@ -775,7 +829,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); -/// Similar to [`prometheus::HistogramTimer`] but does not record on drop. +/// Similar to `prometheus::HistogramTimer` but does not record on drop. pub struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, @@ -833,7 +887,7 @@ impl StorageTimeMetrics { /// Starts timing a new operation. /// - /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop. + /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop. pub fn start_timer(&self) -> StorageTimeMetricsTimer { StorageTimeMetricsTimer::new(self.clone()) } @@ -1213,7 +1267,7 @@ impl RemoteTimelineClientMetrics { /// Update the metrics that change when a call to the remote timeline client instance starts. /// /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions. 
- /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that + /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that /// is more suitable. /// Never do both. pub(crate) fn call_begin( @@ -1246,7 +1300,7 @@ impl RemoteTimelineClientMetrics { /// Manually udpate the metrics that track completions, instead of using the guard object. /// Using the guard object is generally preferable. - /// See [`call_begin`] for more context. + /// See [`call_begin`](Self::call_begin) for more context. pub(crate) fn call_end( &self, file_kind: &RemoteOpFileKind, diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index ef0e748d10..e29eb1d197 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,8 +53,8 @@ use utils::{ lsn::Lsn, }; -use crate::repository::Key; use crate::tenant::writeback_ephemeral_file; +use crate::{metrics::PageCacheSizeMetrics, repository::Key}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -187,6 +187,8 @@ pub struct PageCache { /// Index of the next candidate to evict, for the Clock replacement algorithm. /// This is interpreted modulo the page cache size. next_evict_slot: AtomicUsize, + + size_metrics: &'static PageCacheSizeMetrics, } /// @@ -718,6 +720,9 @@ impl PageCache { if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) { versions.remove(version_idx); + self.size_metrics + .current_bytes_materialized_page + .sub_page_sz(1); if versions.is_empty() { old_entry.remove_entry(); } @@ -730,11 +735,13 @@ impl PageCache { let mut map = self.ephemeral_page_map.write().unwrap(); map.remove(&(*file_id, *blkno)) .expect("could not find old key in mapping"); + self.size_metrics.current_bytes_ephemeral.sub_page_sz(1); } CacheKey::ImmutableFilePage { file_id, blkno } => { let mut map = self.immutable_page_map.write().unwrap(); map.remove(&(*file_id, *blkno)) .expect("could not find old key in mapping"); + self.size_metrics.current_bytes_immutable.sub_page_sz(1); } } } @@ -762,6 +769,9 @@ impl PageCache { slot_idx, }, ); + self.size_metrics + .current_bytes_materialized_page + .add_page_sz(1); None } } @@ -772,6 +782,7 @@ impl PageCache { Entry::Occupied(entry) => Some(*entry.get()), Entry::Vacant(entry) => { entry.insert(slot_idx); + self.size_metrics.current_bytes_ephemeral.add_page_sz(1); None } } @@ -782,6 +793,7 @@ impl PageCache { Entry::Occupied(entry) => Some(*entry.get()), Entry::Vacant(entry) => { entry.insert(slot_idx); + self.size_metrics.current_bytes_immutable.add_page_sz(1); None } } @@ -881,6 +893,12 @@ impl PageCache { let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice()); + let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; + size_metrics.max_bytes.set_page_sz(num_pages); + size_metrics.current_bytes_ephemeral.set_page_sz(0); + size_metrics.current_bytes_immutable.set_page_sz(0); + size_metrics.current_bytes_materialized_page.set_page_sz(0); + let slots = page_buffer .chunks_exact_mut(PAGE_SZ) .map(|chunk| { @@ -903,6 +921,30 @@ impl PageCache { immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), + size_metrics, } } } + +trait PageSzBytesMetric { + fn set_page_sz(&self, count: usize); + fn add_page_sz(&self, count: usize); + fn sub_page_sz(&self, count: usize); +} + +#[inline(always)] +fn count_times_page_sz(count: usize) -> u64 { + u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap() +} + 
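Taken together, the pieces above let hot paths bump a pre-resolved gauge by whole pages while Prometheus still sees bytes. A rough usage sketch of the same pattern, not the actual `PageCache` code, assuming the workspace's `metrics` wrapper crate whose re-exports appear in these hunks; `PAGE_SZ` and the metric name are stand-ins:

```rust
use metrics::{register_uint_gauge_vec, UIntGauge, UIntGaugeVec};
use once_cell::sync::Lazy;

const PAGE_SZ: usize = 8192; // assumed page size for the sketch

static CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "example_cache_size_current_bytes",
        "Current size of an example cache in bytes, by key kind",
        &["key_kind"]
    )
    .expect("failed to define a metric")
});

fn bytes(pages: usize) -> u64 {
    u64::try_from(pages).unwrap() * u64::try_from(PAGE_SZ).unwrap()
}

fn main() {
    // Resolve the label once, up front; hot-path updates are then just atomic adds.
    let ephemeral: UIntGauge = CURRENT_BYTES
        .get_metric_with_label_values(&["ephemeral"])
        .unwrap();

    ephemeral.add(bytes(1)); // one page inserted
    ephemeral.sub(bytes(1)); // and evicted again
}
```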
+impl PageSzBytesMetric for metrics::UIntGauge { + fn set_page_sz(&self, count: usize) { + self.set(count_times_page_sz(count)); + } + fn add_page_sz(&self, count: usize) { + self.add(count_times_page_sz(count)); + } + fn sub_page_sz(&self, count: usize) { + self.sub(count_times_page_sz(count)); + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8728559d72..35dd5ecdb5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,6 +10,7 @@ // use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use bytes::Bytes; use futures::Stream; @@ -31,8 +32,10 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; +use tracing::field; use tracing::*; use utils::id::ConnectionId; use utils::{ @@ -51,6 +54,7 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::mgr; use crate::tenant::mgr::GetTenantError; use crate::tenant::{Tenant, Timeline}; @@ -238,6 +242,7 @@ pub async fn libpq_listener_main( Ok(()) } +#[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, @@ -260,6 +265,7 @@ async fn page_service_conn_main( .context("could not set TCP_NODELAY")?; let peer_addr = socket.peer_addr().context("get peer address")?; + tracing::Span::current().record("peer_addr", field::display(peer_addr)); // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements: // - long enough for most valid compute connections @@ -362,7 +368,7 @@ impl PageServerHandler { } } - #[instrument(skip(self, pgb, ctx))] + #[instrument(skip_all)] async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, @@ -373,6 +379,8 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + debug_assert_current_span_has_tenant_and_timeline_id(); + // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); @@ -473,7 +481,7 @@ impl PageServerHandler { } #[allow(clippy::too_many_arguments)] - #[instrument(skip(self, pgb, ctx))] + #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, @@ -487,6 +495,8 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + debug_assert_current_span_has_tenant_and_timeline_id(); + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); @@ -531,7 +541,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb, ctx))] + #[instrument(skip_all, fields(%start_lsn, %end_lsn))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -544,6 +554,7 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + debug_assert_current_span_has_tenant_and_timeline_id(); task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; @@ -738,7 +749,7 @@ impl PageServerHandler { } #[allow(clippy::too_many_arguments)] - #[instrument(skip(self, pgb, ctx))] + 
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( &mut self, pgb: &mut PostgresBackend, @@ -747,11 +758,14 @@ impl PageServerHandler { lsn: Option, prev_lsn: Option, full_backup: bool, + gzip: bool, ctx: RequestContext, ) -> anyhow::Result<()> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + debug_assert_current_span_has_tenant_and_timeline_id(); + let started = std::time::Instant::now(); // check that the timeline exists @@ -772,8 +786,9 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.flush().await?; - // Send a tarball of the latest layer on the timeline - { + // Send a tarball of the latest layer on the timeline. Compress if not + // fullbackup. TODO Compress in that case too (tests need to be updated) + if full_backup { let mut writer = pgb.copyout_writer(); basebackup::send_basebackup_tarball( &mut writer, @@ -784,6 +799,40 @@ impl PageServerHandler { &ctx, ) .await?; + } else { + let mut writer = pgb.copyout_writer(); + if gzip { + let mut encoder = GzipEncoder::with_quality( + writer, + // NOTE using fast compression because it's on the critical path + // for compute startup. For an empty database, we get + // <100KB with this method. The Level::Best compression method + // gives us <20KB, but maybe we should add basebackup caching + // on compute shutdown first. + async_compression::Level::Fastest, + ); + basebackup::send_basebackup_tarball( + &mut encoder, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; + // shutdown the encoder to ensure the gzip footer is written + encoder.shutdown().await?; + } else { + basebackup::send_basebackup_tarball( + &mut writer, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; + } } pgb.write_message_noflush(&BeMessage::CopyDone)?; @@ -862,6 +911,7 @@ where Ok(()) } + #[instrument(skip_all, fields(tenant_id, timeline_id))] async fn process_query( &mut self, pgb: &mut PostgresBackend, @@ -883,6 +933,10 @@ where let timeline_id = TimelineId::from_str(params[1]) .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + self.check_permission(Some(tenant_id))?; self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) @@ -902,6 +956,10 @@ where let timeline_id = TimelineId::from_str(params[1]) .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + self.check_permission(Some(tenant_id))?; let lsn = if params.len() >= 3 { @@ -913,6 +971,19 @@ where None }; + let gzip = if params.len() >= 4 { + if params[3] == "--gzip" { + true + } else { + return Err(QueryError::Other(anyhow::anyhow!( + "Parameter in position 3 unknown {}", + params[3], + ))); + } + } else { + false + }; + metrics::metric_vec_duration::observe_async_block_duration_by_result( &*crate::metrics::BASEBACKUP_QUERY_TIME, async move { @@ -923,6 +994,7 @@ where lsn, None, false, + gzip, ctx, ) .await?; @@ -948,6 +1020,10 @@ where let timeline_id = TimelineId::from_str(params[1]) .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + self.check_permission(Some(tenant_id))?; let timeline = 
get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; @@ -979,6 +1055,10 @@ where let timeline_id = TimelineId::from_str(params[1]) .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + // The caller is responsible for providing correct lsn and prev_lsn. let lsn = if params.len() > 2 { Some( @@ -1000,8 +1080,17 @@ where self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) - .await?; + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + prev_lsn, + true, + false, + ctx, + ) + .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { // Import the `base` section (everything but the wal) of a basebackup. @@ -1033,6 +1122,10 @@ where let pg_version = u32::from_str(params[4]) .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + self.check_permission(Some(tenant_id))?; match self @@ -1077,6 +1170,10 @@ where let end_lsn = Lsn::from_str(params[3]) .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + self.check_permission(Some(tenant_id))?; match self @@ -1108,6 +1205,8 @@ where let tenant_id = TenantId::from_str(params[0]) .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + tracing::Span::current().record("tenant_id", field::display(tenant_id)); + self.check_permission(Some(tenant_id))?; let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a54cf9f91b..54b41f3e9d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> { /// context, breaking the atomicity is OK. If the import is interrupted, the /// whole import fails and the timeline will be deleted anyway. /// (Or to be precise, it will be left behind for debugging purposes and - /// ignored, see https://github.com/neondatabase/neon/pull/1809) + /// ignored, see ) /// /// Note: A consequence of flushing the pending operations is that they /// won't be visible to subsequent operations until `commit`. The function diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 13db38d956..9c6851bc71 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -205,7 +205,7 @@ pub enum TaskKind { /// /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// That abstraction doesn't use `task_mgr`. - /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task. + /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. 
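The new `--gzip` path above wraps the COPY-out writer in `async_compression`'s `GzipEncoder`. A stripped-down sketch of the same idea against an arbitrary `AsyncWrite` (here a `Vec<u8>`), independent of the pageserver types; the payload is a stand-in for the tarball stream, and the key detail is the explicit `shutdown()` so the gzip footer gets written:

```rust
use async_compression::tokio::write::GzipEncoder;
use async_compression::Level;
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let sink: Vec<u8> = Vec::new();

    // Fast compression: the stream sits on the compute-startup critical path,
    // so ratio is traded for latency, as the hunk above explains.
    let mut encoder = GzipEncoder::with_quality(sink, Level::Fastest);

    // Stand-in for send_basebackup_tarball writing the tar stream.
    encoder.write_all(b"pretend this is a tar stream").await?;

    // Without shutdown() the gzip footer is missing and clients see a truncated archive.
    encoder.shutdown().await?;

    let compressed = encoder.into_inner();
    println!("compressed {} bytes", compressed.len());
    Ok(())
}
```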
/// /// Once the connection is established, the `TaskHandle` task creates a @@ -213,16 +213,21 @@ pub enum TaskKind { /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. + /// + /// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler + /// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller WalReceiverManager, - /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`]. + /// The `TaskHandle` task that executes `handle_walreceiver_connection`. /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. + /// + /// [`WalReceiverManager`]: Self::WalReceiverManager WalReceiverConnectionHandler, /// The task that polls the `tokio-postgres::Connection` object. - /// Spawned by task [`WalReceiverConnectionHandler`]. - /// See the comment on [`WalReceiverManager`]. + /// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler). + /// See the comment on [`WalReceiverManager`](Self::WalReceiverManager). WalReceiverConnectionPoller, // Garbage collection worker. One per tenant diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3fa2a4bab4..142118bf6e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -11,7 +11,7 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; use futures::FutureExt; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; @@ -49,6 +49,8 @@ use std::time::{Duration, Instant}; use self::config::TenantConf; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineUninitMark; +use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -68,6 +70,7 @@ use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; use crate::InitializationOrder; +use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; use crate::walredo::WalRedoManager; @@ -81,12 +84,32 @@ use utils::{ lsn::{Lsn, RecordLsn}, }; +/// Declare a failpoint that can use the `pause` failpoint action. +/// We don't want to block the executor thread, hence, spawn_blocking + await. +macro_rules! 
pausable_failpoint { + ($name:literal) => { + if cfg!(feature = "testing") { + tokio::task::spawn_blocking({ + let current = tracing::Span::current(); + move || { + let _entered = current.entered(); + tracing::info!("at failpoint {}", $name); + fail::fail_point!($name); + } + }) + .await + .expect("spawn_blocking"); + } + }; +} + pub mod blob_io; pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; pub mod manifest; +mod span; pub mod metadata; mod par_fsync; @@ -102,7 +125,7 @@ mod timeline; pub mod size; -pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id; +pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; pub use timeline::{ LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline, }; @@ -110,7 +133,7 @@ pub use timeline::{ // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; -// re-export for use in storage_sync.rs +// re-export for use in remote_timeline_client.rs pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver @@ -161,200 +184,6 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, } -/// A timeline with some of its files on disk, being initialized. -/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or -/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory -/// to be removed on next restart. -/// -/// The caller is responsible for proper timeline data filling before the final init. -#[must_use] -pub struct UninitializedTimeline<'t> { - owning_tenant: &'t Tenant, - timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark)>, -} - -/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, -/// or gets removed eventually. -/// -/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. -#[must_use] -struct TimelineUninitMark { - uninit_mark_deleted: bool, - uninit_mark_path: PathBuf, - timeline_path: PathBuf, -} - -impl UninitializedTimeline<'_> { - /// Finish timeline creation: insert it into the Tenant's timelines map and remove the - /// uninit mark file. - /// - /// This function launches the flush loop if not already done. - /// - /// The caller is responsible for activating the timeline (function `.activate()`). 
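The `pausable_failpoint!` macro above exists because a `pause` failpoint action blocks the calling thread, which must never happen on a tokio worker. Spelled out as a self-contained sketch (the failpoint name is made up, the `fail` crate's `failpoints` feature is assumed, and the real macro additionally gates on the crate's `testing` feature):

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    // Equivalent of pausable_failpoint!("example-pause-point"): run the (possibly
    // pausing) failpoint on the blocking pool so the async executor thread is never
    // parked, and re-enter the current span so the log line keeps its fields.
    tokio::task::spawn_blocking({
        let current = tracing::Span::current();
        move || {
            let _entered = current.entered();
            tracing::info!("at failpoint example-pause-point");
            fail::fail_point!("example-pause-point");
        }
    })
    .await
    .expect("spawn_blocking");

    // Meanwhile, other async work keeps making progress on the worker threads.
    tokio::time::sleep(Duration::from_millis(10)).await;
}
```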
- fn finish_creation(mut self) -> anyhow::Result> { - let timeline_id = self.timeline_id; - let tenant_id = self.owning_tenant.tenant_id; - - let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| { - format!("No timeline for initalization found for {tenant_id}/{timeline_id}") - })?; - - // Check that the caller initialized disk_consistent_lsn - let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); - ensure!( - new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" - ); - - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - match timelines.entry(timeline_id) { - Entry::Occupied(_) => anyhow::bail!( - "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" - ), - Entry::Vacant(v) => { - uninit_mark.remove_uninit_mark().with_context(|| { - format!( - "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" - ) - })?; - v.insert(Arc::clone(&new_timeline)); - - new_timeline.maybe_spawn_flush_loop(); - } - } - - Ok(new_timeline) - } - - /// Prepares timeline data by loading it from the basebackup archive. - pub async fn import_basebackup_from_tar( - self, - copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), - base_lsn: Lsn, - broker_client: storage_broker::BrokerClientChannel, - ctx: &RequestContext, - ) -> anyhow::Result> { - let raw_timeline = self.raw_timeline()?; - - import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) - .await - .context("Failed to import basebackup")?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - raw_timeline.maybe_spawn_flush_loop(); - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); - }); - - raw_timeline - .freeze_and_flush() - .await - .context("Failed to flush after basebackup import")?; - - // All the data has been imported. Insert the Timeline into the tenant's timelines - // map and remove the uninit mark file. - let tl = self.finish_creation()?; - tl.activate(broker_client, None, ctx); - Ok(tl) - } - - fn raw_timeline(&self) -> anyhow::Result<&Arc> { - Ok(&self - .raw_timeline - .as_ref() - .with_context(|| { - format!( - "No raw timeline {}/{} found", - self.owning_tenant.tenant_id, self.timeline_id - ) - })? 
- .0) - } -} - -impl Drop for UninitializedTimeline<'_> { - fn drop(&mut self) { - if let Some((_, uninit_mark)) = self.raw_timeline.take() { - let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered(); - error!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(uninit_mark); - } - } -} - -fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { - let timeline_path = &uninit_mark.timeline_path; - match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { - Ok(()) => { - info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") - } - Err(e) => { - error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") - } - } - drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists -} - -impl TimelineUninitMark { - fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { - Self { - uninit_mark_deleted: false, - uninit_mark_path, - timeline_path, - } - } - - fn remove_uninit_mark(mut self) -> anyhow::Result<()> { - if !self.uninit_mark_deleted { - self.delete_mark_file_if_present()?; - } - - Ok(()) - } - - fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { - let uninit_mark_file = &self.uninit_mark_path; - let uninit_mark_parent = uninit_mark_file - .parent() - .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { - format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") - })?; - crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; - self.uninit_mark_deleted = true; - - Ok(()) - } -} - -impl Drop for TimelineUninitMark { - fn drop(&mut self) { - if !self.uninit_mark_deleted { - if self.timeline_path.exists() { - error!( - "Uninit mark {} is not removed, timeline {} stays uninitialized", - self.uninit_mark_path.display(), - self.timeline_path.display() - ) - } else { - // unblock later timeline creation attempts - warn!( - "Removing intermediate uninit mark file {}", - self.uninit_mark_path.display() - ); - if let Err(e) = self.delete_mark_file_if_present() { - error!("Failed to remove the uninit mark file: {e}") - } - } - } - } -} - // We should not blindly overwrite local metadata with remote one. // For example, consider the following case: // Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that @@ -600,7 +429,7 @@ impl Tenant { .layers .read() .await - .0 + .layer_map() .iter_historic_layers() .next() .is_some(), @@ -611,8 +440,8 @@ impl Tenant { if !picked_local { save_metadata( self.conf, - timeline_id, - tenant_id, + &tenant_id, + &timeline_id, up_to_date_metadata, first_save, ) @@ -641,7 +470,7 @@ impl Tenant { ) -> anyhow::Result> { // TODO dedup with spawn_load let tenant_conf = - Self::load_tenant_config(conf, tenant_id).context("load tenant config")?; + Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?; let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( @@ -695,7 +524,7 @@ impl Tenant { /// No background tasks are started as part of this routine. 
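The block removed above is moved into the new `timeline::uninit` module (see the `use self::timeline::uninit::...` imports earlier in this file), not deleted. For readers skimming the diff, a simplified, hedged sketch of the marker-file pattern it implements; the names and suffix are illustrative and the fsyncs of the real code are omitted:

```rust
use std::fs;
use std::path::{Path, PathBuf};

/// A sibling marker file that survives a crash makes half-created directories
/// detectable (and removable) on the next start.
struct UninitMark {
    mark_path: PathBuf,
    dir_path: PathBuf,
    removed: bool,
}

impl UninitMark {
    fn create(dir_path: &Path) -> std::io::Result<Self> {
        let mark_path = dir_path.with_extension("___uninit"); // illustrative suffix
        fs::write(&mark_path, b"")?;
        fs::create_dir_all(dir_path)?;
        Ok(UninitMark { mark_path, dir_path: dir_path.to_owned(), removed: false })
    }

    /// Call only once the directory contents are complete.
    fn finish(mut self) -> std::io::Result<()> {
        self.removed = true;
        fs::remove_file(&self.mark_path)
    }
}

impl Drop for UninitMark {
    fn drop(&mut self) {
        if !self.removed {
            // Initialization did not finish: remove the directory first, then the
            // mark, mirroring the ordering requirement in the original comment.
            let _ = fs::remove_dir_all(&self.dir_path);
            let _ = fs::remove_file(&self.mark_path);
        }
    }
}

fn main() -> std::io::Result<()> {
    let dir = std::env::temp_dir().join("example-timeline");
    let mark = UninitMark::create(&dir)?;
    // ... fill `dir` with timeline files here ...
    mark.finish()
}
```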
/// async fn attach(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id); if !tokio::fs::try_exists(&marker_file) @@ -750,7 +579,7 @@ impl Tenant { .map(move |res| { res.with_context(|| format!("download index part for timeline {timeline_id}")) }) - .instrument(info_span!("download_index_part", timeline=%timeline_id)), + .instrument(info_span!("download_index_part", %timeline_id)), ); } // Wait for all the download tasks to complete & collect results. @@ -833,10 +662,10 @@ impl Tenant { remote_client: RemoteTimelineClient, ctx: &RequestContext, ) -> anyhow::Result<()> { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); info!("downloading index file for timeline {}", timeline_id); - tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) + tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_id, &timeline_id)) .await .context("Failed to create new timeline directory")?; @@ -912,9 +741,9 @@ impl Tenant { init_order: Option, ctx: &RequestContext, ) -> Arc { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); - let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { + let tenant_conf = match Self::load_tenant_config(conf, &tenant_id) { Ok(conf) => conf, Err(e) => { error!("load tenant config failed: {:?}", e); @@ -1025,7 +854,7 @@ impl Tenant { timeline_uninit_mark_file.display() ) })?; - let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id); + let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id); if let Err(e) = remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) { @@ -1070,7 +899,7 @@ impl Tenant { if let Ok(timeline_id) = file_name.to_str().unwrap_or_default().parse::() { - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) + let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id) .context("failed to load metadata")?; timelines_to_load.insert(timeline_id, metadata); } else { @@ -1098,7 +927,7 @@ impl Tenant { init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); debug!("loading tenant task"); @@ -1144,7 +973,7 @@ impl Tenant { init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); let remote_client = self.remote_storage.as_ref().map(|remote_storage| { RemoteTimelineClient::new( @@ -1539,7 +1368,7 @@ impl Tenant { for (timeline_id, timeline) in &timelines_to_compact { timeline .compact(ctx) - .instrument(info_span!("compact_timeline", timeline = %timeline_id)) + .instrument(info_span!("compact_timeline", %timeline_id)) .await?; } @@ -1630,12 +1459,12 @@ impl Tenant { let layer_removal_guard = timeline.layer_removal_cs.lock().await; info!("got layer_removal_cs.lock(), deleting layer files"); - // NB: storage_sync upload tasks that reference these layers have been cancelled + // NB: remote_timeline_client upload tasks that reference these layers have been cancelled // by the caller. 
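Editor's note: several `info_span!` call sites above switch from an explicitly named field (`timeline = %timeline_id`) to the `%timeline_id` shorthand, which records the field under the variable's own name via its `Display` impl. Besides being shorter, it makes the field names uniformly `tenant_id`/`timeline_id`, matching what the debug-build span assertions (e.g. the `tenant_id` extractor in the new `span.rs`) check for. A tiny illustration; the span name and value are arbitrary:

```rust
use tracing::info_span;

fn main() {
    let timeline_id = 42u64;

    // Old style: the field is explicitly named `timeline`.
    let _old = info_span!("download_index_part", timeline = %timeline_id).entered();

    // New style: `%timeline_id` is shorthand for `timeline_id = %timeline_id`,
    // so the field is recorded as `timeline_id` using its Display impl.
    let _new = info_span!("download_index_part", %timeline_id).entered();
}
```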
let local_timeline_directory = self .conf - .timeline_path(&timeline.timeline_id, &self.tenant_id); + .timeline_path(&self.tenant_id, &timeline.timeline_id); fail::fail_point!("timeline-delete-before-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? @@ -1688,20 +1517,7 @@ impl Tenant { remote_client.delete_all().await.context("delete_all")? }; - // Have a failpoint that can use the `pause` failpoint action. - // We don't want to block the executor thread, hence, spawn_blocking + await. - if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); - move || { - let _entered = current.entered(); - tracing::info!("at failpoint in_progress_delete"); - fail::fail_point!("in_progress_delete"); - } - }) - .await - .expect("spawn_blocking"); - } + pausable_failpoint!("in_progress_delete"); { // Remove the timeline from the map. @@ -1735,7 +1551,7 @@ impl Tenant { timeline_id: TimelineId, _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { - timeline::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id(); // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. @@ -1899,7 +1715,7 @@ impl Tenant { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); let mut activating = false; self.state.send_modify(|current_state| { @@ -1970,7 +1786,7 @@ impl Tenant { /// /// This will attempt to shutdown even if tenant is broken. pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> { - debug_assert_current_span_has_tenant_id(); + span::debug_assert_current_span_has_tenant_id(); // Set tenant (and its timlines) to Stoppping state. 
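Editor's note: here and again in `remote_timeline_client` further down, an inline `spawn_blocking` dance is replaced by `pausable_failpoint!`. The macro's definition is not part of this diff; the sketch below is a plausible expansion reconstructed from the deleted boilerplate (the real macro may differ). Like the original code, it has to run inside an async function so a `pause` failpoint action blocks a worker thread from the blocking pool rather than stalling the async executor.

```rust
/// Hypothetical expansion of `pausable_failpoint!`, reconstructed from the
/// boilerplate this diff deletes; the real definition lives elsewhere in the
/// crate. Invoke only inside an async fn.
macro_rules! pausable_failpoint {
    ($name:literal) => {
        if cfg!(feature = "testing") {
            let current = tracing::Span::current();
            tokio::task::spawn_blocking(move || {
                // Keep the caller's span so failpoint logs stay attributed.
                let _entered = current.entered();
                tracing::info!("at failpoint {}", $name);
                fail::fail_point!($name);
            })
            .await
            .expect("spawn_blocking");
        }
    };
}
```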
// // Since we can only transition into Stopping state after activation is complete, @@ -2416,7 +2232,7 @@ impl Tenant { /// Locate and load config pub(super) fn load_tenant_config( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_id: &TenantId, ) -> anyhow::Result { let target_config_path = conf.tenant_config_path(tenant_id); let target_config_display = target_config_path.display(); @@ -3003,7 +2819,7 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); if let Err(e) = - self.create_timeline_files(&uninit_mark.timeline_path, new_timeline_id, new_metadata) + self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) { error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); cleanup_timeline_directory(uninit_mark); @@ -3012,17 +2828,17 @@ impl Tenant { debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"); - Ok(UninitializedTimeline { - owning_tenant: self, - timeline_id: new_timeline_id, - raw_timeline: Some((timeline_struct, uninit_mark)), - }) + Ok(UninitializedTimeline::new( + self, + new_timeline_id, + Some((timeline_struct, uninit_mark)), + )) } fn create_timeline_files( &self, timeline_path: &Path, - new_timeline_id: TimelineId, + new_timeline_id: &TimelineId, new_metadata: &TimelineMetadata, ) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; @@ -3033,8 +2849,8 @@ impl Tenant { save_metadata( self.conf, + &self.tenant_id, new_timeline_id, - self.tenant_id, new_metadata, true, ) @@ -3057,7 +2873,7 @@ impl Tenant { timelines.get(&timeline_id).is_none(), "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory" ); - let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id); + let timeline_path = self.conf.timeline_path(&tenant_id, &timeline_id); anyhow::ensure!( !timeline_path.exists(), "Timeline {} already exists, cannot create its uninit mark file", @@ -3188,10 +3004,10 @@ pub(crate) enum CreateTenantFilesMode { pub(crate) fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: TenantId, + tenant_id: &TenantId, mode: CreateTenantFilesMode, ) -> anyhow::Result { - let target_tenant_directory = conf.tenant_path(&tenant_id); + let target_tenant_directory = conf.tenant_path(tenant_id); anyhow::ensure!( !target_tenant_directory .try_exists() @@ -3242,7 +3058,7 @@ pub(crate) fn create_tenant_files( fn try_create_target_tenant_dir( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, - tenant_id: TenantId, + tenant_id: &TenantId, mode: CreateTenantFilesMode, temporary_tenant_dir: &Path, target_tenant_directory: &Path, @@ -3266,7 +3082,7 @@ fn try_create_target_tenant_dir( } let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(&tenant_id), + &conf.timelines_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) @@ -3278,7 +3094,7 @@ fn try_create_target_tenant_dir( ) .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; - Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; + Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( @@ -3566,7 +3382,7 @@ pub mod harness { } pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf { - self.conf.timeline_path(timeline_id, &self.tenant_id) + 
self.conf.timeline_path(&self.tenant_id, timeline_id) } } @@ -4519,13 +4335,13 @@ mod tests { // assert freeze_and_flush exercised the initdb optimization { let state = tline.flush_loop_state.lock().unwrap(); - let - timeline::FlushLoopState::Running { - expect_initdb_optimization, - initdb_optimization_count, - } = *state else { - panic!("unexpected state: {:?}", *state); - }; + let timeline::FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + } = *state + else { + panic!("unexpected state: {:?}", *state); + }; assert!(expect_initdb_optimization); assert!(initdb_optimization_count > 0); } @@ -4560,7 +4376,7 @@ mod tests { assert!(!harness .conf - .timeline_path(&TIMELINE_ID, &tenant.tenant_id) + .timeline_path(&tenant.tenant_id, &TIMELINE_ID) .exists()); assert!(!harness @@ -4571,28 +4387,3 @@ mod tests { Ok(()) } } - -#[cfg(not(debug_assertions))] -#[inline] -pub(crate) fn debug_assert_current_span_has_tenant_id() {} - -#[cfg(debug_assertions)] -pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy< - utils::tracing_span_assert::MultiNameExtractor<2>, -> = once_cell::sync::Lazy::new(|| { - utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]) -}); - -#[cfg(debug_assertions)] -#[inline] -pub(crate) fn debug_assert_current_span_has_tenant_id() { - use utils::tracing_span_assert; - - match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) { - Ok(()) => (), - Err(missing) => panic!( - "missing extractors: {:?}", - missing.into_iter().map(|e| e.name()).collect::>() - ), - } -} diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 88dff32b76..734409a619 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -442,7 +442,7 @@ where writer: W, /// - /// stack[0] is the current root page, stack.last() is the leaf. + /// `stack[0]` is the current root page, `stack.last()` is the leaf. /// /// We maintain the length of the stack to be always greater than zero. /// Two exceptions are: diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 4379438896..0d3c5da91c 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -55,7 +55,7 @@ impl EphemeralFile { l.next_file_id += 1; let filename = conf - .timeline_path(&timeline_id, &tenant_id) + .timeline_path(&tenant_id, &timeline_id) .join(PathBuf::from(format!("ephemeral-{}", file_id))); let file = VirtualFile::open_with_options( @@ -346,7 +346,7 @@ mod tests { let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?; Ok((conf, tenant_id, timeline_id)) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index dee02ac433..2908d3a83c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -16,7 +16,7 @@ //! Other read methods are less critical but still impact performance of background tasks. //! //! This data structure relies on a persistent/immutable binary search tree. See the -//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +//! following lecture for an introduction //! 
Summary: A persistent/immutable BST (and persistent data structures in general) allows //! you to modify the tree in such a way that each modification creates a new "version" //! of the tree. When you modify it, you get a new version, but all previous versions are @@ -40,7 +40,7 @@ //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need //! to throw away most of the persistent BST and build a new one, starting from the oldest -//! LSN. See `LayerMap::flush_updates()`. +//! LSN. See [`LayerMap::flush_updates()`]. //! mod historic_layer_coverage; @@ -60,7 +60,6 @@ use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; -use super::storage_layer::range_eq; use super::storage_layer::PersistentLayerDesc; /// @@ -365,7 +364,7 @@ impl LayerMap { } pub fn is_l0(layer: &PersistentLayerDesc) -> bool { - range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) + layer.get_key_range() == (Key::MIN..Key::MAX) } /// This function determines which layers are counted in `count_deltas`: @@ -397,7 +396,7 @@ impl LayerMap { } // Case 2 - if range_eq(partition_range, &(Key::MIN..Key::MAX)) { + if partition_range == &(Key::MIN..Key::MAX) { return true; } @@ -652,19 +651,35 @@ impl LayerMap { #[cfg(test)] mod tests { use super::LayerMap; - use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName}; + use crate::tenant::storage_layer::LayerFileName; use std::str::FromStr; use std::sync::Arc; mod l0_delta_layers_updated { use crate::tenant::{ - storage_layer::{PersistentLayer, PersistentLayerDesc}, - timeline::LayerFileManager, + storage_layer::{AsLayerDesc, PersistentLayerDesc}, + timeline::layer_manager::LayerFileManager, }; use super::*; + struct LayerObject(PersistentLayerDesc); + + impl AsLayerDesc for LayerObject { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.0 + } + } + + impl LayerObject { + fn new(desc: PersistentLayerDesc) -> Self { + LayerObject(desc) + } + } + + type TestLayerFileManager = LayerFileManager; + #[test] fn for_full_range_delta() { // l0_delta_layers are used by compaction, and should observe all buffered updates @@ -701,18 +716,18 @@ mod tests { let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69"; let layer = LayerFileName::from_str(layer).unwrap(); - let layer = LayerDescriptor::from(layer); + let layer = PersistentLayerDesc::from(layer); // same skeletan construction; see scenario below - let not_found = Arc::new(layer.clone()); - let new_version = Arc::new(layer); + let not_found = Arc::new(LayerObject::new(layer.clone())); + let new_version = Arc::new(LayerObject::new(layer)); // after the immutable storage state refactor, the replace operation // will not use layer map any more. We keep it here for consistency in test cases // and can remove it in the future. 
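Editor's note: the `layer_map` module comment above describes why a persistent/immutable tree is used: each modification yields a new version while older versions stay readable and cheap to retain. The toy below illustrates that property with a hand-rolled `Arc`-shared list; it is purely conceptual and is not the `rpds` red-black tree the layer map actually uses.

```rust
use std::sync::Arc;

/// A toy persistent list: "pushing" returns a new version that shares its
/// tail with the old one via `Arc`, so every previous version stays valid.
enum List {
    Nil,
    Cons(i64, Arc<List>),
}

fn push(list: &Arc<List>, value: i64) -> Arc<List> {
    Arc::new(List::Cons(value, Arc::clone(list)))
}

fn to_vec(mut list: &List) -> Vec<i64> {
    let mut out = Vec::new();
    while let List::Cons(value, rest) = list {
        out.push(*value);
        list = rest.as_ref();
    }
    out
}

fn main() {
    let v1 = push(&Arc::new(List::Nil), 10);
    let v2 = push(&v1, 20); // a new "version" of the structure...
    assert_eq!(to_vec(&v1), vec![10]); // ...while the old version is untouched
    assert_eq!(to_vec(&v2), vec![20, 10]);
}
```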
let _map = LayerMap::default(); - let mut mapping = LayerFileManager::new(); + let mut mapping = TestLayerFileManager::new(); mapping .replace_and_verify(not_found, new_version) @@ -721,10 +736,10 @@ mod tests { fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) { let name = LayerFileName::from_str(layer_name).unwrap(); - let skeleton = LayerDescriptor::from(name); + let skeleton = PersistentLayerDesc::from(name); - let remote = Arc::new(skeleton.clone()); - let downloaded = Arc::new(skeleton); + let remote = Arc::new(LayerObject::new(skeleton.clone())); + let downloaded = Arc::new(LayerObject::new(skeleton)); let mut map = LayerMap::default(); let mut mapping = LayerFileManager::new(); diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 0f51597027..347490c1ba 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -122,8 +122,7 @@ impl HistoricLayerCoverage { self.head = self .historic .iter() - .rev() - .next() + .next_back() .map(|(_, v)| v.clone()) .unwrap_or_default(); } @@ -412,7 +411,7 @@ fn test_persistent_overlapping() { /// still be more critical. /// /// See this for more on persistent and retroactive techniques: -/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +/// pub struct BufferedHistoricLayerCoverage { /// A persistent layer map that we rebuild when we need to retroactively update historic_coverage: HistoricLayerCoverage, diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 47aace97a5..1d9101d3d1 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -2,7 +2,7 @@ use std::ops::Range; // NOTE the `im` crate has 20x more downloads and also has // persistent/immutable BTree. But it's bugged so rpds is a -// better choice https://github.com/neondatabase/neon/issues/3395 +// better choice use rpds::RedBlackTreeMapSync; /// Data structure that can efficiently: @@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync; /// - insert layers in non-decreasing lsn.start order /// /// For a detailed explanation and justification of this approach, see: -/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing +/// /// /// NOTE The struct is parameterized over Value for easier /// testing, but in practice it's some sort of layer. @@ -113,8 +113,7 @@ impl LayerCoverage { pub fn query(&self, key: i128) -> Option { self.nodes .range(..=key) - .rev() - .next()? + .next_back()? .1 .as_ref() .map(|(_, v)| v.clone()) diff --git a/pageserver/src/tenant/manifest.rs b/pageserver/src/tenant/manifest.rs index 745437dfbd..1d2835114f 100644 --- a/pageserver/src/tenant/manifest.rs +++ b/pageserver/src/tenant/manifest.rs @@ -24,7 +24,7 @@ //! Currently, this is not used in the system. Future refactors will ensure //! the storage state will be recorded in this file, and the system can be //! recovered from this file. This is tracked in -//! https://github.com/neondatabase/neon/issues/4418 +//! use std::io::{self, Read, Write}; diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1ea61fa26b..d52bb66e76 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,10 +1,12 @@ //! Every image of a certain timeline from [`crate::tenant::Tenant`] //! has a metadata that needs to be stored persistently. //! 
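Editor's note: `LayerCoverage::query` above answers "latest value at or below this key" by taking the last element of `range(..=key)`, and the diff replaces `.rev().next()` with the equivalent but more direct `.next_back()`. The same query shape on a plain `BTreeMap` (the real code runs it over `rpds`'s persistent map, but the idiom is identical):

```rust
use std::collections::BTreeMap;

/// Latest value whose key is <= `key`, if any.
fn query(map: &BTreeMap<i128, &'static str>, key: i128) -> Option<&'static str> {
    // `range(..=key)` iterates entries with keys <= key in ascending order;
    // `next_back()` takes the greatest one without walking the rest.
    map.range(..=key).next_back().map(|(_, v)| *v)
}

fn main() {
    let mut coverage = BTreeMap::new();
    coverage.insert(0, "layer-A");
    coverage.insert(100, "layer-B");
    assert_eq!(query(&coverage, 50), Some("layer-A"));
    assert_eq!(query(&coverage, 100), Some("layer-B"));
    assert_eq!(query(&coverage, -1), None);
}
```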
-//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of +//! Later, the file gets used in [`remote_timeline_client`] as a part of //! external storage import and export operations. //! //! The module contains all structs and related helper methods related to timeline metadata. +//! +//! [`remote_timeline_client`]: super::remote_timeline_client use std::fs::{File, OpenOptions}; use std::io::Write; @@ -232,13 +234,13 @@ impl TimelineMetadata { /// Save timeline metadata to file pub fn save_metadata( conf: &'static PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, + tenant_id: &TenantId, + timeline_id: &TimelineId, data: &TimelineMetadata, first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = conf.metadata_path(timeline_id, tenant_id); + let path = conf.metadata_path(tenant_id, timeline_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, @@ -267,10 +269,10 @@ pub fn save_metadata( pub fn load_metadata( conf: &'static PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, + tenant_id: &TenantId, + timeline_id: &TimelineId, ) -> anyhow::Result { - let metadata_path = conf.metadata_path(timeline_id, tenant_id); + let metadata_path = conf.metadata_path(tenant_id, timeline_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { format!( "Failed to read metadata bytes from path {}", diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 09b825d2e9..2cc881ed5e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -184,9 +184,9 @@ pub fn schedule_local_tenant_processing( format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}") })?; - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); anyhow::ensure!( - !conf.tenant_ignore_mark_file_path(tenant_id).exists(), + !conf.tenant_ignore_mark_file_path(&tenant_id).exists(), "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); @@ -310,7 +310,7 @@ pub async fn create_tenant( // We're holding the tenants lock in write mode while doing local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating` // and do the work in that state. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Create)?; + let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?; // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 @@ -344,14 +344,9 @@ pub async fn set_new_tenant_config( info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; - let tenant_config_path = conf.tenant_config_path(tenant_id); - Tenant::persist_tenant_config( - &tenant.tenant_id(), - &tenant_config_path, - new_tenant_conf, - false, - ) - .map_err(SetNewTenantConfigError::Persist)?; + let tenant_config_path = conf.tenant_config_path(&tenant_id); + Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false) + .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); Ok(()) } @@ -435,7 +430,7 @@ pub async fn detach_tenant( // Ignored tenants are not present in memory and will bail the removal from memory operation. 
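Editor's note: a large fraction of this diff flips helper signatures from `(timeline_id, tenant_id)` to one canonical `(&TenantId, &TimelineId)` order. With two same-shaped ID parameters, a transposed call site compiles silently unless the IDs are distinct newtypes (which the codebase already has); a single consistent order keeps call sites greppable and reviewable. The stand-in sketch below shows the failure mode this guards against; the types and path layout here are illustrative only.

```rust
use std::path::PathBuf;

// Stand-ins for the real ID newtypes: distinct types mean a transposed call
// site fails to compile instead of producing a path under the wrong tenant.
#[derive(Debug, Clone, Copy)]
struct TenantId(u128);
#[derive(Debug, Clone, Copy)]
struct TimelineId(u128);

fn timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
    PathBuf::from(format!(
        "tenants/{:032x}/timelines/{:032x}",
        tenant_id.0, timeline_id.0
    ))
}

fn main() {
    let tenant = TenantId(0x11);
    let timeline = TimelineId(0x22);
    println!("{}", timeline_path(&tenant, &timeline).display());
    // timeline_path(&timeline, &tenant); // rejected by the compiler: mismatched types
}
```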
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); if tenant_ignore_mark.exists() { info!("Detaching an ignored tenant"); local_files_cleanup_operation(tenant_id) @@ -457,7 +452,7 @@ pub async fn load_tenant( ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || { let tenant_path = conf.tenant_path(&tenant_id); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); if tenant_ignore_mark.exists() { std::fs::remove_file(&tenant_ignore_mark) .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; @@ -478,7 +473,7 @@ pub async fn ignore_tenant( tenant_id: TenantId, ) -> Result<(), TenantStateError> { remove_tenant_from_memory(tenant_id, async { - let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id); + let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id); fs::File::create(&ignore_mark_file) .await .context("Failed to create ignore mark file") @@ -525,7 +520,7 @@ pub async fn attach_tenant( ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || { - let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?; + let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?; // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 @@ -695,7 +690,7 @@ pub async fn immediate_gc( fail::fail_point!("immediate_gc_task_pre"); let result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx) - .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) + .instrument(info_span!("manual_gc", %tenant_id, %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. @@ -745,9 +740,7 @@ pub async fn immediate_compact( async move { let result = timeline .compact(&ctx) - .instrument( - info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), - ) + .instrument(info_span!("manual_compact", %tenant_id, %timeline_id)) .await; match task_done.send(result) { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 190512f48f..1355356712 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -135,7 +135,7 @@ //! - Initiate upload queue with that [`IndexPart`]. //! - Reschedule all lost operations by comparing the local filesystem state //! and remote state as per [`IndexPart`]. This is done in -//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`]. +//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`]. //! //! Note that if we crash during file deletion between the index update //! that removes the file from the list of files, and deleting the remote file, @@ -163,8 +163,8 @@ //! - download their remote [`IndexPart`]s //! - create `Timeline` struct and a `RemoteTimelineClient` //! - initialize the client's upload queue with its `IndexPart` -//! 
- create [`RemoteLayer`] instances for layers that are referenced by `IndexPart` -//! but not present locally +//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances +//! for layers that are referenced by `IndexPart` but not present locally //! - schedule uploads for layers that are only present locally. //! - if the remote `IndexPart`'s metadata was newer than the metadata in //! the local filesystem, write the remote metadata to the local filesystem @@ -198,6 +198,8 @@ //! in remote storage. //! But note that we don't test any of this right now. //! +//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync +//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote mod delete; mod download; @@ -442,8 +444,8 @@ impl RemoteTimelineClient { let index_part = download::download_index_part( self.conf, &self.storage_impl, - self.tenant_id, - self.timeline_id, + &self.tenant_id, + &self.timeline_id, ) .measure_remote_op( self.tenant_id, @@ -608,10 +610,7 @@ impl RemoteTimelineClient { self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); - info!( - "scheduled layer file upload {}", - layer_file_name.file_name() - ); + info!("scheduled layer file upload {layer_file_name}"); // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); @@ -664,7 +663,7 @@ impl RemoteTimelineClient { }); self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); - info!("scheduled layer file deletion {}", name.file_name()); + info!("scheduled layer file deletion {name}"); } // Launch the tasks immediately, if possible @@ -751,25 +750,13 @@ impl RemoteTimelineClient { stopped.deleted_at = SetDeletedFlagProgress::NotRunning; }); - // Have a failpoint that can use the `pause` failpoint action. - // We don't want to block the executor thread, hence, spawn_blocking + await. - if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); - move || { - let _entered = current.entered(); - tracing::info!("at failpoint persist_deleted_index_part"); - fail::fail_point!("persist_deleted_index_part"); - } - }) - .await - .expect("spawn_blocking"); - } + pausable_failpoint!("persist_deleted_index_part"); + upload::upload_index_part( self.conf, &self.storage_impl, - self.tenant_id, - self.timeline_id, + &self.tenant_id, + &self.timeline_id, &index_part_with_deleted_at, ) .await?; @@ -828,7 +815,7 @@ impl RemoteTimelineClient { .queued_operations .push_back(op); - info!("scheduled layer file deletion {}", name.file_name()); + info!("scheduled layer file deletion {name}"); deletions_queued += 1; } @@ -844,7 +831,7 @@ impl RemoteTimelineClient { // Do not delete index part yet, it is needed for possible retry. 
If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); let timeline_storage_path = self.conf.remote_path(&timeline_path)?; let remaining = self @@ -855,14 +842,16 @@ impl RemoteTimelineClient { let remaining: Vec = remaining .into_iter() .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME)) + .inspect(|path| { + if let Some(name) = path.object_name() { + info!(%name, "deleting a file not referenced from index_part.json"); + } else { + warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json"); + } + }) .collect(); if !remaining.is_empty() { - warn!( - "Found {} files not bound to index_file.json, proceeding with their deletion", - remaining.len() - ); - warn!("About to remove {} files", remaining.len()); self.storage_impl.delete_objects(&remaining).await?; } @@ -871,7 +860,7 @@ impl RemoteTimelineClient { debug!("deleting index part"); self.storage_impl.delete(&index_file_path).await?; - info!(deletions_queued, "done deleting, including index_part.json"); + info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); Ok(()) } @@ -936,11 +925,11 @@ impl RemoteTimelineClient { // Assign unique ID to this task upload_queue.task_counter += 1; - let task_id = upload_queue.task_counter; + let upload_task_id = upload_queue.task_counter; // Add it to the in-progress map let task = Arc::new(UploadTask { - task_id, + task_id: upload_task_id, op: next_op, retries: AtomicU32::new(0), }); @@ -950,6 +939,8 @@ impl RemoteTimelineClient { // Spawn task to perform the task let self_rc = Arc::clone(self); + let tenant_id = self.tenant_id; + let timeline_id = self.timeline_id; task_mgr::spawn( self.runtime.handle(), TaskKind::RemoteUploadTask, @@ -961,7 +952,7 @@ impl RemoteTimelineClient { self_rc.perform_upload_task(task).await; Ok(()) } - .instrument(info_span!(parent: None, "remote_upload", tenant = %self.tenant_id, timeline = %self.timeline_id, upload_task_id = %task_id)), + .instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)), ); // Loop back to process next task @@ -1006,7 +997,7 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => { let path = &self .conf - .timeline_path(&self.timeline_id, &self.tenant_id) + .timeline_path(&self.tenant_id, &self.timeline_id) .join(layer_file_name.file_name()); upload::upload_timeline_layer( self.conf, @@ -1027,8 +1018,8 @@ impl RemoteTimelineClient { let res = upload::upload_index_part( self.conf, &self.storage_impl, - self.tenant_id, - self.timeline_id, + &self.tenant_id, + &self.timeline_id, index_part, ) .measure_remote_op( @@ -1047,7 +1038,7 @@ impl RemoteTimelineClient { UploadOp::Delete(delete) => { let path = &self .conf - .timeline_path(&self.timeline_id, &self.tenant_id) + .timeline_path(&self.tenant_id, &self.timeline_id) .join(delete.layer_file_name.file_name()); delete::delete_layer(self.conf, &self.storage_impl, path) .measure_remote_op( diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs index 9f6732fbff..3f505d45ab 100644 --- a/pageserver/src/tenant/remote_timeline_client/delete.rs +++ 
b/pageserver/src/tenant/remote_timeline_client/delete.rs @@ -19,9 +19,10 @@ pub(super) async fn delete_layer<'a>( let path_to_delete = conf.remote_path(local_layer_path)?; - // XXX: If the deletion fails because the object already didn't exist, - // it would be good to just issue a warning but consider it success. - // https://github.com/neondatabase/neon/issues/2934 + // We don't want to print an error if the delete failed if the file has + // already been deleted. Thankfully, in this situation S3 already + // does not yield an error. While OS-provided local file system APIs do yield + // errors, we avoid them in the `LocalFs` wrapper. storage.delete(&path_to_delete).await.with_context(|| { format!("Failed to delete remote layer from storage at {path_to_delete:?}") }) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index a0d8c0193a..64f4a0a113 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -16,7 +16,7 @@ use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; @@ -46,7 +46,7 @@ pub async fn download_layer_file<'a>( ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + let timeline_path = conf.timeline_path(&tenant_id, &timeline_id); let local_path = timeline_path.join(layer_file_name.file_name()); @@ -229,11 +229,11 @@ pub async fn list_remote_timelines<'a>( pub(super) async fn download_index_part( conf: &'static PageServerConf, storage: &GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, + tenant_id: &TenantId, + timeline_id: &TimelineId, ) -> Result { let index_part_path = conf - .metadata_path(timeline_id, tenant_id) + .metadata_path(tenant_id, timeline_id) .with_file_name(IndexPart::FILE_NAME); let part_storage_path = conf .remote_path(&index_part_path) diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index b520bb4b0c..a805e9bd60 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -2,7 +2,7 @@ use anyhow::{bail, Context}; use fail::fail_point; -use std::path::Path; +use std::{io::ErrorKind, path::Path}; use tokio::fs; use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart}; @@ -11,12 +11,14 @@ use utils::id::{TenantId, TimelineId}; use super::index::LayerFileMetadata; +use tracing::info; + /// Serializes and uploads the given index part data to the remote storage. 
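Editor's note: the rewritten comment above leans on S3's delete being idempotent (deleting a missing key is not an error) and states that the `LocalFs` backend hides the difference, since a local `remove_file` does report a missing file. A sketch of that smoothing, assuming nothing about the real `LocalFs` implementation beyond what the comment says:

```rust
use std::io;
use std::path::Path;

/// Delete a file, treating "already gone" as success so local behaviour
/// matches S3's idempotent delete semantics.
fn delete_idempotent(path: &Path) -> io::Result<()> {
    match std::fs::remove_file(path) {
        Ok(()) => Ok(()),
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        Err(e) => Err(e),
    }
}

fn main() {
    // Deleting the same (nonexistent) path twice is fine.
    let p = Path::new("does-not-exist.layer");
    assert!(delete_idempotent(p).is_ok());
    assert!(delete_idempotent(p).is_ok());
}
```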
pub(super) async fn upload_index_part<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, + tenant_id: &TenantId, + timeline_id: &TimelineId, index_part: &'a IndexPart, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -31,7 +33,7 @@ pub(super) async fn upload_index_part<'a>( let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); let index_part_path = conf - .metadata_path(timeline_id, tenant_id) + .metadata_path(tenant_id, timeline_id) .with_file_name(IndexPart::FILE_NAME); let storage_path = conf.remote_path(&index_part_path)?; @@ -56,9 +58,21 @@ pub(super) async fn upload_timeline_layer<'a>( }); let storage_path = conf.remote_path(source_path)?; - let source_file = fs::File::open(&source_path) - .await - .with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?; + let source_file_res = fs::File::open(&source_path).await; + let source_file = match source_file_res { + Ok(source_file) => source_file, + Err(e) if e.kind() == ErrorKind::NotFound => { + // If we encounter this arm, it wasn't intended, but it's also not + // a big problem, if it's because the file was deleted before an + // upload. However, a nonexistent file can also be indicative of + // something worse, like when a file is scheduled for upload before + // it has been written to disk yet. + info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); + return Ok(()); + } + Err(e) => Err(e) + .with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?, + }; let fs_size = source_file .metadata() diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ffcbdc1f1d..e737d3f59c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -110,11 +110,11 @@ pub struct TimelineInputs { /// /// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which /// is updated on-demand, during the start of this calculation and separate from the -/// [`Timeline::latest_gc_cutoff`]. +/// [`TimelineInputs::latest_gc_cutoff`]. 
/// /// For timelines in general: /// -/// ```ignore +/// ```text /// 0-----|---------|----|------------| · · · · · |·> lsn /// initdb_lsn branchpoints* next_gc_cutoff latest /// ``` diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs new file mode 100644 index 0000000000..04e92f4096 --- /dev/null +++ b/pageserver/src/tenant/span.rs @@ -0,0 +1,17 @@ +#[cfg(debug_assertions)] +use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; + +#[cfg(not(debug_assertions))] +pub(crate) fn debug_assert_current_span_has_tenant_id() {} + +#[cfg(debug_assertions)] +pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy> = + once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"])); + +#[cfg(debug_assertions)] +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) { + panic!("missing extractors: {missing:?}") + } +} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 7bc513b3a1..c6d1a0052a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -41,7 +41,7 @@ pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use remote_layer::RemoteLayer; -use super::layer_map::BatchedUpdates; +use super::timeline::layer_manager::LayerManager; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -54,13 +54,6 @@ where } } -pub fn range_eq(a: &Range, b: &Range) -> bool -where - T: PartialEq, -{ - a.start == b.start && a.end == b.end -} - /// Struct used to communicate across calls to 'get_value_reconstruct_data'. /// /// Before first call, you can fill in 'page_img' if you have an older cached @@ -169,6 +162,9 @@ impl LayerAccessStats { /// The caller is responsible for recording a residence event /// using [`record_residence_event`] before calling `latest_activity`. /// If they don't, [`latest_activity`] will return `None`. + /// + /// [`record_residence_event`]: Self::record_residence_event + /// [`latest_activity`]: Self::latest_activity pub(crate) fn empty_will_record_residence_event_later() -> Self { LayerAccessStats(Mutex::default()) } @@ -176,8 +172,11 @@ impl LayerAccessStats { /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. /// /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. + /// + /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad + /// [`record_residence_event`]: Self::record_residence_event pub(crate) fn for_loading_layer( - layer_map_lock_held_witness: &BatchedUpdates<'_>, + layer_map_lock_held_witness: &LayerManager, status: LayerResidenceStatus, ) -> Self { let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); @@ -194,9 +193,11 @@ impl LayerAccessStats { /// The `new_status` is not recorded in `self`. /// /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. 
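Editor's note: the doc comments in this file gain reference-style definitions such as `[record_residence_event]: Self::record_residence_event`. A bare `[record_residence_event]` is an intra-doc link that rustdoc cannot resolve for an associated item without a path, so it warns; the reference form keeps the short display text while pointing at the exact item. Illustrated on made-up items:

```rust
pub struct Stats;

impl Stats {
    /// Callers must invoke [`record_event`] first, otherwise
    /// [`latest_activity`] returns `None`.
    ///
    /// Without the two definitions below, rustdoc would report these as
    /// unresolved intra-doc links.
    ///
    /// [`record_event`]: Self::record_event
    /// [`latest_activity`]: Self::latest_activity
    pub fn new() -> Self {
        Stats
    }

    pub fn record_event(&self) {}

    pub fn latest_activity(&self) -> Option<u64> {
        None
    }
}
```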
+ /// + /// [`record_residence_event`]: Self::record_residence_event pub(crate) fn clone_for_residence_change( &self, - layer_map_lock_held_witness: &BatchedUpdates<'_>, + layer_map_lock_held_witness: &LayerManager, new_status: LayerResidenceStatus, ) -> LayerAccessStats { let clone = { @@ -228,7 +229,7 @@ impl LayerAccessStats { /// pub(crate) fn record_residence_event( &self, - _layer_map_lock_held_witness: &BatchedUpdates<'_>, + _layer_map_lock_held_witness: &LayerManager, status: LayerResidenceStatus, reason: LayerResidenceEventReason, ) { @@ -301,11 +302,13 @@ impl LayerAccessStats { /// implementation error. This function logs a rate-limited warning in that case. /// /// TODO: use type system to avoid the need for `fallback`. - /// The approach in https://github.com/neondatabase/neon/pull/3775 + /// The approach in /// could be used to enforce that a residence event is recorded /// before a layer is added to the layer map. We could also have /// a layer wrapper type that holds the LayerAccessStats, and ensure /// that that type can only be produced by inserting into the layer map. + /// + /// [`record_residence_event`]: Self::record_residence_event pub(crate) fn latest_activity(&self) -> Option { let locked = self.0.lock().unwrap(); let inner = &locked.for_eviction_policy; @@ -330,12 +333,12 @@ impl LayerAccessStats { } /// Supertrait of the [`Layer`] trait that captures the bare minimum interface -/// required by [`LayerMap`]. +/// required by [`LayerMap`](super::layer_map::LayerMap). /// /// All layers should implement a minimal `std::fmt::Debug` without tenant or /// timeline names, because those are known in the context of which the layers /// are used in (timeline). -pub trait Layer: std::fmt::Debug + Send + Sync { +pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync { /// Range of keys that this layer covers fn get_key_range(&self) -> Range; @@ -373,19 +376,22 @@ pub trait Layer: std::fmt::Debug + Send + Sync { ctx: &RequestContext, ) -> Result; - /// A short ID string that uniquely identifies the given layer within a [`LayerMap`]. - fn short_id(&self) -> String; - /// Dump summary of the contents of the layer to stdout fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>; } -/// Returned by [`Layer::iter`] +/// Returned by [`PersistentLayer::iter`] pub type LayerIter<'i> = Box> + 'i + Send>; -/// Returned by [`Layer::key_iter`] +/// Returned by [`PersistentLayer::key_iter`] pub type LayerKeyIter<'i> = Box + 'i + Send>; +/// Get a layer descriptor from a layer. +pub trait AsLayerDesc { + /// Get the layer descriptor. + fn layer_desc(&self) -> &PersistentLayerDesc; +} + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -399,10 +405,8 @@ pub type LayerKeyIter<'i> = Box + 'i + Send /// A delta layer contains all modifications within a range of LSNs and keys. /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. -pub trait PersistentLayer: Layer { - /// Get the layer descriptor. 
- fn layer_desc(&self) -> &PersistentLayerDesc; - +pub trait PersistentLayer: Layer + AsLayerDesc { + /// Identify the tenant this layer belongs to fn get_tenant_id(&self) -> TenantId { self.layer_desc().tenant_id } @@ -438,6 +442,10 @@ pub trait PersistentLayer: Layer { None } + fn downcast_delta_layer(self: Arc) -> Option> { + None + } + fn is_remote_layer(&self) -> bool { false } @@ -468,117 +476,32 @@ pub fn downcast_remote_layer( pub mod tests { use super::*; - /// Holds metadata about a layer without any content. Used mostly for testing. - /// - /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a - /// LayerDescriptor. - #[derive(Clone, Debug)] - pub struct LayerDescriptor { - base: PersistentLayerDesc, - } - - impl From for LayerDescriptor { - fn from(base: PersistentLayerDesc) -> Self { - Self { base } - } - } - - impl Layer for LayerDescriptor { - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - _ctx: &RequestContext, - ) -> Result { - todo!("This method shouldn't be part of the Layer trait") - } - - fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { - todo!() - } - - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. - fn get_key_range(&self) -> Range { - self.layer_desc().key_range.clone() - } - - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. - fn get_lsn_range(&self) -> Range { - self.layer_desc().lsn_range.clone() - } - - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. - fn is_incremental(&self) -> bool { - self.layer_desc().is_incremental - } - - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
- fn short_id(&self) -> String { - self.layer_desc().short_id() - } - } - - impl PersistentLayer for LayerDescriptor { - fn layer_desc(&self) -> &PersistentLayerDesc { - &self.base - } - - fn local_path(&self) -> Option { - unimplemented!() - } - - fn iter(&self, _: &RequestContext) -> Result> { - unimplemented!() - } - - fn key_iter(&self, _: &RequestContext) -> Result> { - unimplemented!() - } - - fn delete_resident_layer_file(&self) -> Result<()> { - unimplemented!() - } - - fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo { - unimplemented!() - } - - fn access_stats(&self) -> &LayerAccessStats { - unimplemented!() - } - } - - impl From for LayerDescriptor { + impl From for PersistentLayerDesc { fn from(value: DeltaFileName) -> Self { - LayerDescriptor { - base: PersistentLayerDesc::new_delta( - TenantId::from_array([0; 16]), - TimelineId::from_array([0; 16]), - value.key_range, - value.lsn_range, - 233, - ), - } + PersistentLayerDesc::new_delta( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + value.key_range, + value.lsn_range, + 233, + ) } } - impl From for LayerDescriptor { + impl From for PersistentLayerDesc { fn from(value: ImageFileName) -> Self { - LayerDescriptor { - base: PersistentLayerDesc::new_img( - TenantId::from_array([0; 16]), - TimelineId::from_array([0; 16]), - value.key_range, - value.lsn, - false, - 233, - ), - } + PersistentLayerDesc::new_img( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + value.key_range, + value.lsn, + false, + 233, + ) } } - impl From for LayerDescriptor { + impl From for PersistentLayerDesc { fn from(value: LayerFileName) -> Self { match value { LayerFileName::Delta(d) => Self::from(d), diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6e14663121..83a22f9f13 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -7,14 +7,18 @@ //! must be page images or WAL records with the 'will_init' flag set, so that //! they can be replayed without referring to an older page version. //! -//! The delta files are stored in timelines/ directory. Currently, +//! The delta files are stored in `timelines/` directory. Currently, //! there are no subdirectories, and each delta file is named like this: //! -//! -__--__- +//! ``` //! //! For example: //! +//! ```text //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 +//! ``` //! //! Every delta file consists of three parts: "summary", "index", and //! "values". 
The summary is a fixed size header at the beginning of the file, @@ -47,6 +51,7 @@ use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; +use std::sync::Arc; use tracing::*; use utils::{ @@ -56,8 +61,8 @@ use utils::{ }; use super::{ - DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, - PathOrConf, PersistentLayerDesc, + AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, + LayerKeyIter, PathOrConf, PersistentLayerDesc, }; /// @@ -222,13 +227,14 @@ impl Layer for DeltaLayer { /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( - "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----", self.desc.tenant_id, self.desc.timeline_id, self.desc.key_range.start, self.desc.key_range.end, self.desc.lsn_range.start, - self.desc.lsn_range.end + self.desc.lsn_range.end, + self.desc.file_size, ); if !verbose { @@ -394,16 +400,23 @@ impl Layer for DeltaLayer { fn is_incremental(&self) -> bool { self.layer_desc().is_incremental } +} +/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. +impl std::fmt::Display for DeltaLayer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.layer_desc().short_id()) + } +} - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. - fn short_id(&self) -> String { - self.layer_desc().short_id() +impl AsLayerDesc for DeltaLayer { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } } impl PersistentLayer for DeltaLayer { - fn layer_desc(&self) -> &PersistentLayerDesc { - &self.desc + fn downcast_delta_layer(self: Arc) -> Option> { + Some(self) } fn local_path(&self) -> Option { @@ -457,22 +470,22 @@ impl PersistentLayer for DeltaLayer { impl DeltaLayer { fn path_for( path_or_conf: &PathOrConf, - timeline_id: TimelineId, - tenant_id: TenantId, + tenant_id: &TenantId, + timeline_id: &TimelineId, fname: &DeltaFileName, ) -> PathBuf { match path_or_conf { PathOrConf::Path(path) => path.clone(), PathOrConf::Conf(conf) => conf - .timeline_path(&timeline_id, &tenant_id) + .timeline_path(tenant_id, timeline_id) .join(fname.to_string()), } } fn temp_path_for( conf: &PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, + tenant_id: &TenantId, + timeline_id: &TimelineId, key_start: Key, lsn_range: &Range, ) -> PathBuf { @@ -482,7 +495,7 @@ impl DeltaLayer { .map(char::from) .collect(); - conf.timeline_path(&timeline_id, &tenant_id).join(format!( + conf.timeline_path(tenant_id, timeline_id).join(format!( "{}-XXX__{:016X}-{:016X}.{}.{}", key_start, u64::from(lsn_range.start), @@ -604,8 +617,8 @@ impl DeltaLayer { pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.desc.timeline_id, - self.desc.tenant_id, + &self.desc.tenant_id, + &self.desc.timeline_id, &self.layer_name(), ) } @@ -653,7 +666,7 @@ impl DeltaLayerWriterInner { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? 
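Editor's note: the `short_id() -> String` method is dropped from the `Layer` trait in favour of requiring `std::fmt::Display`, with each concrete layer forwarding to its descriptor. That lets log sites write `{layer_file_name}` or `{layer}` directly (as the earlier `remote_timeline_client` hunks do) instead of allocating a `String` per call. A minimal sketch of the delegation pattern; the descriptor fields and formatting here are invented:

```rust
use std::fmt;

// Placeholder for `PersistentLayerDesc`.
struct LayerDesc {
    key_start: u64,
    key_end: u64,
    lsn: u64,
}

impl fmt::Display for LayerDesc {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:016X}-{:016X}__{:016X}", self.key_start, self.key_end, self.lsn)
    }
}

struct ImageLayer {
    desc: LayerDesc,
}

// Boilerplate mirrored from the diff: each layer type's Display forwards to
// its descriptor, so log sites can interpolate the layer directly.
impl fmt::Display for ImageLayer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.desc)
    }
}

fn main() {
    let layer = ImageLayer {
        desc: LayerDesc { key_start: 0, key_end: 0x40, lsn: 0x578C_6B29 },
    };
    println!("scheduled layer file upload {layer}");
}
```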
- let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range); + let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range); let mut file = VirtualFile::create(&path)?; // make room for the header block @@ -768,8 +781,8 @@ impl DeltaLayerWriterInner { // FIXME: throw an error instead? let final_path = DeltaLayer::path_for( &PathOrConf::Conf(self.conf), - self.timeline_id, - self.tenant_id, + &self.tenant_id, + &self.timeline_id, &DeltaFileName { key_range: self.key_start..key_end, lsn_range: self.lsn_range, @@ -796,7 +809,7 @@ impl DeltaLayerWriterInner { /// /// # Note /// -/// As described in https://github.com/neondatabase/neon/issues/2650, it's +/// As described in , it's /// possible for the writer to drop before `finish` is actually called. So this /// could lead to odd temporary files in the directory, exhausting file system. /// This structure wraps `DeltaLayerWriterInner` and also contains `Drop` diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index 5dcd54689e..843bb1f631 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -57,8 +57,9 @@ impl Ord for DeltaFileName { /// Represents the filename of a DeltaLayer /// +/// ```text /// -__- -/// +/// ``` impl DeltaFileName { /// /// Parse a string as a delta file name. Returns None if the filename does not @@ -162,7 +163,9 @@ impl ImageFileName { /// /// Represents the filename of an ImageLayer /// +/// ```text /// -__ +/// ``` impl ImageFileName { /// /// Parse a string as an image file name. Returns None if the filename does not @@ -210,9 +213,15 @@ pub enum LayerFileName { impl LayerFileName { pub fn file_name(&self) -> String { + self.to_string() + } +} + +impl fmt::Display for LayerFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Image(fname) => fname.to_string(), - Self::Delta(fname) => fname.to_string(), + Self::Image(fname) => write!(f, "{fname}"), + Self::Delta(fname) => write!(f, "{fname}"), } } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 07a16a7de2..b8601af818 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -7,11 +7,15 @@ //! timelines/ directory. Currently, there are no //! subdirectories, and each image layer file is named like this: //! +//! ```text //! -__ +//! ``` //! //! For example: //! +//! ```text //! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 +//! ``` //! //! Every image layer file consists of three parts: "summary", //! "index", and "values". 
The summary is a fixed size header at the @@ -53,7 +57,9 @@ use utils::{ }; use super::filename::ImageFileName; -use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc}; +use super::{ + AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc, +}; /// /// Header stored in the beginning of the file @@ -153,12 +159,14 @@ impl Layer for ImageLayer { /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( - "----- image layer for ten {} tli {} key {}-{} at {} ----", + "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----", self.desc.tenant_id, self.desc.timeline_id, self.desc.key_range.start, self.desc.key_range.end, - self.lsn + self.lsn, + self.desc.is_incremental, + self.desc.file_size ); if !verbose { @@ -230,18 +238,22 @@ impl Layer for ImageLayer { fn is_incremental(&self) -> bool { self.layer_desc().is_incremental } +} - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. - fn short_id(&self) -> String { - self.layer_desc().short_id() +/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. +impl std::fmt::Display for ImageLayer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.layer_desc().short_id()) + } +} + +impl AsLayerDesc for ImageLayer { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } } impl PersistentLayer for ImageLayer { - fn layer_desc(&self) -> &PersistentLayerDesc { - &self.desc - } - fn local_path(&self) -> Option { Some(self.path()) } @@ -284,7 +296,7 @@ impl ImageLayer { match path_or_conf { PathOrConf::Path(path) => path.to_path_buf(), PathOrConf::Conf(conf) => conf - .timeline_path(&timeline_id, &tenant_id) + .timeline_path(&tenant_id, &timeline_id) .join(fname.to_string()), } } @@ -301,7 +313,7 @@ impl ImageLayer { .map(char::from) .collect(); - conf.timeline_path(&timeline_id, &tenant_id) + conf.timeline_path(&tenant_id, &timeline_id) .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } @@ -652,7 +664,7 @@ impl ImageLayerWriterInner { /// /// # Note /// -/// As described in https://github.com/neondatabase/neon/issues/2650, it's +/// As described in , it's /// possible for the writer to drop before `finish` is actually called. So this /// could lead to odd temporary files in the directory, exhausting file system. 
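Editor's note: the writer comments in these files explain why `DeltaLayerWriterInner`/`ImageLayerWriterInner` are wrapped in an outer type with a `Drop` impl: the inner writer may be dropped before `finish()` runs, stranding temporary files. A generic sketch of that guard pattern, with invented names and far less bookkeeping than the real writers:

```rust
use std::fs;
use std::path::{Path, PathBuf};

struct TempFileGuard {
    path: PathBuf,
    finished: bool,
}

impl TempFileGuard {
    fn create(path: PathBuf) -> std::io::Result<Self> {
        fs::File::create(&path)?;
        Ok(Self { path, finished: false })
    }

    /// Rename the temp file into place and disarm the guard.
    fn finish(mut self, final_path: &Path) -> std::io::Result<()> {
        fs::rename(&self.path, final_path)?;
        self.finished = true;
        Ok(())
    }
}

impl Drop for TempFileGuard {
    fn drop(&mut self) {
        if !self.finished {
            // Best-effort cleanup of the abandoned temp file; never panic in Drop.
            let _ = fs::remove_file(&self.path);
        }
    }
}

fn main() -> std::io::Result<()> {
    let guard = TempFileGuard::create(PathBuf::from("layer.temp"))?;
    guard.finish(Path::new("layer.final"))
}
```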
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop` diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 78bcfdafc0..77778822cf 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -131,13 +131,6 @@ impl Layer for InMemoryLayer { true } - fn short_id(&self) -> String { - let inner = self.inner.read().unwrap(); - - let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); - format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) - } - /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { let inner = self.inner.read().unwrap(); @@ -240,6 +233,15 @@ impl Layer for InMemoryLayer { } } +impl std::fmt::Display for InMemoryLayer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let inner = self.inner.read().unwrap(); + + let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + } +} + impl InMemoryLayer { /// /// Get layer size on the disk diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 5ed548909e..42c3925b73 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use core::fmt::Display; use std::ops::Range; use utils::{ id::{TenantId, TimelineId}, @@ -48,8 +49,8 @@ impl PersistentLayerDesc { } } - pub fn short_id(&self) -> String { - self.filename().file_name() + pub fn short_id(&self) -> impl Display { + self.filename() } #[cfg(test)] @@ -173,13 +174,16 @@ impl PersistentLayerDesc { pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { println!( - "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----", self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn_range.start, - self.lsn_range.end + self.lsn_range.end, + self.is_delta, + self.is_incremental, + self.file_size, ); Ok(()) diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 9d423ed815..d3c40d93bb 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -4,9 +4,9 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::repository::Key; -use crate::tenant::layer_map::BatchedUpdates; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::timeline::layer_manager::LayerManager; use anyhow::{bail, Result}; use pageserver_api::models::HistoricLayerInfo; use std::ops::Range; @@ -20,12 +20,12 @@ use utils::{ use super::filename::{DeltaFileName, ImageFileName}; use super::{ - DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, - LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, + AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, + LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, }; /// RemoteLayer is a not yet downloaded [`ImageLayer`] or -/// [`crate::storage_layer::DeltaLayer`]. 
+/// [`DeltaLayer`](super::DeltaLayer). /// /// RemoteLayer might be downloaded on-demand during operations which are /// allowed download remote layers and during which, it gets replaced with a @@ -50,6 +50,8 @@ pub struct RemoteLayer { /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids /// a possible fast loop between `Timeline::get_reconstruct_data` and /// `Timeline::download_remote_layer`, which also logs. + /// + /// [`ongoing_download`]: Self::ongoing_download pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool, } @@ -71,22 +73,22 @@ impl Layer for RemoteLayer { _reconstruct_state: &mut ValueReconstructState, _ctx: &RequestContext, ) -> Result { - bail!( - "layer {} needs to be downloaded", - self.filename().file_name() - ); + bail!("layer {self} needs to be downloaded"); } /// debugging function to print out the contents of the layer fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { println!( - "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----", self.desc.tenant_id, self.desc.timeline_id, self.desc.key_range.start, self.desc.key_range.end, self.desc.lsn_range.start, - self.desc.lsn_range.end + self.desc.lsn_range.end, + self.desc.is_delta, + self.desc.is_incremental, + self.desc.file_size, ); Ok(()) @@ -106,18 +108,22 @@ impl Layer for RemoteLayer { fn is_incremental(&self) -> bool { self.layer_desc().is_incremental } +} - /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. - fn short_id(&self) -> String { - self.layer_desc().short_id() +/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. +impl std::fmt::Display for RemoteLayer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.layer_desc().short_id()) + } +} + +impl AsLayerDesc for RemoteLayer { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } } impl PersistentLayer for RemoteLayer { - fn layer_desc(&self) -> &PersistentLayerDesc { - &self.desc - } - fn local_path(&self) -> Option { None } @@ -220,7 +226,7 @@ impl RemoteLayer { /// Create a Layer struct representing this layer, after it has been downloaded. pub fn create_downloaded_layer( &self, - layer_map_lock_held_witness: &BatchedUpdates<'_>, + layer_map_lock_held_witness: &LayerManager, conf: &'static PageServerConf, file_size: u64, ) -> Arc { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 39c72a7e47..58144d9050 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,6 +1,8 @@ -//! 
- mod eviction_task; +pub mod layer_manager; +mod logical_size; +pub mod span; +pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; @@ -8,7 +10,6 @@ use bytes::Bytes; use fail::fail_point; use futures::StreamExt; use itertools::Itertools; -use once_cell::sync::OnceCell; use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus, @@ -17,18 +18,18 @@ use pageserver_api::models::{ use remote_storage::GenericRemoteStorage; use serde_with::serde_as; use storage_broker::BrokerClientChannel; -use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; +use tokio::sync::{oneshot, watch, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; use std::pin::pin; -use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; +use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; @@ -38,6 +39,7 @@ use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer, }; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ ephemeral_file::is_ephemeral_file, layer_map::{LayerMap, SearchResult}, @@ -79,14 +81,15 @@ use crate::{is_temporary, task_mgr}; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; +use self::layer_manager::LayerManager; +use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; -use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{ - DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey, + AsLayerDesc, DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, }; #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -120,80 +123,6 @@ impl PartialOrd for Hole { } } -pub struct LayerFileManager(HashMap>); - -impl LayerFileManager { - fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc { - // The assumption for the `expect()` is that all code maintains the following invariant: - // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. 
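In `layer_desc.rs` above, `short_id()` also changes from building a `String` out of `filename().file_name()` to returning the filename value itself as `impl Display`, so callers that only format it never pay for an intermediate allocation. A small sketch of that return-type pattern; the `FileName` type here is a hypothetical stand-in:

```rust
use std::fmt::{self, Display};

// Hypothetical stand-in for a layer file name.
struct FileName {
    key_start: u64,
    key_end: u64,
}

impl Display for FileName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:016X}-{:016X}", self.key_start, self.key_end)
    }
}

struct Desc {
    key_start: u64,
    key_end: u64,
}

impl Desc {
    fn filename(&self) -> FileName {
        FileName { key_start: self.key_start, key_end: self.key_end }
    }

    // Returning `impl Display` lets callers write `println!("{}", desc.short_id())`
    // without materialising an eager `String`.
    fn short_id(&self) -> impl Display {
        self.filename()
    }
}

fn main() {
    let desc = Desc { key_start: 0, key_end: 0x10 };
    println!("----- layer {} -----", desc.short_id());
}
```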
- self.0 - .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename().file_name())) - .expect("not found") - .clone() - } - - pub(crate) fn insert(&mut self, layer: Arc) { - let present = self.0.insert(layer.layer_desc().key(), layer.clone()); - if present.is_some() && cfg!(debug_assertions) { - panic!("overwriting a layer: {:?}", layer.layer_desc()) - } - } - - pub(crate) fn new() -> Self { - Self(HashMap::new()) - } - - pub(crate) fn remove(&mut self, layer: Arc) { - let present = self.0.remove(&layer.layer_desc().key()); - if present.is_none() && cfg!(debug_assertions) { - panic!( - "removing layer that is not present in layer mapping: {:?}", - layer.layer_desc() - ) - } - } - - pub(crate) fn replace_and_verify( - &mut self, - expected: Arc, - new: Arc, - ) -> Result<()> { - let key = expected.layer_desc().key(); - let other = new.layer_desc().key(); - - let expected_l0 = LayerMap::is_l0(expected.layer_desc()); - let new_l0 = LayerMap::is_l0(new.layer_desc()); - - fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!( - "layermap-replace-notfound" - )); - - anyhow::ensure!( - key == other, - "expected and new layer have different keys: {key:?} != {other:?}" - ); - - anyhow::ensure!( - expected_l0 == new_l0, - "one layer is l0 while the other is not: {expected_l0} != {new_l0}" - ); - - if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) { - anyhow::ensure!( - compare_arced_layers(&expected, layer), - "another layer was found instead of expected, expected={expected:?}, new={new:?}", - expected = Arc::as_ptr(&expected), - new = Arc::as_ptr(layer), - ); - *layer = new; - Ok(()) - } else { - anyhow::bail!("layer was not found"); - } - } -} - /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { @@ -205,7 +134,6 @@ fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { drop(rlock) } - pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -234,7 +162,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: Arc>, + pub(crate) layers: Arc>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. @@ -255,7 +183,7 @@ pub struct Timeline { walredo_mgr: Arc, /// Remote storage client. - /// See [`storage_sync`] module comment for details. + /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. pub remote_client: Option>, // What page versions do we hold in the repository? If we get a @@ -312,6 +240,8 @@ pub struct Timeline { /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). 
+ /// + /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected @@ -365,126 +295,6 @@ pub struct Timeline { initial_logical_size_attempt: Mutex>, } -/// Internal structure to hold all data needed for logical size calculation. -/// -/// Calculation consists of two stages: -/// -/// 1. Initial size calculation. That might take a long time, because it requires -/// reading all layers containing relation sizes at `initial_part_end`. -/// -/// 2. Collecting an incremental part and adding that to the initial size. -/// Increments are appended on walreceiver writing new timeline data, -/// which result in increase or decrease of the logical size. -struct LogicalSize { - /// Size, potentially slow to compute. Calculating this might require reading multiple - /// layers, and even ancestor's layers. - /// - /// NOTE: size at a given LSN is constant, but after a restart we will calculate - /// the initial size at a different LSN. - initial_logical_size: OnceCell, - - /// Semaphore to track ongoing calculation of `initial_logical_size`. - initial_size_computation: Arc, - - /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. - initial_part_end: Option, - - /// All other size changes after startup, combined together. - /// - /// Size shouldn't ever be negative, but this is signed for two reasons: - /// - /// 1. If we initialized the "baseline" size lazily, while we already - /// process incoming WAL, the incoming WAL records could decrement the - /// variable and temporarily make it negative. (This is just future-proofing; - /// the initialization is currently not done lazily.) - /// - /// 2. If there is a bug and we e.g. forget to increment it in some cases - /// when size grows, but remember to decrement it when it shrinks again, the - /// variable could go negative. In that case, it seems better to at least - /// try to keep tracking it, rather than clamp or overflow it. Note that - /// get_current_logical_size() will clamp the returned value to zero if it's - /// negative, and log an error. Could set it permanently to zero or some - /// special value to indicate "broken" instead, but this will do for now. - /// - /// Note that we also expose a copy of this value as a prometheus metric, - /// see `current_logical_size_gauge`. Use the `update_current_logical_size` - /// to modify this, it will also keep the prometheus metric in sync. - size_added_after_initial: AtomicI64, -} - -/// Normalized current size, that the data in pageserver occupies. -#[derive(Debug, Clone, Copy)] -enum CurrentLogicalSize { - /// The size is not yet calculated to the end, this is an intermediate result, - /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative, - /// yet total logical size cannot be below 0. - Approximate(u64), - // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are - // available for observation without any calculations. 
- Exact(u64), -} - -impl CurrentLogicalSize { - fn size(&self) -> u64 { - *match self { - Self::Approximate(size) => size, - Self::Exact(size) => size, - } - } -} - -impl LogicalSize { - fn empty_initial() -> Self { - Self { - initial_logical_size: OnceCell::with_value(0), - // initial_logical_size already computed, so, don't admit any calculations - initial_size_computation: Arc::new(Semaphore::new(0)), - initial_part_end: None, - size_added_after_initial: AtomicI64::new(0), - } - } - - fn deferred_initial(compute_to: Lsn) -> Self { - Self { - initial_logical_size: OnceCell::new(), - initial_size_computation: Arc::new(Semaphore::new(1)), - initial_part_end: Some(compute_to), - size_added_after_initial: AtomicI64::new(0), - } - } - - fn current_size(&self) -> anyhow::Result { - let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); - // ^^^ keep this type explicit so that the casts in this function break if - // we change the type. - match self.initial_logical_size.get() { - Some(initial_size) => { - initial_size.checked_add_signed(size_increment) - .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) - .map(CurrentLogicalSize::Exact) - } - None => { - let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); - Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) - } - } - } - - fn increment_size(&self, delta: i64) { - self.size_added_after_initial - .fetch_add(delta, AtomicOrdering::SeqCst); - } - - /// Make the value computed by initial logical size computation - /// available for re-use. This doesn't contain the incremental part. - fn initialized_size(&self, lsn: Lsn) -> Option { - match self.initial_part_end { - Some(v) if v == lsn => self.initial_logical_size.get().copied(), - _ => None, - } - } -} - pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, @@ -705,7 +515,7 @@ impl Timeline { /// Hence, the result **does not represent local filesystem usage**. 
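The `LogicalSize`/`CurrentLogicalSize` code removed above (moved into the new `logical_size` module) combines a once-initialized baseline with an atomic running delta; until the baseline exists, only the clamped delta is reported as an approximation. A condensed sketch of that combination, substituting `std::sync::OnceLock` for the crate's `once_cell` type and a plain `String` error for `anyhow`:

```rust
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::OnceLock;

enum CurrentSize {
    /// Baseline not yet computed; only the (clamped) incremental part is known.
    Approximate(u64),
    /// Baseline plus all increments observed since startup.
    Exact(u64),
}

struct LogicalSizeSketch {
    initial: OnceLock<u64>,
    added_after_initial: AtomicI64,
}

impl LogicalSizeSketch {
    fn new() -> Self {
        Self { initial: OnceLock::new(), added_after_initial: AtomicI64::new(0) }
    }

    fn increment(&self, delta: i64) {
        self.added_after_initial.fetch_add(delta, Ordering::SeqCst);
    }

    fn current_size(&self) -> Result<CurrentSize, String> {
        let delta = self.added_after_initial.load(Ordering::Acquire);
        match self.initial.get() {
            Some(initial) => initial
                .checked_add_signed(delta)
                .ok_or_else(|| format!("overflow: initial={initial}, delta={delta}"))
                .map(CurrentSize::Exact),
            // WAL applied before the baseline finishes can make the delta negative;
            // clamp to zero for the approximate answer.
            None => Ok(CurrentSize::Approximate(u64::try_from(delta).unwrap_or(0))),
        }
    }
}

fn main() {
    let size = LogicalSizeSketch::new();
    size.increment(4096);
    match size.current_size().unwrap() {
        CurrentSize::Approximate(n) => println!("approximate logical size: {n}"),
        CurrentSize::Exact(n) => println!("exact logical size: {n}"),
    }
    // Once the initial calculation has landed, results become exact.
    let _ = size.initial.set(8 * 1024);
    match size.current_size().unwrap() {
        CurrentSize::Approximate(n) => println!("approximate logical size: {n}"),
        CurrentSize::Exact(n) => println!("exact logical size: {n}"),
    }
}
```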
pub async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; - let (layer_map, _) = &*guard; + let layer_map = guard.layer_map(); let mut size = 0; for l in layer_map.iter_historic_layers() { size += l.file_size(); @@ -1016,7 +826,7 @@ impl Timeline { let last_lsn = self.get_last_record_lsn(); let open_layer_size = { let guard = self.layers.read().await; - let (layers, _) = &*guard; + let layers = guard.layer_map(); let Some(open_layer) = layers.open_layer.as_ref() else { return Ok(()); }; @@ -1148,7 +958,7 @@ impl Timeline { pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { let guard = self.layers.read().await; - let (layer_map, mapping) = &*guard; + let layer_map = guard.layer_map(); let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1159,7 +969,7 @@ impl Timeline { let mut historic_layers = Vec::new(); for historic_layer in layer_map.iter_historic_layers() { - let historic_layer = mapping.get_from_desc(&historic_layer); + let historic_layer = guard.get_from_desc(&historic_layer); historic_layers.push(historic_layer.info(reset)); } @@ -1169,10 +979,14 @@ impl Timeline { } } - #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) }; - let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) }; + let Some(layer) = self.find_layer(layer_file_name).await else { + return Ok(None); + }; + let Some(remote_layer) = layer.downcast_remote_layer() else { + return Ok(Some(false)); + }; if self.remote_client.is_none() { return Ok(Some(false)); } @@ -1181,10 +995,12 @@ impl Timeline { Ok(Some(true)) } - /// Like [`evict_layer_batch`], but for just one layer. + /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer. /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { - let Some(local_layer) = self.find_layer(layer_file_name).await else { return Ok(None) }; + let Some(local_layer) = self.find_layer(layer_file_name).await else { + return Ok(None); + }; let remote_client = self .remote_client .as_ref() @@ -1205,9 +1021,9 @@ impl Timeline { /// Evict a batch of layers. /// - /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured." + /// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured." 
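The doc comment above leans on the witness pattern: `evict_layers` takes a `&GenericRemoteStorage` it never reads, purely so that the type system proves remote storage was configured before eviction can be requested. A toy illustration of the idea; the types here are hypothetical, not the pageserver's:

```rust
// Hypothetical handle: the only way to obtain one is to configure remote storage.
struct RemoteStorage;

impl RemoteStorage {
    fn from_config(bucket: &str) -> Option<RemoteStorage> {
        if bucket.is_empty() {
            None
        } else {
            Some(RemoteStorage)
        }
    }
}

// The `_witness` parameter is never read; passing it proves, at the type level,
// that remote storage is configured before anything local may be evicted.
fn evict_layers(_witness: &RemoteStorage, layer_names: &[&str]) {
    for name in layer_names {
        println!("evicting {name} (a remote copy exists)");
    }
}

fn main() {
    let Some(storage) = RemoteStorage::from_config("my-bucket") else {
        println!("remote storage not configured; eviction unavailable");
        return;
    };
    evict_layers(&storage, &["000000000000000000-000000000000000010"]);
}
```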
/// - /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html + /// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html pub async fn evict_layers( &self, _: &GenericRemoteStorage, @@ -1270,27 +1086,18 @@ impl Timeline { // start the batch update let mut guard = self.layers.write().await; - let (layer_map, mapping) = &mut *guard; - let mut batch_updates = layer_map.batch_update(); - let mut results = Vec::with_capacity(layers_to_evict.len()); for l in layers_to_evict.iter() { let res = if cancel.is_cancelled() { None } else { - Some(self.evict_layer_batch_impl( - &layer_removal_guard, - l, - &mut batch_updates, - mapping, - )) + Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut guard)) }; results.push(res); } // commit the updates & release locks - batch_updates.flush(); drop_wlock(guard); drop(layer_removal_guard); @@ -1302,8 +1109,7 @@ impl Timeline { &self, _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, local_layer: &Arc, - batch_updates: &mut BatchedUpdates<'_>, - mapping: &mut LayerFileManager, + layer_mgr: &mut LayerManager, ) -> anyhow::Result { if local_layer.is_remote_layer() { // TODO(issue #3851): consider returning an err here instead of false, @@ -1339,7 +1145,7 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), + .clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted), ), LayerFileName::Delta(delta_name) => RemoteLayer::new_delta( self.tenant_id, @@ -1348,13 +1154,13 @@ impl Timeline { &layer_metadata, local_layer .access_stats() - .clone_for_residence_change(batch_updates, LayerResidenceStatus::Evicted), + .clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted), ), }); assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc()); - let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) { + let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) { Ok(()) => { if let Err(e) = local_layer.delete_resident_layer_file() { error!("failed to remove layer file on evict after replacement: {e:#?}"); @@ -1381,9 +1187,9 @@ impl Timeline { .read() .unwrap() .observe(delta); - info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); + info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period"); } else { - info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period"); + info!(layer=%local_layer, "evicted layer after unknown residence period"); } true @@ -1525,10 +1331,7 @@ impl Timeline { timeline_id, tenant_id, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(( - LayerMap::default(), - LayerFileManager::new(), - ))), + layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), wanted_image_layers: Mutex::new(None), walredo_mgr, @@ -1657,7 +1460,7 @@ impl Timeline { *flush_loop_state = FlushLoopState::Exited; Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + .instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id)) ); } @@ -1713,7 +1516,7 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.0.next_open_layer_at = 
Some(Lsn(start_lsn.0)); + layers.initialize_empty(Lsn(start_lsn.0)); } /// @@ -1721,18 +1524,18 @@ impl Timeline { /// pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut guard = self.layers.write().await; - let (layers, mapping) = &mut *guard; - let mut updates = layers.batch_update(); let mut num_layers = 0; let timer = self.metrics.load_layer_map_histo.start_timer(); // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); // total size of layer files in the current timeline directory let mut total_physical_size = 0; + let mut loaded_layers = Vec::>::new(); + for direntry in fs::read_dir(timeline_path)? { let direntry = direntry?; let direntry_path = direntry.path(); @@ -1759,12 +1562,12 @@ impl Timeline { self.tenant_id, &imgfilename, file_size, - LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - self.insert_historic_layer(Arc::new(layer), &mut updates, mapping); + loaded_layers.push(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1791,12 +1594,12 @@ impl Timeline { self.tenant_id, &deltafilename, file_size, - LayerAccessStats::for_loading_layer(&updates, LayerResidenceStatus::Resident), + LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident), ); trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - self.insert_historic_layer(Arc::new(layer), &mut updates, mapping); + loaded_layers.push(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1822,8 +1625,7 @@ impl Timeline { } } - updates.flush(); - layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); + guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1); info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -1851,8 +1653,9 @@ impl Timeline { // We're holding a layer map lock for a while but this // method is only called during init so it's fine. 
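`load_layer_map` above now collects every recognised on-disk layer into a `Vec` first and hands the whole batch to `LayerManager::initialize_local_layers`, instead of interleaving directory scanning with layer-map updates. A simplified sketch of that collect-then-initialize flow; file-name classification is reduced to extension checks here, whereas the real code parses image and delta layer names:

```rust
use std::fs;
use std::path::{Path, PathBuf};

enum LoadedLayer {
    Image(PathBuf),
    Delta(PathBuf),
}

fn scan_timeline_dir(timeline_path: &Path) -> std::io::Result<(Vec<LoadedLayer>, u64)> {
    let mut loaded = Vec::new();
    let mut total_physical_size = 0;

    for entry in fs::read_dir(timeline_path)? {
        let entry = entry?;
        let path = entry.path();
        let file_size = entry.metadata()?.len();

        // Stand-in classification; the real code parses ImageFileName / DeltaFileName.
        match path.extension().and_then(|e| e.to_str()) {
            Some("img") => {
                total_physical_size += file_size;
                loaded.push(LoadedLayer::Image(path));
            }
            Some("delta") => {
                total_physical_size += file_size;
                loaded.push(LoadedLayer::Delta(path));
            }
            _ => { /* metadata, temp files, etc. are skipped or cleaned up */ }
        }
    }

    Ok((loaded, total_physical_size))
}

fn main() -> std::io::Result<()> {
    let (layers, total) = scan_timeline_dir(Path::new("."))?;
    for layer in &layers {
        match layer {
            LoadedLayer::Image(p) => println!("image layer: {}", p.display()),
            LoadedLayer::Delta(p) => println!("delta layer: {}", p.display()),
        }
    }
    println!("loaded {} layers, total physical size {total}", layers.len());
    // The pageserver would now hand the whole batch to
    // `LayerManager::initialize_local_layers(loaded_layers, disk_consistent_lsn + 1)`.
    Ok(())
}
```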
let mut guard = self.layers.write().await; - let (layer_map, mapping) = &mut *guard; - let mut updates = layer_map.batch_update(); + + let mut corrupted_local_layers = Vec::new(); + let mut added_remote_layers = Vec::new(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1896,7 +1699,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - self.remove_historic_layer(local_layer, &mut updates, mapping); + corrupted_local_layers.push(local_layer); // fall-through to adding the remote layer } } else { @@ -1928,14 +1731,10 @@ impl Timeline { self.timeline_id, imgfilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer( - &updates, - LayerResidenceStatus::Evicted, - ), + LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - - self.insert_historic_layer(remote_layer, &mut updates, mapping); + added_remote_layers.push(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1956,18 +1755,14 @@ impl Timeline { self.timeline_id, deltafilename, &remote_layer_metadata, - LayerAccessStats::for_loading_layer( - &updates, - LayerResidenceStatus::Evicted, - ), + LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - self.insert_historic_layer(remote_layer, &mut updates, mapping); + added_remote_layers.push(remote_layer); } } } - - updates.flush(); + guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers); Ok(local_only_layers) } @@ -1982,7 +1777,7 @@ impl Timeline { /// 3. Schedule upload of local-only layer files (which will then also update the remote /// IndexPart to include the new layer files). /// - /// Refer to the `storage_sync` module comment for more context. + /// Refer to the [`remote_timeline_client`] module comment for more context. 
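The reconciliation pass above splits its decisions into two lists, `corrupted_local_layers` and `added_remote_layers`, and applies them in a single `initialize_remote_layers` call. A schematic sketch of the set logic under simplified types, with sizes keyed by file name; the real code also renames mismatching local files out of the way before registering the remote copy:

```rust
use std::collections::HashMap;

/// For each layer listed in the remote index, decide whether the local copy can be kept.
/// Returns (local layers to drop as corrupted, remote layers to register).
fn reconcile(
    local: &HashMap<String, u64>,  // file name -> local size
    remote: &HashMap<String, u64>, // file name -> size recorded in the remote index
) -> (Vec<String>, Vec<String>) {
    let mut corrupted_local = Vec::new();
    let mut added_remote = Vec::new();

    for (name, remote_size) in remote {
        match local.get(name) {
            Some(local_size) if local_size == remote_size => {
                // Sizes agree: keep using the local file, nothing to do.
            }
            Some(_) => {
                // Size mismatch: treat the local file as corrupt and fall back to remote.
                corrupted_local.push(name.clone());
                added_remote.push(name.clone());
            }
            None => {
                // Not present locally: register an evicted remote layer.
                added_remote.push(name.clone());
            }
        }
    }
    (corrupted_local, added_remote)
}

fn main() {
    let local = HashMap::from([("a".to_string(), 10), ("b".to_string(), 7)]);
    let remote = HashMap::from([
        ("a".to_string(), 10),
        ("b".to_string(), 9),
        ("c".to_string(), 3),
    ]);
    let (corrupted, added) = reconcile(&local, &remote);
    println!("corrupted locally: {corrupted:?}, to register as remote: {added:?}");
}
```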
/// /// # TODO /// May be a bit cleaner to do things based on populated remote client, @@ -2003,10 +1798,10 @@ impl Timeline { let local_layers = { let guard = self.layers.read().await; - let (layers, mapping) = &*guard; + let layers = guard.layer_map(); layers .iter_historic_layers() - .map(|l| (l.filename(), mapping.get_from_desc(&l))) + .map(|l| (l.filename(), guard.get_from_desc(&l))) .collect::>() }; @@ -2239,7 +2034,7 @@ impl Timeline { ctx: &RequestContext, cancel: CancellationToken, ) -> Result { - debug_assert_current_span_has_tenant_and_timeline_id(); + span::debug_assert_current_span_has_tenant_and_timeline_id(); let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -2324,7 +2119,7 @@ impl Timeline { fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { if !self .conf - .metadata_path(self.timeline_id, self.tenant_id) + .metadata_path(&self.tenant_id, &self.timeline_id) .exists() { error!("timeline-calculate-logical-size-pre metadata file does not exist") @@ -2380,70 +2175,15 @@ impl Timeline { async fn find_layer(&self, layer_file_name: &str) -> Option> { let guard = self.layers.read().await; - let (layers, mapping) = &*guard; - for historic_layer in layers.iter_historic_layers() { + for historic_layer in guard.layer_map().iter_historic_layers() { let historic_layer_name = historic_layer.filename().file_name(); if layer_file_name == historic_layer_name { - return Some(mapping.get_from_desc(&historic_layer)); + return Some(guard.get_from_desc(&historic_layer)); } } None } - - /// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future - /// after we introduce `LayerMapManager`. - fn insert_historic_layer( - &self, - layer: Arc, - updates: &mut BatchedUpdates<'_>, - mapping: &mut LayerFileManager, - ) { - updates.insert_historic(layer.layer_desc().clone()); - mapping.insert(layer); - } - - /// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future - /// after we introduce `LayerMapManager`. - fn remove_historic_layer( - &self, - layer: Arc, - updates: &mut BatchedUpdates<'_>, - mapping: &mut LayerFileManager, - ) { - updates.remove_historic(layer.layer_desc().clone()); - mapping.remove(layer); - } - - /// Removes the layer from local FS (if present) and from memory. - /// Remote storage is not affected by this operation. - fn delete_historic_layer( - &self, - // we cannot remove layers otherwise, since gc and compaction will race - _layer_removal_cs: Arc>, - layer: Arc, - updates: &mut BatchedUpdates<'_>, - mapping: &mut LayerFileManager, - ) -> anyhow::Result<()> { - let layer = mapping.get_from_desc(&layer); - if !layer.is_remote_layer() { - layer.delete_resident_layer_file()?; - let layer_file_size = layer.file_size(); - self.metrics - .resident_physical_size_gauge - .sub(layer_file_size); - } - - // TODO Removing from the bottom of the layer map is expensive. - // Maybe instead discard all layer map historic versions that - // won't be needed for page reconstruction for this timeline, - // and mark what we can't delete yet as deleted from the layer - // map index without actually rebuilding the index. 
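With `LayerManager`, lookups such as `find_layer` iterate the descriptor-level `LayerMap` and resolve each hit back to the concrete layer object through the same guard (`get_from_desc`), rather than destructuring a `(LayerMap, LayerFileManager)` tuple. A reduced sketch of that lookup shape; the types are stand-ins:

```rust
use std::collections::HashMap;
use std::sync::Arc;

struct LayerDesc {
    key: String,       // stands in for PersistentLayerKey
    file_name: String,
}

struct LayerObject; // stands in for a concrete persistent layer

struct LayerManagerSketch {
    // stands in for LayerMap::iter_historic_layers()
    descs: Vec<Arc<LayerDesc>>,
    // stands in for LayerFileManager
    by_key: HashMap<String, Arc<LayerObject>>,
}

impl LayerManagerSketch {
    fn get_from_desc(&self, desc: &LayerDesc) -> Arc<LayerObject> {
        // Invariant: a descriptor in the layer map always has a backing layer object.
        self.by_key
            .get(&desc.key)
            .expect("descriptor present in LayerMap but missing from LayerFileManager")
            .clone()
    }

    fn find_layer(&self, layer_file_name: &str) -> Option<Arc<LayerObject>> {
        for desc in &self.descs {
            if desc.file_name == layer_file_name {
                return Some(self.get_from_desc(desc));
            }
        }
        None
    }
}

fn main() {
    let mgr = LayerManagerSketch {
        descs: vec![Arc::new(LayerDesc { key: "k1".into(), file_name: "layer-1".into() })],
        by_key: HashMap::from([("k1".to_string(), Arc::new(LayerObject))]),
    };
    println!("found layer-1: {}", mgr.find_layer("layer-1").is_some());
}
```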
- updates.remove_historic(layer.layer_desc().clone()); - mapping.remove(layer); - - Ok(()) - } } type TraversalId = String; @@ -2462,11 +2202,7 @@ impl TraversalLayerExt for Arc { format!("{}", local_path.display()) } None => { - format!( - "remote {}/{}", - self.get_timeline_id(), - self.filename().file_name() - ) + format!("remote {}/{self}", self.get_timeline_id()) } } } @@ -2474,11 +2210,7 @@ impl TraversalLayerExt for Arc { impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!( - "timeline {} in-memory {}", - self.get_timeline_id(), - self.short_id() - ) + format!("timeline {} in-memory {self}", self.get_timeline_id()) } } @@ -2626,7 +2358,7 @@ impl Timeline { 'layer_map_search: loop { let remote_layer = { let guard = timeline.layers.read().await; - let (layers, mapping) = &*guard; + let layers = guard.layer_map(); // Check the open and frozen in-memory layers first, in order from newest // to oldest. @@ -2688,7 +2420,7 @@ impl Timeline { } if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { - let layer = mapping.get_from_desc(&layer); + let layer = guard.get_from_desc(&layer); // If it's a remote layer, download it and retry. if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) @@ -2811,52 +2543,13 @@ impl Timeline { /// async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { let mut guard = self.layers.write().await; - let (layers, _) = &mut *guard; - - ensure!(lsn.is_aligned()); - - let last_record_lsn = self.get_last_record_lsn(); - ensure!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}", + let layer = guard.get_layer_for_write( lsn, - last_record_lsn, - std::backtrace::Backtrace::force_capture(), - ); - - // Do we have a layer open for writing already? - let layer; - if let Some(open_layer) = &layers.open_layer { - if open_layer.get_lsn_range().start > lsn { - bail!( - "unexpected open layer in the future: open layers starts at {}, write lsn {}", - open_layer.get_lsn_range().start, - lsn - ); - } - - layer = Arc::clone(open_layer); - } else { - // No writeable layer yet. Create one. - let start_lsn = layers - .next_open_layer_at - .context("No next open layer found")?; - - trace!( - "creating layer for write at {}/{} for record at {}", - self.timeline_id, - start_lsn, - lsn - ); - let new_layer = - InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; - let layer_rc = Arc::new(new_layer); - - layers.open_layer = Some(Arc::clone(&layer_rc)); - layers.next_open_layer_at = None; - - layer = layer_rc; - } + self.get_last_record_lsn(), + self.conf, + self.timeline_id, + self.tenant_id, + )?; Ok(layer) } @@ -2889,21 +2582,7 @@ impl Timeline { Some(self.write_lock.lock().await) }; let mut guard = self.layers.write().await; - let (layers, _) = &mut *guard; - if let Some(open_layer) = &layers.open_layer { - let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? - let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); - open_layer.freeze(end_lsn); - - // The layer is no longer open, update the layer map to reflect this. - // We will replace it with on-disk historics below. 
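`get_reconstruct_data` above keeps its `'layer_map_search` structure: search the layer map under the read guard, and if the hit is a not-yet-downloaded `RemoteLayer`, download it (which replaces the map entry) and search again. A toy model of that retry loop with a plain `HashMap` standing in for the layer map:

```rust
use std::collections::HashMap;

enum Layer {
    Local { data: String },
    Remote,
}

/// Stand-in for `download_remote_layer` + `replace_and_verify`: the remote
/// entry is swapped for a resident one in the map.
fn download(key: &str, layers: &mut HashMap<String, Layer>) {
    layers.insert(key.to_string(), Layer::Local { data: format!("contents of {key}") });
}

/// Search, and if the match is a not-yet-downloaded remote layer,
/// download it and search again.
fn get_reconstruct_data(key: &str, layers: &mut HashMap<String, Layer>) -> Option<String> {
    loop {
        // Immutable lookup first; the borrow ends before any download below.
        if let Layer::Local { data } = layers.get(key)? {
            return Some(data.clone());
        }
        // The layer exists but is remote: fetch it, replace the entry, retry.
        download(key, layers);
    }
}

fn main() {
    let mut layers = HashMap::from([("delta-7".to_string(), Layer::Remote)]);
    println!("{}", get_reconstruct_data("delta-7", &mut layers).unwrap());
}
```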
- layers.frozen_layers.push_back(open_layer_rc); - layers.open_layer = None; - layers.next_open_layer_at = Some(end_lsn); - self.last_freeze_at.store(end_lsn); - } - drop_wlock(guard); + guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at); } /// Layer flusher task's main loop. @@ -2928,18 +2607,15 @@ impl Timeline { let result = loop { let layer_to_flush = { let guard = self.layers.read().await; - let (layers, _) = &*guard; - layers.frozen_layers.front().cloned() + guard.layer_map().frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; - if let Some(layer_to_flush) = layer_to_flush { - if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { - error!("could not flush frozen layer: {err:?}"); - break Err(err); - } - continue; - } else { + let Some(layer_to_flush) = layer_to_flush else { break Ok(()); + }; + if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { + error!("could not flush frozen layer: {err:?}"); + break Err(err); } }; // Notify any listeners that we're done @@ -2998,7 +2674,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, @@ -3048,15 +2724,20 @@ impl Timeline { HashMap::from([(delta_path, metadata)]) }; - fail_point!("flush-frozen-before-sync"); + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, + // a compaction can delete the file and then it won't be available for uploads any more. + // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this + // race situation. + // See https://github.com/neondatabase/neon/issues/4526 + + pausable_failpoint!("flush-frozen-before-sync"); // The new on-disk layers are now in the layer map. We can remove the - // in-memory layer from the map now. We do not modify `LayerFileManager` because - // it only contains persistent layers. The flushed layer is stored in + // in-memory layer from the map now. The flushed layer is stored in // the mapping in `create_delta_layer`. { - let mut layers = self.layers.write().await; - let l = layers.0.frozen_layers.pop_front(); + let mut guard = self.layers.write().await; + let l = guard.layer_map_mut().frozen_layers.pop_front(); // Only one thread may call this function at a time (for this // timeline). 
If two threads tried to flush the same frozen @@ -3131,8 +2812,8 @@ impl Timeline { save_metadata( self.conf, - self.timeline_id, - self.tenant_id, + &self.tenant_id, + &self.timeline_id, &metadata, false, ) @@ -3181,7 +2862,7 @@ impl Timeline { par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; par_fsync::par_fsync(&[self_clone .conf - .timeline_path(&self_clone.timeline_id, &self_clone.tenant_id)]) + .timeline_path(&self_clone.tenant_id, &self_clone.timeline_id)]) .context("fsync of timeline dir")?; anyhow::Ok(new_delta) @@ -3195,15 +2876,12 @@ impl Timeline { // Add it to the layer map let l = Arc::new(new_delta); let mut guard = self.layers.write().await; - let (layers, mapping) = &mut *guard; - let mut batch_updates = layers.batch_update(); l.access_stats().record_residence_event( - &batch_updates, + &guard, LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - self.insert_historic_layer(l, &mut batch_updates, mapping); - batch_updates.flush(); + guard.track_new_l0_delta_layer(l); // update metrics self.metrics.resident_physical_size_gauge.add(sz); @@ -3252,7 +2930,7 @@ impl Timeline { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; - let (layers, _) = &*guard; + let layers = guard.layer_map(); let mut max_deltas = 0; { @@ -3424,18 +3102,16 @@ impl Timeline { .await .context("fsync of newly created layer files")?; - par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)]) .await .context("fsync of timeline dir")?; let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut guard = self.layers.write().await; - let (layers, mapping) = &mut *guard; - let mut updates = layers.batch_update(); - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); - for l in image_layers { + for l in &image_layers { let path = l.filename(); let metadata = timeline_path .join(path.file_name()) @@ -3449,13 +3125,12 @@ impl Timeline { .add(metadata.len()); let l = Arc::new(l); l.access_stats().record_residence_event( - &updates, + &guard, LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - self.insert_historic_layer(l, &mut updates, mapping); } - updates.flush(); + guard.track_new_image_layers(image_layers); drop_wlock(guard); timer.stop_and_record(); @@ -3465,7 +3140,7 @@ impl Timeline { #[derive(Default)] struct CompactLevel0Phase1Result { - new_layers: Vec, + new_layers: Vec>, deltas_to_compact: Vec>, } @@ -3614,21 +3289,23 @@ impl Timeline { /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the /// start of level0 files compaction, the on-demand download should be revisited as well. 
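The rewritten flush loop above uses `let … else` to exit as soon as no frozen layer is queued, otherwise flushing the front of `frozen_layers`. A stripped-down model of that loop over a `VecDeque`; the error type and the flush step are placeholders:

```rust
use std::collections::VecDeque;

struct FrozenLayer {
    name: String,
}

fn flush(layer: &FrozenLayer) -> Result<(), String> {
    println!("flushing frozen layer {}", layer.name);
    Ok(())
}

fn flush_loop(frozen_layers: &mut VecDeque<FrozenLayer>) -> Result<(), String> {
    loop {
        // Peek at the oldest frozen layer; stop when the queue is empty.
        let Some(layer_to_flush) = frozen_layers.front() else {
            break Ok(());
        };
        if let Err(err) = flush(layer_to_flush) {
            eprintln!("could not flush frozen layer: {err}");
            break Err(err);
        }
        // Only after a successful flush is the layer removed from the queue.
        frozen_layers.pop_front();
    }
}

fn main() {
    let mut queue = VecDeque::from([
        FrozenLayer { name: "inmem-01".into() },
        FrozenLayer { name: "inmem-02".into() },
    ]);
    flush_loop(&mut queue).unwrap();
    println!("remaining frozen layers: {}", queue.len());
}
```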
+ /// + /// [`compact_inner`]: Self::compact_inner fn compact_level0_phase1( self: Arc, _layer_removal_cs: Arc>, - guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>, + guard: tokio::sync::OwnedRwLockReadGuard, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let (layers, mapping) = &*guard; + let layers = guard.layer_map(); let level0_deltas = layers.get_level0_deltas()?; let mut level0_deltas = level0_deltas .into_iter() - .map(|x| mapping.get_from_desc(&x)) + .map(|x| guard.get_from_desc(&x)) .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. @@ -3641,6 +3318,37 @@ impl Timeline { return Ok(CompactLevel0Phase1Result::default()); } + // This failpoint is used together with `test_duplicate_layers` integration test. + // It returns the compaction result exactly the same layers as input to compaction. + // We want to ensure that this will not cause any problem when updating the layer map + // after the compaction is finished. + // + // Currently, there are two rare edge cases that will cause duplicated layers being + // inserted. + // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which + // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer + // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this + // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, + // and this causes an overwrite. This is acceptable because the content is the same, and we should do a + // layer replace instead of the normal remove / upload process. + // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file + // size length. Compaction will likely create the same set of n files afterwards. + // + // This failpoint is a superset of both of the cases. + fail_point!("compact-level0-phase1-return-same", |_| { + println!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint + Ok(CompactLevel0Phase1Result { + new_layers: level0_deltas + .iter() + .map(|x| x.clone().downcast_delta_layer().unwrap()) + .collect(), + deltas_to_compact: level0_deltas + .iter() + .map(|x| x.layer_desc().clone().into()) + .collect(), + }) + }); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -3677,7 +3385,7 @@ impl Timeline { let remotes = deltas_to_compact .iter() .filter(|l| l.is_remote_layer()) - .inspect(|l| info!("compact requires download of {}", l.filename().file_name())) + .inspect(|l| info!("compact requires download of {l}")) .map(|l| { l.clone() .downcast_remote_layer() @@ -3701,7 +3409,7 @@ impl Timeline { ); for l in deltas_to_compact.iter() { - info!("compact includes {}", l.filename().file_name()); + info!("compact includes {l}"); } // We don't need the original list of layers anymore. Drop it so that @@ -3899,7 +3607,9 @@ impl Timeline { || contains_hole { // ... 
if so, flush previous layer and prepare to write new one - new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); + new_layers.push(Arc::new( + writer.take().unwrap().finish(prev_key.unwrap().next())?, + )); writer = None; if contains_hole { @@ -3937,7 +3647,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next())?); + new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?)); } // Sync layers @@ -3948,7 +3658,7 @@ impl Timeline { // minimize latency. par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; - par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + par_fsync::par_fsync(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)]) .context("fsync of timeline dir")?; layer_paths.pop().unwrap(); @@ -4033,7 +3743,7 @@ impl Timeline { } // Before deleting any layers, we need to wait for their upload ops to finish. - // See storage_sync module level comment on consistency. + // See remote_timeline_client module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { debug!("waiting for upload ops to complete"); @@ -4044,9 +3754,16 @@ impl Timeline { } let mut guard = self.layers.write().await; - let (layers, mapping) = &mut *guard; - let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); + + // In some rare cases, we may generate a file with exactly the same key range / LSN as before the compaction. + // We should move to numbering the layer files instead of naming them using key range / LSN some day. But for + // now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map. + let mut duplicated_layers = HashSet::new(); + + let mut insert_layers = Vec::new(); + let mut remove_layers = Vec::new(); + for l in new_layers { let new_delta_path = l.path(); @@ -4070,26 +3787,42 @@ impl Timeline { .add(metadata.len()); new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); - let x: Arc = Arc::new(l); - x.access_stats().record_residence_event( - &updates, + l.access_stats().record_residence_event( + &guard, LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - self.insert_historic_layer(x, &mut updates, mapping); + let l = l as Arc; + if guard.contains(&l) { + duplicated_layers.insert(l.layer_desc().key()); + } else { + if LayerMap::is_l0(l.layer_desc()) { + return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + } + insert_layers.push(l); + } } // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact { - layer_names_to_delete.push(l.filename()); - // NB: the layer file identified by descriptor `l` is guaranteed to be present - // in the LayerFileManager because we kept holding `layer_removal_cs` the entire - // time, even though we dropped `Timeline::layers` inbetween. - self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?; + for ldesc in deltas_to_compact { + if duplicated_layers.contains(&ldesc.key()) { + // skip duplicated layers, they will not be removed; we have already overwritten them + // with new layers in the compaction phase 1. 
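The new compaction bookkeeping above guards against the rare case where phase 1 emits a layer with exactly the same key range/LSN (and hence the same map key) as one of its inputs: such keys go into a `HashSet` and are neither re-inserted nor deleted. A compact model of that bookkeeping, with layer identity reduced to a string key:

```rust
use std::collections::HashSet;

/// Decide which layers to insert, which to remove, and which are duplicates
/// that must be left untouched in the layer map.
fn plan_layer_map_update(
    existing_keys: &HashSet<String>,
    compact_from: &[String],
    compact_to: &[String],
) -> (Vec<String>, Vec<String>, HashSet<String>) {
    let mut duplicated = HashSet::new();
    let mut insert = Vec::new();

    for new_key in compact_to {
        if existing_keys.contains(new_key) {
            // Same key range/LSN as an existing file: keep the existing entry untouched.
            duplicated.insert(new_key.clone());
        } else {
            insert.push(new_key.clone());
        }
    }

    let remove = compact_from
        .iter()
        .filter(|old_key| !duplicated.contains(*old_key))
        .cloned()
        .collect();

    (insert, remove, duplicated)
}

fn main() {
    let existing: HashSet<String> = ["L0-a", "L0-b"].into_iter().map(String::from).collect();
    let from = vec!["L0-a".to_string(), "L0-b".to_string()];
    // Phase 1 re-produced "L0-b" byte-for-byte and created a fresh "L1-c".
    let to = vec!["L0-b".to_string(), "L1-c".to_string()];

    let (insert, remove, duplicated) = plan_layer_map_update(&existing, &from, &to);
    println!("insert: {insert:?}, remove: {remove:?}, duplicated: {duplicated:?}");
}
```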
+ continue; + } + layer_names_to_delete.push(ldesc.filename()); + remove_layers.push(guard.get_from_desc(&ldesc)); } - updates.flush(); + + guard.finish_compact_l0( + layer_removal_cs, + remove_layers, + insert_layers, + &self.metrics, + )?; + drop_wlock(guard); // Also schedule the deletions in remote storage @@ -4234,7 +3967,7 @@ impl Timeline { new_gc_cutoff, ) .instrument( - info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), + info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff), ) .await?; @@ -4308,7 +4041,7 @@ impl Timeline { // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; - let (layers, mapping) = &mut *guard; + let layers = guard.layer_map(); 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -4316,8 +4049,8 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename().file_name(), - horizon_cutoff + l.filename(), + horizon_cutoff, ); result.layers_needed_by_cutoff += 1; continue 'outer; @@ -4327,8 +4060,8 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename().file_name(), - pitr_cutoff + l.filename(), + pitr_cutoff, ); result.layers_needed_by_pitr += 1; continue 'outer; @@ -4346,7 +4079,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename().file_name(), + l.filename(), retain_lsn, l.is_incremental(), ); @@ -4377,10 +4110,7 @@ impl Timeline { if !layers .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? { - debug!( - "keeping {} because it is the latest layer", - l.filename().file_name() - ); + debug!("keeping {} because it is the latest layer", l.filename()); // Collect delta key ranges that need image layers to allow garbage // collecting the layers. // It is not so obvious whether we need to propagate information only about @@ -4397,7 +4127,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename().file_name(), + l.filename(), l.is_incremental(), ); layers_to_remove.push(Arc::clone(&l)); @@ -4407,7 +4137,6 @@ impl Timeline { .unwrap() .replace((new_gc_cutoff, wanted_image_layers.to_keyspace())); - let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. @@ -4417,18 +4146,15 @@ impl Timeline { // (couldn't do this in the loop above, because you cannot modify a collection // while iterating it. BTreeMap::retain() would be another option) let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); - { - for doomed_layer in layers_to_remove { - layer_names_to_delete.push(doomed_layer.filename()); - self.delete_historic_layer( - layer_removal_cs.clone(), - doomed_layer, - &mut updates, - mapping, - )?; // FIXME: schedule succeeded deletions before returning? 
- result.layers_removed += 1; - } + let gc_layers = layers_to_remove + .iter() + .map(|x| guard.get_from_desc(x)) + .collect(); + for doomed_layer in layers_to_remove { + layer_names_to_delete.push(doomed_layer.filename()); + result.layers_removed += 1; } + let apply = guard.finish_gc_timeline(layer_removal_cs, gc_layers, &self.metrics)?; if result.layers_removed != 0 { fail_point!("after-timeline-gc-removed-layers"); @@ -4437,8 +4163,9 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } + + apply.flush(); } - updates.flush(); info!( "GC completed removing {} layers, cutoff {}", @@ -4551,12 +4278,12 @@ impl Timeline { /// If the caller has a deadline or needs a timeout, they can simply stop polling: /// we're **cancellation-safe** because the download happens in a separate task_mgr task. /// So, the current download attempt will run to completion even if we stop polling. - #[instrument(skip_all, fields(layer=%remote_layer.short_id()))] + #[instrument(skip_all, fields(layer=%remote_layer))] pub async fn download_remote_layer( &self, remote_layer: Arc, ) -> anyhow::Result<()> { - debug_assert_current_span_has_tenant_and_timeline_id(); + span::debug_assert_current_span_has_tenant_and_timeline_id(); use std::sync::atomic::Ordering::Relaxed; @@ -4589,7 +4316,7 @@ impl Timeline { TaskKind::RemoteDownloadTask, Some(self.tenant_id), Some(self.timeline_id), - &format!("download layer {}", remote_layer.short_id()), + &format!("download layer {}", remote_layer), false, async move { let remote_client = self_clone.remote_client.as_ref().unwrap(); @@ -4610,13 +4337,11 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. 
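`finish_gc_timeline` above returns a guard wrapping the pending layer-map updates, so the caller can schedule the remote deletions first and only then `flush()` the in-memory changes. A minimal illustration of that "stage now, apply later" shape with placeholder types:

```rust
struct PendingUpdates {
    removed: Vec<String>,
}

impl PendingUpdates {
    fn flush(self) {
        println!("applying {} layer-map removals", self.removed.len());
    }
}

// Analogous to `ApplyGcResultGuard`: the updates are staged, not yet applied.
struct ApplyGcGuard(PendingUpdates);

impl ApplyGcGuard {
    fn flush(self) {
        self.0.flush();
    }
}

fn finish_gc(doomed_layers: Vec<String>) -> ApplyGcGuard {
    ApplyGcGuard(PendingUpdates { removed: doomed_layers })
}

fn schedule_remote_deletion(names: &[String]) {
    for n in names {
        println!("scheduling remote deletion of {n}");
    }
}

fn main() {
    let doomed = vec!["old-delta-1".to_string(), "old-image-2".to_string()];
    let apply = finish_gc(doomed.clone());
    // Remote deletions are queued while the in-memory changes are still staged ...
    schedule_remote_deletion(&doomed);
    // ... and only then do the layer-map updates become visible.
    apply.flush();
}
```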
let mut guard = self_clone.layers.write().await; - let (layers, mapping) = &mut *guard; - let updates = layers.batch_update(); let new_layer = - remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); + remote_layer.create_downloaded_layer(&guard, self_clone.conf, *size); { let l: Arc = remote_layer.clone(); - let failure = match mapping.replace_and_verify(l, new_layer) { + let failure = match guard.replace_and_verify(l, new_layer) { Ok(()) => false, Err(e) => { // this is a precondition failure, the layer filename derived @@ -4644,7 +4369,6 @@ impl Timeline { .store(true, Relaxed); } } - updates.flush(); drop_wlock(guard); info!("on-demand download successful"); @@ -4723,7 +4447,7 @@ impl Timeline { }; Ok(()) } - .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id)) + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id)) ); let initial_info = DownloadRemoteLayersTaskInfo { @@ -4745,10 +4469,10 @@ impl Timeline { let mut downloads = Vec::new(); { let guard = self.layers.read().await; - let (layers, mapping) = &*guard; + let layers = guard.layer_map(); layers .iter_historic_layers() - .map(|l| mapping.get_from_desc(&l)) + .map(|l| guard.get_from_desc(&l)) .filter_map(|l| l.downcast_remote_layer()) .map(|l| self.download_remote_layer(l)) .for_each(|dl| downloads.push(dl)) @@ -4850,7 +4574,7 @@ impl LocalLayerInfoForDiskUsageEviction { impl Timeline { pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read().await; - let (layers, mapping) = &*guard; + let layers = guard.layer_map(); let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); @@ -4859,21 +4583,18 @@ impl Timeline { let file_size = l.file_size(); max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let l = mapping.get_from_desc(&l); + let l = guard.get_from_desc(&l); if l.is_remote_layer() { continue; } - let last_activity_ts = l - .access_stats() - .latest_activity() - .unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. - debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); + let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| { + // We only use this fallback if there's an implementation error. + // `latest_activity` already does rate-limited warn!() log. 
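Both eviction and download completion funnel through `replace_and_verify`, which the removed `LayerFileManager` code earlier in this diff implements as three checks: same descriptor key, same L0-ness, and the slot must still hold the exact `Arc` we expect before it is swapped. A boiled-down version of those checks, using `Arc::ptr_eq` in place of the crate's vtable-insensitive pointer comparison:

```rust
use std::collections::HashMap;
use std::sync::Arc;

#[derive(Debug)]
struct Layer {
    key: String,
    is_l0: bool,
}

fn replace_and_verify(
    slots: &mut HashMap<String, Arc<Layer>>,
    expected: Arc<Layer>,
    new: Arc<Layer>,
) -> Result<(), String> {
    if expected.key != new.key {
        return Err(format!("different keys: {} != {}", expected.key, new.key));
    }
    if expected.is_l0 != new.is_l0 {
        return Err("one layer is L0 while the other is not".to_string());
    }
    match slots.get_mut(&expected.key) {
        // The slot must still contain the very object we planned to replace.
        Some(slot) if Arc::ptr_eq(slot, &expected) => {
            *slot = new;
            Ok(())
        }
        Some(_) => Err("another layer was found instead of the expected one".to_string()),
        None => Err("layer was not found".to_string()),
    }
}

fn main() {
    let old = Arc::new(Layer { key: "k".into(), is_l0: false });
    let mut slots = HashMap::from([("k".to_string(), old.clone())]);

    let downloaded = Arc::new(Layer { key: "k".into(), is_l0: false });
    replace_and_verify(&mut slots, old, downloaded).expect("swap should succeed");
    println!("slot now holds: {:?}", slots["k"]);
}
```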
+ debug!(layer=%l, "last_activity returns None, using SystemTime::now"); + SystemTime::now() + }); resident_layers.push(LocalLayerInfoForDiskUsageEviction { layer: l, @@ -4993,33 +4714,6 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } -#[cfg(not(debug_assertions))] -#[inline] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} - -#[cfg(debug_assertions)] -#[inline] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { - use utils::tracing_span_assert; - - pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy< - tracing_span_assert::MultiNameExtractor<2>, - > = once_cell::sync::Lazy::new(|| { - tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]) - }); - - match tracing_span_assert::check_fields_present([ - &*super::TENANT_ID_EXTRACTOR, - &*TIMELINE_ID_EXTRACTOR, - ]) { - Ok(()) => (), - Err(missing) => panic!( - "missing extractors: {:?}", - missing.into_iter().map(|e| e.name()).collect::>() - ), - } -} - /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. /// /// Returns `true` if the two `Arc` point to the same layer, false otherwise. diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 03cf2d89ad..80146419df 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -70,7 +70,6 @@ impl Timeline { }; self_clone.eviction_task(cancel).await; - info!("eviction task finishing"); Ok(()) }, ); @@ -78,6 +77,9 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { + scopeguard::defer! { + info!("eviction task finishing"); + } use crate::tenant::tasks::random_init_delay; { let policy = self.get_eviction_policy(); @@ -86,7 +88,6 @@ impl Timeline { EvictionPolicy::NoEviction => Duration::from_secs(10), }; if random_init_delay(period, &cancel).await.is_err() { - info!("shutting down"); return; } } @@ -101,7 +102,6 @@ impl Timeline { ControlFlow::Continue(sleep_until) => { tokio::select! { _ = cancel.cancelled() => { - info!("shutting down"); break; } _ = tokio::time::sleep_until(sleep_until) => { } @@ -198,10 +198,10 @@ impl Timeline { // So, we just need to deal with this. let candidates: Vec> = { let guard = self.layers.read().await; - let (layers, mapping) = &*guard; + let layers = guard.layer_map(); let mut candidates = Vec::new(); for hist_layer in layers.iter_historic_layers() { - let hist_layer = mapping.get_from_desc(&hist_layer); + let hist_layer = guard.get_from_desc(&hist_layer); if hist_layer.is_remote_layer() { continue; } @@ -209,7 +209,7 @@ impl Timeline { let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { // We only use this fallback if there's an implementation error. // `latest_activity` already does rate-limited warn!() log. 
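In `eviction_task.rs` above, the per-branch "shutting down"/"finishing" log statements collapse into a single `scopeguard::defer!` at the top of the task, which runs on every exit path. A small sketch of that pattern; it assumes the `scopeguard` crate, which the diff itself uses here, and plain `println!` in place of `tracing::info!`:

```rust
fn eviction_task_sketch(iterations: u32) {
    // Runs when the function returns, no matter which branch exits first.
    scopeguard::defer! {
        println!("eviction task finishing");
    }

    for i in 0..iterations {
        if i == 2 {
            // Early return: the deferred message is still printed.
            return;
        }
        println!("eviction iteration {i}");
    }
}

fn main() {
    eviction_task_sketch(5);
}
```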
- debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now"); + debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now"); SystemTime::now() }); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs new file mode 100644 index 0000000000..77f5f38314 --- /dev/null +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -0,0 +1,378 @@ +use anyhow::{bail, ensure, Context, Result}; +use std::{collections::HashMap, sync::Arc}; +use tracing::trace; +use utils::{ + id::{TenantId, TimelineId}, + lsn::{AtomicLsn, Lsn}, +}; + +use crate::{ + config::PageServerConf, + metrics::TimelineMetrics, + tenant::{ + layer_map::{BatchedUpdates, LayerMap}, + storage_layer::{ + AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer, + PersistentLayerDesc, PersistentLayerKey, RemoteLayer, + }, + timeline::compare_arced_layers, + }, +}; + +/// Provides semantic APIs to manipulate the layer map. +pub struct LayerManager { + layer_map: LayerMap, + layer_fmgr: LayerFileManager, +} + +/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after +/// scheduling deletes in remote client. +pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>); + +impl ApplyGcResultGuard<'_> { + pub fn flush(self) { + self.0.flush(); + } +} + +impl LayerManager { + pub fn create() -> Self { + Self { + layer_map: LayerMap::default(), + layer_fmgr: LayerFileManager::new(), + } + } + + pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc { + self.layer_fmgr.get_from_desc(desc) + } + + /// Get an immutable reference to the layer map. + /// + /// We expect users only to be able to get an immutable layer map. If users want to make modifications, + /// they should use the below semantic APIs. This design makes us step closer to immutable storage state. + pub fn layer_map(&self) -> &LayerMap { + &self.layer_map + } + + /// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer` + /// gets a refactor. + pub fn layer_map_mut(&mut self) -> &mut LayerMap { + &mut self.layer_map + } + + /// Replace layers in the layer file manager, used in evictions and layer downloads. + pub fn replace_and_verify( + &mut self, + expected: Arc, + new: Arc, + ) -> Result<()> { + self.layer_fmgr.replace_and_verify(expected, new) + } + + /// Called from `load_layer_map`. Initialize the layer manager with: + /// 1. all on-disk layers + /// 2. next open layer (with disk disk_consistent_lsn LSN) + pub fn initialize_local_layers( + &mut self, + on_disk_layers: Vec>, + next_open_layer_at: Lsn, + ) { + let mut updates = self.layer_map.batch_update(); + for layer in on_disk_layers { + Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + self.layer_map.next_open_layer_at = Some(next_open_layer_at); + } + + /// Initialize when creating a new timeline, called in `init_empty_layer_map`. 
+ pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) { + self.layer_map.next_open_layer_at = Some(next_open_layer_at); + } + + pub fn initialize_remote_layers( + &mut self, + corrupted_local_layers: Vec>, + remote_layers: Vec>, + ) { + let mut updates = self.layer_map.batch_update(); + for layer in corrupted_local_layers { + Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr); + } + for layer in remote_layers { + Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + } + + /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, + /// called within `get_layer_for_write`. + pub fn get_layer_for_write( + &mut self, + lsn: Lsn, + last_record_lsn: Lsn, + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + ) -> Result> { + ensure!(lsn.is_aligned()); + + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}", + lsn, + last_record_lsn, + std::backtrace::Backtrace::force_capture(), + ); + + // Do we have a layer open for writing already? + let layer = if let Some(open_layer) = &self.layer_map.open_layer { + if open_layer.get_lsn_range().start > lsn { + bail!( + "unexpected open layer in the future: open layers starts at {}, write lsn {}", + open_layer.get_lsn_range().start, + lsn + ); + } + + Arc::clone(open_layer) + } else { + // No writeable layer yet. Create one. + let start_lsn = self + .layer_map + .next_open_layer_at + .context("No next open layer found")?; + + trace!( + "creating in-memory layer at {}/{} for record at {}", + timeline_id, + start_lsn, + lsn + ); + + let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?; + let layer = Arc::new(new_layer); + + self.layer_map.open_layer = Some(layer.clone()); + self.layer_map.next_open_layer_at = None; + + layer + }; + + Ok(layer) + } + + /// Called from `freeze_inmem_layer`, returns true if successfully frozen. + pub fn try_freeze_in_memory_layer( + &mut self, + Lsn(last_record_lsn): Lsn, + last_freeze_at: &AtomicLsn, + ) { + let end_lsn = Lsn(last_record_lsn + 1); + + if let Some(open_layer) = &self.layer_map.open_layer { + let open_layer_rc = Arc::clone(open_layer); + // Does this layer need freezing? + open_layer.freeze(end_lsn); + + // The layer is no longer open, update the layer map to reflect this. + // We will replace it with on-disk historics below. + self.layer_map.frozen_layers.push_back(open_layer_rc); + self.layer_map.open_layer = None; + self.layer_map.next_open_layer_at = Some(end_lsn); + last_freeze_at.store(end_lsn); + } + } + + /// Add image layers to the layer map, called from `create_image_layers`. + pub fn track_new_image_layers(&mut self, image_layers: Vec) { + let mut updates = self.layer_map.batch_update(); + for layer in image_layers { + Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + } + + /// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`. + pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc) { + let mut updates = self.layer_map.batch_update(); + Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr); + updates.flush(); + } + + /// Called when compaction is completed. 
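A short sketch of the write path built from the two methods above, `get_layer_for_write` followed by `try_freeze_in_memory_layer`; the helper and its parameters are hypothetical and would normally be driven by the owning `Timeline`:

fn write_and_freeze(
    layers: &mut LayerManager,
    lsn: Lsn,
    last_record_lsn: Lsn,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_id: TenantId,
    last_freeze_at: &AtomicLsn,
) -> anyhow::Result<()> {
    // Returns the current open in-memory layer, creating one at next_open_layer_at if needed.
    let open_layer =
        layers.get_layer_for_write(lsn, last_record_lsn, conf, timeline_id, tenant_id)?;
    // ... append the WAL record to `open_layer` here ...
    drop(open_layer);
    // Freeze at the LSN just written; the next write opens a fresh layer starting at lsn + 1.
    layers.try_freeze_in_memory_layer(lsn, last_freeze_at);
    Ok(())
}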
+ pub fn finish_compact_l0( + &mut self, + layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>, + compact_from: Vec<Arc<dyn PersistentLayer>>, + compact_to: Vec<Arc<dyn PersistentLayer>>, + metrics: &TimelineMetrics, + ) -> Result<()> { + let mut updates = self.layer_map.batch_update(); + for l in compact_to { + Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr); + } + for l in compact_from { + // NB: the layer file identified by descriptor `l` is guaranteed to be present + // in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire + // time, even though we dropped `Timeline::layers` in between. + Self::delete_historic_layer( + layer_removal_cs.clone(), + l, + &mut updates, + metrics, + &mut self.layer_fmgr, + )?; + } + updates.flush(); + Ok(()) + } + + /// Called when garbage collecting the timeline. Returns a guard that will apply the updates to the layer map. + pub fn finish_gc_timeline( + &mut self, + layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>, + gc_layers: Vec<Arc<dyn PersistentLayer>>, + metrics: &TimelineMetrics, + ) -> Result<ApplyGcResultGuard> { + let mut updates = self.layer_map.batch_update(); + for doomed_layer in gc_layers { + Self::delete_historic_layer( + layer_removal_cs.clone(), + doomed_layer, + &mut updates, + metrics, + &mut self.layer_fmgr, + )?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch? + } + Ok(ApplyGcResultGuard(updates)) + } + + /// Helper function to insert a layer into the layer map and file manager. + fn insert_historic_layer( + layer: Arc<dyn PersistentLayer>, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, + ) { + updates.insert_historic(layer.layer_desc().clone()); + mapping.insert(layer); + } + + /// Helper function to remove a layer from the layer map and file manager. + fn remove_historic_layer( + layer: Arc<dyn PersistentLayer>, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, + ) { + updates.remove_historic(layer.layer_desc().clone()); + mapping.remove(layer); + } + + /// Removes the layer from local FS (if present) and from memory. + /// Remote storage is not affected by this operation. + fn delete_historic_layer( + // we cannot remove layers otherwise, since gc and compaction will race + _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>, + layer: Arc<dyn PersistentLayer>, + updates: &mut BatchedUpdates<'_>, + metrics: &TimelineMetrics, + mapping: &mut LayerFileManager, + ) -> anyhow::Result<()> { + if !layer.is_remote_layer() { + layer.delete_resident_layer_file()?; + let layer_file_size = layer.file_size(); + metrics.resident_physical_size_gauge.sub(layer_file_size); + } + + // TODO Removing from the bottom of the layer map is expensive. + // Maybe instead discard all layer map historic versions that + // won't be needed for page reconstruction for this timeline, + // and mark what we can't delete yet as deleted from the layer + // map index without actually rebuilding the index. + updates.remove_historic(layer.layer_desc().clone()); + mapping.remove(layer); + + Ok(()) + } + + pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool { + self.layer_fmgr.contains(layer) + } +} + +pub struct LayerFileManager( + HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>, +); + +impl LayerFileManager { + fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
+ self.0 + .get(&desc.key()) + .with_context(|| format!("get layer from desc: {}", desc.filename())) + .expect("not found") + .clone() + } + + pub(crate) fn insert(&mut self, layer: Arc) { + let present = self.0.insert(layer.layer_desc().key(), layer.clone()); + if present.is_some() && cfg!(debug_assertions) { + panic!("overwriting a layer: {:?}", layer.layer_desc()) + } + } + + pub(crate) fn contains(&self, layer: &Arc) -> bool { + self.0.contains_key(&layer.layer_desc().key()) + } + + pub(crate) fn new() -> Self { + Self(HashMap::new()) + } + + pub(crate) fn remove(&mut self, layer: Arc) { + let present = self.0.remove(&layer.layer_desc().key()); + if present.is_none() && cfg!(debug_assertions) { + panic!( + "removing layer that is not present in layer mapping: {:?}", + layer.layer_desc() + ) + } + } + + pub(crate) fn replace_and_verify(&mut self, expected: Arc, new: Arc) -> Result<()> { + let key = expected.layer_desc().key(); + let other = new.layer_desc().key(); + + let expected_l0 = LayerMap::is_l0(expected.layer_desc()); + let new_l0 = LayerMap::is_l0(new.layer_desc()); + + fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!( + "layermap-replace-notfound" + )); + + anyhow::ensure!( + key == other, + "expected and new layer have different keys: {key:?} != {other:?}" + ); + + anyhow::ensure!( + expected_l0 == new_l0, + "one layer is l0 while the other is not: {expected_l0} != {new_l0}" + ); + + if let Some(layer) = self.0.get_mut(&key) { + anyhow::ensure!( + compare_arced_layers(&expected, layer), + "another layer was found instead of expected, expected={expected:?}, new={new:?}", + expected = Arc::as_ptr(&expected), + new = Arc::as_ptr(layer), + ); + *layer = new; + Ok(()) + } else { + anyhow::bail!("layer was not found"); + } + } +} diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs new file mode 100644 index 0000000000..d9c2bc4cb9 --- /dev/null +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -0,0 +1,128 @@ +use anyhow::Context; +use once_cell::sync::OnceCell; + +use tokio::sync::Semaphore; +use utils::lsn::Lsn; + +use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; +use std::sync::Arc; + +/// Internal structure to hold all data needed for logical size calculation. +/// +/// Calculation consists of two stages: +/// +/// 1. Initial size calculation. That might take a long time, because it requires +/// reading all layers containing relation sizes at `initial_part_end`. +/// +/// 2. Collecting an incremental part and adding that to the initial size. +/// Increments are appended on walreceiver writing new timeline data, +/// which result in increase or decrease of the logical size. +pub(super) struct LogicalSize { + /// Size, potentially slow to compute. Calculating this might require reading multiple + /// layers, and even ancestor's layers. + /// + /// NOTE: size at a given LSN is constant, but after a restart we will calculate + /// the initial size at a different LSN. + pub initial_logical_size: OnceCell, + + /// Semaphore to track ongoing calculation of `initial_logical_size`. + pub initial_size_computation: Arc, + + /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. + pub initial_part_end: Option, + + /// All other size changes after startup, combined together. + /// + /// Size shouldn't ever be negative, but this is signed for two reasons: + /// + /// 1. 
If we initialized the "baseline" size lazily, while we already + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) + /// + /// 2. If there is a bug and we e.g. forget to increment it in some cases + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. + /// + /// Note that we also expose a copy of this value as a prometheus metric, + /// see `current_logical_size_gauge`. Use the `update_current_logical_size` + /// to modify this, it will also keep the prometheus metric in sync. + pub size_added_after_initial: AtomicI64, +} + +/// Normalized current size, that the data in pageserver occupies. +#[derive(Debug, Clone, Copy)] +pub(super) enum CurrentLogicalSize { + /// The size is not yet calculated to the end, this is an intermediate result, + /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative, + /// yet total logical size cannot be below 0. + Approximate(u64), + // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are + // available for observation without any calculations. + Exact(u64), +} + +impl CurrentLogicalSize { + pub(super) fn size(&self) -> u64 { + *match self { + Self::Approximate(size) => size, + Self::Exact(size) => size, + } + } +} + +impl LogicalSize { + pub(super) fn empty_initial() -> Self { + Self { + initial_logical_size: OnceCell::with_value(0), + // initial_logical_size already computed, so, don't admit any calculations + initial_size_computation: Arc::new(Semaphore::new(0)), + initial_part_end: None, + size_added_after_initial: AtomicI64::new(0), + } + } + + pub(super) fn deferred_initial(compute_to: Lsn) -> Self { + Self { + initial_logical_size: OnceCell::new(), + initial_size_computation: Arc::new(Semaphore::new(1)), + initial_part_end: Some(compute_to), + size_added_after_initial: AtomicI64::new(0), + } + } + + pub(super) fn current_size(&self) -> anyhow::Result { + let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); + // ^^^ keep this type explicit so that the casts in this function break if + // we change the type. + match self.initial_logical_size.get() { + Some(initial_size) => { + initial_size.checked_add_signed(size_increment) + .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) + .map(CurrentLogicalSize::Exact) + } + None => { + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); + Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) + } + } + } + + pub(super) fn increment_size(&self, delta: i64) { + self.size_added_after_initial + .fetch_add(delta, AtomicOrdering::SeqCst); + } + + /// Make the value computed by initial logical size computation + /// available for re-use. This doesn't contain the incremental part. 
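To make the Approximate/Exact split concrete, here is a hedged sketch of `current_size` behaviour, written as a unit test inside this module (the concrete numbers are made up):

#[test]
fn logical_size_combines_base_and_delta() {
    let size = LogicalSize::deferred_initial(Lsn(0x1000));
    size.increment_size(8192); // walreceiver ingested one 8 KiB page
    // Initial size not yet computed: only the delta is visible, reported as Approximate.
    assert!(matches!(
        size.current_size().unwrap(),
        CurrentLogicalSize::Approximate(8192)
    ));

    size.initial_logical_size.set(1_000_000).unwrap(); // initial calculation finished
    size.increment_size(-4096); // a later record shrank a relation
    // Base known: base plus the accumulated delta (8192 - 4096) is reported as Exact.
    assert!(matches!(
        size.current_size().unwrap(),
        CurrentLogicalSize::Exact(1_004_096)
    ));
}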
+ pub(super) fn initialized_size(&self, lsn: Lsn) -> Option { + match self.initial_part_end { + Some(v) if v == lsn => self.initial_logical_size.get().copied(), + _ => None, + } + } +} diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs new file mode 100644 index 0000000000..3b580c9d1b --- /dev/null +++ b/pageserver/src/tenant/timeline/span.rs @@ -0,0 +1,20 @@ +#[cfg(debug_assertions)] +use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor}; + +#[cfg(not(debug_assertions))] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} + +#[cfg(debug_assertions)] +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy> = + once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"])); + + let fields: [&dyn Extractor; 2] = [ + &*crate::tenant::span::TENANT_ID_EXTRACTOR, + &*TIMELINE_ID_EXTRACTOR, + ]; + if let Err(missing) = check_fields_present!(fields) { + panic!("missing extractors: {missing:?}") + } +} diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs new file mode 100644 index 0000000000..b8cc65f4b1 --- /dev/null +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -0,0 +1,219 @@ +use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc}; + +use anyhow::Context; +use tracing::{error, info, info_span, warn}; +use utils::{crashsafe, id::TimelineId, lsn::Lsn}; + +use crate::{ + context::RequestContext, + import_datadir, + tenant::{ignore_absent_files, Tenant}, +}; + +use super::Timeline; + +/// A timeline with some of its files on disk, being initialized. +/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or +/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory +/// to be removed on next restart. +/// +/// The caller is responsible for proper timeline data filling before the final init. +#[must_use] +pub struct UninitializedTimeline<'t> { + pub(crate) owning_tenant: &'t Tenant, + timeline_id: TimelineId, + raw_timeline: Option<(Arc, TimelineUninitMark)>, +} + +impl<'t> UninitializedTimeline<'t> { + pub(crate) fn new( + owning_tenant: &'t Tenant, + timeline_id: TimelineId, + raw_timeline: Option<(Arc, TimelineUninitMark)>, + ) -> Self { + Self { + owning_tenant, + timeline_id, + raw_timeline, + } + } + + /// Finish timeline creation: insert it into the Tenant's timelines map and remove the + /// uninit mark file. + /// + /// This function launches the flush loop if not already done. + /// + /// The caller is responsible for activating the timeline (function `.activate()`). 
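`UninitializedTimeline` and `TimelineUninitMark` above are drop guards: cleanup runs in `Drop` unless the happy path defuses the guard first. A minimal generic sketch of that pattern with standalone, hypothetical types (not the pageserver ones):

#[must_use]
struct CleanupOnDrop {
    defused: bool,
    dir: std::path::PathBuf,
}

impl CleanupOnDrop {
    // Success path: disarm the guard and keep the directory.
    fn finish(mut self) {
        self.defused = true;
    }
}

impl Drop for CleanupOnDrop {
    fn drop(&mut self) {
        if !self.defused {
            // Failure or early return: best-effort removal of the partially created directory.
            let _ = std::fs::remove_dir_all(&self.dir);
        }
    }
}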
+ pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> { + let timeline_id = self.timeline_id; + let tenant_id = self.owning_tenant.tenant_id; + + let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| { + format!("No timeline for initialization found for {tenant_id}/{timeline_id}") + })?; + + // Check that the caller initialized disk_consistent_lsn + let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn(); + anyhow::ensure!( + new_disk_consistent_lsn.is_valid(), + "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" + ); + + let mut timelines = self.owning_tenant.timelines.lock().unwrap(); + match timelines.entry(timeline_id) { + Entry::Occupied(_) => anyhow::bail!( + "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" + ), + Entry::Vacant(v) => { + uninit_mark.remove_uninit_mark().with_context(|| { + format!( + "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" + ) + })?; + v.insert(Arc::clone(&new_timeline)); + + new_timeline.maybe_spawn_flush_loop(); + } + } + + Ok(new_timeline) + } + + /// Prepares timeline data by loading it from the basebackup archive. + pub(crate) async fn import_basebackup_from_tar( + self, + copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), + base_lsn: Lsn, + broker_client: storage_broker::BrokerClientChannel, + ctx: &RequestContext, + ) -> anyhow::Result<Arc<Timeline>> { + let raw_timeline = self.raw_timeline()?; + + import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) + .await + .context("Failed to import basebackup")?; + + // Flush the new layer files to disk, before we make the timeline available to + // the outside world. + // + // Flush loop needs to be spawned in order to be able to flush. + raw_timeline.maybe_spawn_flush_loop(); + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + anyhow::bail!("failpoint before-checkpoint-new-timeline"); + }); + + raw_timeline + .freeze_and_flush() + .await + .context("Failed to flush after basebackup import")?; + + // All the data has been imported. Insert the Timeline into the tenant's timelines + // map and remove the uninit mark file. + let tl = self.finish_creation()?; + tl.activate(broker_client, None, ctx); + Ok(tl) + } + + pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> { + Ok(&self + .raw_timeline + .as_ref() + .with_context(|| { + format!( + "No raw timeline {}/{} found", + self.owning_tenant.tenant_id, self.timeline_id + ) + })?
+ .0) + } +} + +impl Drop for UninitializedTimeline<'_> { + fn drop(&mut self) { + if let Some((_, uninit_mark)) = self.raw_timeline.take() { + let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered(); + error!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(uninit_mark); + } + } +} + +pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { + let timeline_path = &uninit_mark.timeline_path; + match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + Ok(()) => { + info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + } + Err(e) => { + error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") + } + } + drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists +} + +/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, +/// or gets removed eventually. +/// +/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +#[must_use] +pub(crate) struct TimelineUninitMark { + uninit_mark_deleted: bool, + uninit_mark_path: PathBuf, + pub(crate) timeline_path: PathBuf, +} + +impl TimelineUninitMark { + pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { + Self { + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + } + } + + fn remove_uninit_mark(mut self) -> anyhow::Result<()> { + if !self.uninit_mark_deleted { + self.delete_mark_file_if_present()?; + } + + Ok(()) + } + + fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { + let uninit_mark_file = &self.uninit_mark_path; + let uninit_mark_parent = uninit_mark_file + .parent() + .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; + ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { + format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") + })?; + crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; + self.uninit_mark_deleted = true; + + Ok(()) + } +} + +impl Drop for TimelineUninitMark { + fn drop(&mut self) { + if !self.uninit_mark_deleted { + if self.timeline_path.exists() { + error!( + "Uninit mark {} is not removed, timeline {} stays uninitialized", + self.uninit_mark_path.display(), + self.timeline_path.display() + ) + } else { + // unblock later timeline creation attempts + warn!( + "Removing intermediate uninit mark file {}", + self.uninit_mark_path.display() + ); + if let Err(e) = self.delete_mark_file_if_present() { + error!("Failed to remove the uninit mark file: {e}") + } + } + } + } +} diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index fa23ae765d..57c09a4487 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -6,7 +6,7 @@ //! Current connection state is tracked too, to ensure it's not getting stale. //! //! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader, -//! then a [re]connection happens, if necessary. +//! then a (re)connection happens, if necessary. //! 
Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; @@ -266,7 +266,7 @@ pub struct ConnectionManagerStatus { impl ConnectionManagerStatus { /// Generates a string, describing current connection status in a form, suitable for logging. pub fn to_human_readable_string(&self) -> String { - let mut resulting_string = "WalReceiver status".to_string(); + let mut resulting_string = String::new(); match &self.existing_connection { Some(connection) => { if connection.has_processed_wal { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1c1fe87305..817a30247e 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -71,6 +71,8 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, node: NodeId, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -140,6 +142,9 @@ pub(super) async fn handle_walreceiver_connection( } Ok(()) } + // Enrich the log lines emitted by this closure with meaningful context. + // TODO: technically, this task outlives the surrounding function, so, the + // spans won't be properly nested. .instrument(tracing::info_span!("poller")), ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 98730a7637..bc250166ce 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -175,8 +175,8 @@ impl WalRedoManager for PostgresRedoManager { let mut img = base_img.map(|p| p.1); let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; - for i in 1..records.len() { - let rec_neon = can_apply_in_neon(&records[i].1); + for (i, record) in records.iter().enumerate().skip(1) { + let rec_neon = can_apply_in_neon(&record.1); if rec_neon != batch_neon { let result = if batch_neon { @@ -685,7 +685,7 @@ impl PostgresRedoManager { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. .close_fds() - .spawn_no_leak_child() + .spawn_no_leak_child(self.tenant_id) .map_err(|e| { Error::new( e.kind(), @@ -989,6 +989,7 @@ impl PostgresRedoManager { /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. struct NoLeakChild { + tenant_id: TenantId, child: Option, } @@ -1007,9 +1008,12 @@ impl DerefMut for NoLeakChild { } impl NoLeakChild { - fn spawn(command: &mut Command) -> io::Result { + fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result { let child = command.spawn()?; - Ok(NoLeakChild { child: Some(child) }) + Ok(NoLeakChild { + tenant_id, + child: Some(child), + }) } fn kill_and_wait(mut self) { @@ -1056,11 +1060,16 @@ impl Drop for NoLeakChild { Some(child) => child, None => return, }; + let tenant_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. 
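The `debug_assert_current_span_has_tenant_and_timeline_id()` call added above only passes when the caller has entered a span that carries both IDs. A hedged sketch of that calling convention, using a hypothetical function and mirroring the `#[instrument]` attributes used elsewhere in this diff:

#[tracing::instrument(skip_all, fields(tenant_id = %tenant_id, timeline_id = %timeline_id))]
async fn do_timeline_work(tenant_id: TenantId, timeline_id: TimelineId) {
    // The debug-only check finds both fields on the current span and does not panic.
    debug_assert_current_span_has_tenant_and_timeline_id();
    // ... actual work; every log line here is implicitly tagged with both IDs ...
}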
BACKGROUND_RUNTIME.spawn(async move { tokio::task::spawn_blocking(move || { + // Intentionally don't inherit the tracing context from whoever is dropping us. + // This thread here is going to outlive of our dropper. + let span = tracing::info_span!("walredo", %tenant_id); + let _entered = span.enter(); Self::kill_and_wait_impl(child); }) .await @@ -1069,12 +1078,12 @@ impl Drop for NoLeakChild { } trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self) -> io::Result; + fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result; } impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self) -> io::Result { - NoLeakChild::spawn(self) + fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result { + NoLeakChild::spawn(tenant_id, self) } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 606af9741f..4fdc7f8c82 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -34,7 +34,6 @@ #define PageStoreTrace DEBUG5 -#define MAX_RECONNECT_ATTEMPTS 5 #define RECONNECT_INTERVAL_USEC 1000000 bool connected = false; @@ -55,13 +54,15 @@ int32 max_cluster_size; char *page_server_connstring; char *neon_auth_token; -int n_unflushed_requests = 0; -int flush_every_n_requests = 8; int readahead_buffer_size = 128; +int flush_every_n_requests = 8; + +int n_reconnect_attempts = 0; +int max_reconnect_attempts = 60; bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; -static void pageserver_flush(void); +static bool pageserver_flush(void); static bool pageserver_connect(int elevel) @@ -232,16 +233,17 @@ pageserver_disconnect(void) } } -static void +static bool pageserver_send(NeonRequest * request) { StringInfoData req_buff; - int n_reconnect_attempts = 0; /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + neon_log(LOG, "pageserver_send disconnect bad connection"); pageserver_disconnect(); - + } req_buff = nm_pack_request(request); @@ -252,53 +254,36 @@ pageserver_send(NeonRequest * request) * See https://github.com/neondatabase/neon/issues/1138 * So try to reestablish connection in case of failure. */ - while (true) + if (!connected) { - if (!connected) + while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { - if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR)) - { - n_reconnect_attempts += 1; - pg_usleep(RECONNECT_INTERVAL_USEC); - continue; - } + n_reconnect_attempts += 1; + pg_usleep(RECONNECT_INTERVAL_USEC); } + n_reconnect_attempts = 0; + } - /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. In - * practice, our requests are small enough to always fit in the output and - * TCP buffer. - */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS) - { - neon_log(LOG, "failed to send page request (try to reconnect): %s", msg); - if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */ - pg_usleep(RECONNECT_INTERVAL_USEC); - n_reconnect_attempts += 1; - continue; - } - else - { - pageserver_disconnect(); - neon_log(ERROR, "failed to send page request: %s", msg); - } - } - break; + /* + * Send request. 
+ * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output and + * TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + pageserver_disconnect(); + neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); + pfree(msg); + pfree(req_buff.data); + return false; } pfree(req_buff.data); - n_unflushed_requests++; - - if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests) - pageserver_flush(); - if (message_level_is_interesting(PageStoreTrace)) { char *msg = nm_to_string((NeonMessage *) request); @@ -306,6 +291,7 @@ pageserver_send(NeonRequest * request) neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); } + return true; } static NeonResponse * @@ -340,16 +326,25 @@ pageserver_receive(void) } else if (rc == -1) { + neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); pageserver_disconnect(); resp = NULL; } else if (rc == -2) - neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn))); + { + char* msg = pchomp(PQerrorMessage(pageserver_conn)); + pageserver_disconnect(); + neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); + } else - neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc); + { + pageserver_disconnect(); + neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); + } } PG_CATCH(); { + neon_log(LOG, "pageserver_receive disconnect due to caught exception"); pageserver_disconnect(); PG_RE_THROW(); } @@ -359,21 +354,25 @@ pageserver_receive(void) } -static void +static bool pageserver_flush(void) { if (!connected) { neon_log(WARNING, "Tried to flush while disconnected"); } - else if (PQflush(pageserver_conn)) + else { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - - pageserver_disconnect(); - neon_log(ERROR, "failed to flush page requests: %s", msg); + if (PQflush(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + pageserver_disconnect(); + neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg); + pfree(msg); + return false; + } } - n_unflushed_requests = 0; + return true; } page_server_api api = { @@ -439,6 +438,14 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.max_reconnect_attempts", + "Maximal attempts to reconnect to pages server (with 1 second timeout)", + NULL, + &max_reconnect_attempts, + 10, 0, INT_MAX, + PGC_USERSET, + 0, + NULL, NULL, NULL); DefineCustomIntVariable("neon.readahead_buffer_size", "number of prefetches to buffer", "This buffer is used to hold and manage prefetched " diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 8257b90ac3..2889db49bc 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -145,9 +145,9 @@ extern char *nm_to_string(NeonMessage * msg); typedef struct { - void (*send) (NeonRequest * request); + bool (*send) (NeonRequest * request); NeonResponse *(*receive) (void); - void (*flush) (void); + bool (*flush) (void); } page_server_api; extern void prefetch_on_ps_disconnect(void); diff --git 
a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 79dec0881d..76d71dd94b 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -489,7 +489,8 @@ prefetch_wait_for(uint64 ring_index) if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) { - page_server->flush(); + if (!page_server->flush()) + return false; MyPState->ring_flush = MyPState->ring_unused; } @@ -666,7 +667,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * smaller than the current WAL insert/redo pointer, which is already * larger than this prefetch_lsn. So in any case, that would * invalidate this cache. - * + * * The best LSN to use for effective_request_lsn would be * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. */ @@ -677,7 +678,8 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - page_server->send((NeonRequest *) &request); + + while (!page_server->send((NeonRequest *) &request)); /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -687,6 +689,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; + prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -743,6 +746,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls prefetch_set_unused(ring_index); entry = NULL; } + } /* if we don't want the latest version, only accept requests with the exact same LSN */ else @@ -756,20 +760,23 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls } } - /* - * We received a prefetch for a page that was recently read and - * removed from the buffers. Remove that request from the buffers. - */ - else if (slot->status == PRFS_TAG_REMAINS) + if (entry != NULL) { - prefetch_set_unused(ring_index); - entry = NULL; - } - else - { - /* The buffered request is good enough, return that index */ - pgBufferUsage.prefetch.duplicates++; - return ring_index; + /* + * We received a prefetch for a page that was recently read and + * removed from the buffers. Remove that request from the buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + } + else + { + /* The buffered request is good enough, return that index */ + pgBufferUsage.prefetch.duplicates++; + return ring_index; + } } } @@ -859,8 +866,7 @@ page_server_request(void const *req) { NeonResponse* resp; do { - page_server->send((NeonRequest *) req); - page_server->flush(); + while (!page_server->send((NeonRequest *) req) || !page_server->flush()); MyPState->ring_flush = MyPState->ring_unused; consume_prefetch_responses(); resp = page_server->receive(); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 8d82de6dc4..765966092d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -2231,6 +2231,18 @@ HandleSafekeeperResponse(void) if (n_synced >= quorum) { /* All safekeepers synced! */ + + /* + * Send an empty message to broadcast the latest truncateLsn to all safekeepers. + * This helps the next sync-safekeepers finish earlier, by skipping the recovery + * step. + * + * We don't need to wait for a response because it doesn't affect correctness, + * and TCP should be able to deliver the message to the safekeepers as long as + * the network is working properly.
+ */ + BroadcastAppendRequest(); + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); exit(0); } diff --git a/poetry.lock b/poetry.lock index aa76ac6dbd..b22a6a5bc9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "aiohttp" version = "3.7.4" description = "Async http client/server framework (asyncio)" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -62,7 +61,6 @@ speedups = ["aiodns", "brotlipy", "cchardet"] name = "aiopg" version = "1.3.4" description = "Postgres integration with asyncio." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -81,7 +79,6 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] name = "allure-pytest" version = "2.13.2" description = "Allure pytest integration" -category = "main" optional = false python-versions = "*" files = [ @@ -97,7 +94,6 @@ pytest = ">=4.5.0" name = "allure-python-commons" version = "2.13.2" description = "Common module for integrate allure with python-based frameworks" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -113,7 +109,6 @@ pluggy = ">=0.4.0" name = "async-timeout" version = "3.0.1" description = "Timeout context manager for asyncio programs" -category = "main" optional = false python-versions = ">=3.5.3" files = [ @@ -125,7 +120,6 @@ files = [ name = "asyncpg" version = "0.27.0" description = "An asyncio PostgreSQL driver" -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -176,7 +170,6 @@ test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"] name = "attrs" version = "21.4.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -194,7 +187,6 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" name = "aws-sam-translator" version = "1.48.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" -category = "main" optional = false python-versions = ">=3.7, <=4.0, !=4.0" files = [ @@ -204,7 +196,7 @@ files = [ ] [package.dependencies] -boto3 = ">=1.19.5,<2.0.0" +boto3 = ">=1.19.5,<2.dev0" jsonschema = ">=3.2,<4.0" [package.extras] @@ -214,7 +206,6 @@ dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage name = "aws-xray-sdk" version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." -category = "main" optional = false python-versions = "*" files = [ @@ -230,7 +221,6 @@ wrapt = "*" name = "backoff" version = "2.2.1" description = "Function decoration for backoff and retry" -category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -242,7 +232,6 @@ files = [ name = "black" version = "23.3.0" description = "The uncompromising code formatter." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -292,7 +281,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "boto3" version = "1.26.16" description = "The AWS SDK for Python" -category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -312,7 +300,6 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] name = "boto3-stubs" version = "1.26.16" description = "Type annotations for boto3 1.26.16 generated with mypy-boto3-builder 7.11.11" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -657,7 +644,6 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] name = "botocore" version = "1.29.16" description = "Low-level, data-driven core of boto 3." -category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -677,7 +663,6 @@ crt = ["awscrt (==0.14.0)"] name = "botocore-stubs" version = "1.27.38" description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -692,7 +677,6 @@ typing-extensions = ">=4.1.0" name = "certifi" version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -704,7 +688,6 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." -category = "main" optional = false python-versions = "*" files = [ @@ -781,7 +764,6 @@ pycparser = "*" name = "cfn-lint" version = "0.61.3" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" -category = "main" optional = false python-versions = ">=3.6, <=4.0, !=4.0" files = [ @@ -803,7 +785,6 @@ sarif-om = ">=1.0.4,<1.1.0" name = "chardet" version = "3.0.4" description = "Universal encoding detector for Python 2 and 3" -category = "main" optional = false python-versions = "*" files = [ @@ -815,7 +796,6 @@ files = [ name = "charset-normalizer" version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.6.0" files = [ @@ -830,7 +810,6 @@ unicode-backport = ["unicodedata2"] name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -845,7 +824,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.5" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -855,31 +833,34 @@ files = [ [[package]] name = "cryptography" -version = "41.0.0" +version = "41.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
-category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"}, - {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"}, - {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"}, - {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"}, - {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"}, - {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"}, - {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"}, - {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"}, - {file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"}, - {file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"}, - {file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"}, - {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"}, - {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"}, - {file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"}, - {file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"}, - {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"}, - {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"}, - {file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"}, - {file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"}, + {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"}, + {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"}, + {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"}, + {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"}, + {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"}, + {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"}, + {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"}, + {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"}, + {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"}, + {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"}, + {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"}, + {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"}, + {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"}, + {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"}, + {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"}, + {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"}, + {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"}, + {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"}, + {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"}, + {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"}, + {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"}, + {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"}, + {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"}, ] [package.dependencies] @@ -899,7 +880,6 @@ test-randomorder = ["pytest-randomly"] name = "docker" version = "4.2.2" description = "A Python library for the Docker Engine API." 
-category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -921,7 +901,6 @@ tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] name = "ecdsa" version = "0.18.0" description = "ECDSA cryptographic signature library (pure python)" -category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -940,7 +919,6 @@ gmpy2 = ["gmpy2"] name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -955,7 +933,6 @@ test = ["pytest (>=6)"] name = "execnet" version = "1.9.0" description = "execnet: rapid multi-Python deployment" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -970,7 +947,6 @@ testing = ["pre-commit"] name = "flask" version = "2.2.5" description = "A simple framework for building complex web applications." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -993,7 +969,6 @@ dotenv = ["python-dotenv"] name = "flask-cors" version = "3.0.10" description = "A Flask extension adding a decorator for CORS support" -category = "main" optional = false python-versions = "*" files = [ @@ -1009,7 +984,6 @@ Six = "*" name = "graphql-core" version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." -category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -1021,7 +995,6 @@ files = [ name = "idna" version = "3.3" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1033,7 +1006,6 @@ files = [ name = "importlib-metadata" version = "4.12.0" description = "Read metadata from Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1053,7 +1025,6 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "iniconfig" version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" -category = "main" optional = false python-versions = "*" files = [ @@ -1065,7 +1036,6 @@ files = [ name = "itsdangerous" version = "2.1.2" description = "Safely pass data to untrusted environments and back." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1077,7 +1047,6 @@ files = [ name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1095,7 +1064,6 @@ i18n = ["Babel (>=2.7)"] name = "jmespath" version = "1.0.1" description = "JSON Matching Expressions" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1107,7 +1075,6 @@ files = [ name = "jschema-to-python" version = "1.2.3" description = "Generate source code for Python classes from a JSON schema." 
-category = "main" optional = false python-versions = ">= 2.7" files = [ @@ -1124,7 +1091,6 @@ pbr = "*" name = "jsondiff" version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" -category = "main" optional = false python-versions = "*" files = [ @@ -1136,7 +1102,6 @@ files = [ name = "jsonpatch" version = "1.32" description = "Apply JSON-Patches (RFC 6902)" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -1151,7 +1116,6 @@ jsonpointer = ">=1.9" name = "jsonpickle" version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" -category = "main" optional = false python-versions = ">=2.7" files = [ @@ -1168,7 +1132,6 @@ testing-libs = ["simplejson", "ujson", "yajl"] name = "jsonpointer" version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1180,7 +1143,6 @@ files = [ name = "jsonschema" version = "3.2.0" description = "An implementation of JSON Schema validation for Python" -category = "main" optional = false python-versions = "*" files = [ @@ -1202,7 +1164,6 @@ format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-va name = "junit-xml" version = "1.9" description = "Creates JUnit XML test result documents that can be read by tools such as Jenkins" -category = "main" optional = false python-versions = "*" files = [ @@ -1217,7 +1178,6 @@ six = "*" name = "markupsafe" version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1267,7 +1227,6 @@ files = [ name = "moto" version = "4.1.2" description = "" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1328,7 +1287,6 @@ xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] name = "multidict" version = "6.0.4" description = "multidict implementation" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1412,7 +1370,6 @@ files = [ name = "mypy" version = "1.3.0" description = "Optional static typing for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1459,7 +1416,6 @@ reports = ["lxml"] name = "mypy-boto3-s3" version = "1.26.0.post1" description = "Type annotations for boto3.S3 1.26.0 service generated with mypy-boto3-builder 7.11.10" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1474,7 +1430,6 @@ typing-extensions = ">=4.1.0" name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
-category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -1486,7 +1441,6 @@ files = [ name = "networkx" version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1505,7 +1459,6 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] name = "openapi-schema-validator" version = "0.2.3" description = "OpenAPI schema validation for Python" -category = "main" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ @@ -1525,7 +1478,6 @@ strict-rfc3339 = ["strict-rfc3339"] name = "openapi-spec-validator" version = "0.4.0" description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" -category = "main" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ @@ -1546,7 +1498,6 @@ requests = ["requests"] name = "packaging" version = "23.0" description = "Core utilities for Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1558,7 +1509,6 @@ files = [ name = "pathspec" version = "0.9.0" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -1570,7 +1520,6 @@ files = [ name = "pbr" version = "5.9.0" description = "Python Build Reasonableness" -category = "main" optional = false python-versions = ">=2.6" files = [ @@ -1582,7 +1531,6 @@ files = [ name = "platformdirs" version = "2.5.2" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1598,7 +1546,6 @@ test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1614,7 +1561,6 @@ testing = ["pytest", "pytest-benchmark"] name = "prometheus-client" version = "0.14.1" description = "Python client for the Prometheus monitoring system." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1629,7 +1575,6 @@ twisted = ["twisted"] name = "psutil" version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." 
-category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1656,7 +1601,6 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "psycopg2-binary" version = "2.9.6" description = "psycopg2 - Python-PostgreSQL Database Adapter" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1728,7 +1672,6 @@ files = [ name = "pyasn1" version = "0.4.8" description = "ASN.1 types and codecs" -category = "main" optional = false python-versions = "*" files = [ @@ -1740,7 +1683,6 @@ files = [ name = "pycparser" version = "2.21" description = "C parser in Python" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1752,7 +1694,6 @@ files = [ name = "pyjwt" version = "2.4.0" description = "JSON Web Token implementation in Python" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1773,7 +1714,6 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "main" optional = false python-versions = ">=3.6.8" files = [ @@ -1788,7 +1728,6 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pypiwin32" version = "223" description = "" -category = "main" optional = false python-versions = "*" files = [ @@ -1803,7 +1742,6 @@ pywin32 = ">=223" name = "pyrsistent" version = "0.18.1" description = "Persistent/Functional/Immutable data structures" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1834,7 +1772,6 @@ files = [ name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1857,7 +1794,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-asyncio" version = "0.21.0" description = "Pytest support for asyncio" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1876,7 +1812,6 @@ testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy name = "pytest-httpserver" version = "1.0.8" description = "pytest-httpserver is a httpserver for pytest" -category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -1891,7 +1826,6 @@ Werkzeug = ">=2.0.0" name = "pytest-lazy-fixture" version = "0.6.3" description = "It helps to use fixtures in pytest.mark.parametrize" -category = "main" optional = false python-versions = "*" files = [ @@ -1906,7 +1840,6 @@ pytest = ">=3.2.5" name = "pytest-order" version = "1.1.0" description = "pytest plugin to run your tests in a specific order" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1924,7 +1857,6 @@ pytest = [ name = "pytest-rerunfailures" version = "11.1.2" description = "pytest plugin to re-run tests to eliminate flaky failures" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1936,11 +1868,24 @@ files = [ packaging = ">=17.1" pytest = ">=5.3" +[[package]] +name = "pytest-split" +version = "0.8.1" +description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time." 
+optional = false +python-versions = ">=3.7.1,<4.0" +files = [ + {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"}, + {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"}, +] + +[package.dependencies] +pytest = ">=5,<8" + [[package]] name = "pytest-timeout" version = "2.1.0" description = "pytest plugin to abort hanging tests" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1955,7 +1900,6 @@ pytest = ">=5.0.0" name = "pytest-xdist" version = "3.3.1" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1976,7 +1920,6 @@ testing = ["filelock"] name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -1991,7 +1934,6 @@ six = ">=1.5" name = "python-jose" version = "3.3.0" description = "JOSE implementation in Python" -category = "main" optional = false python-versions = "*" files = [ @@ -2014,7 +1956,6 @@ pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] name = "pywin32" version = "301" description = "Python for Window Extensions" -category = "main" optional = false python-versions = "*" files = [ @@ -2034,7 +1975,6 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2084,7 +2024,6 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2106,7 +2045,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "responses" version = "0.21.0" description = "A utility library for mocking out the `requests` Python library." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2125,7 +2063,6 @@ tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asy name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" -category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -2140,7 +2077,6 @@ pyasn1 = ">=0.1.3" name = "ruff" version = "0.0.269" description = "An extremely fast Python linter, written in Rust." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2167,7 +2103,6 @@ files = [ name = "s3transfer" version = "0.6.0" description = "An Amazon S3 Transfer Manager" -category = "main" optional = false python-versions = ">= 3.7" files = [ @@ -2185,7 +2120,6 @@ crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] name = "sarif-om" version = "1.0.4" description = "Classes implementing the SARIF 2.1.0 object model." 
-category = "main" optional = false python-versions = ">= 2.7" files = [ @@ -2201,7 +2135,6 @@ pbr = "*" name = "setuptools" version = "65.5.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2218,7 +2151,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -2230,7 +2162,6 @@ files = [ name = "sshpubkeys" version = "3.3.1" description = "SSH public key parser" -category = "main" optional = false python-versions = ">=3" files = [ @@ -2249,7 +2180,6 @@ dev = ["twine", "wheel", "yapf"] name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" -category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -2261,7 +2191,6 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2273,7 +2202,6 @@ files = [ name = "types-psutil" version = "5.9.5.12" description = "Typing stubs for psutil" -category = "main" optional = false python-versions = "*" files = [ @@ -2285,7 +2213,6 @@ files = [ name = "types-psycopg2" version = "2.9.21.10" description = "Typing stubs for psycopg2" -category = "main" optional = false python-versions = "*" files = [ @@ -2297,7 +2224,6 @@ files = [ name = "types-pytest-lazy-fixture" version = "0.6.3.3" description = "Typing stubs for pytest-lazy-fixture" -category = "main" optional = false python-versions = "*" files = [ @@ -2309,7 +2235,6 @@ files = [ name = "types-requests" version = "2.31.0.0" description = "Typing stubs for requests" -category = "main" optional = false python-versions = "*" files = [ @@ -2324,7 +2249,6 @@ types-urllib3 = "*" name = "types-s3transfer" version = "0.6.0.post3" description = "Type annotations and code completion for s3transfer" -category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -2336,7 +2260,6 @@ files = [ name = "types-toml" version = "0.10.8.6" description = "Typing stubs for toml" -category = "main" optional = false python-versions = "*" files = [ @@ -2348,7 +2271,6 @@ files = [ name = "types-urllib3" version = "1.26.17" description = "Typing stubs for urllib3" -category = "main" optional = false python-versions = "*" files = [ @@ -2360,7 +2282,6 @@ files = [ name = "typing-extensions" version = "4.6.1" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2372,7 +2293,6 @@ files = [ name = "urllib3" version = "1.26.11" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" files = [ @@ -2389,7 +2309,6 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "websocket-client" version = "1.3.3" description = "WebSocket client for Python with low level API options" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2406,7 +2325,6 @@ test = ["websockets"] name = "werkzeug" version = "2.2.3" description = "The comprehensive WSGI web application library." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2424,7 +2342,6 @@ watchdog = ["watchdog"] name = "wrapt" version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -2498,7 +2415,6 @@ files = [ name = "xmltodict" version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" -category = "main" optional = false python-versions = ">=3.4" files = [ @@ -2510,7 +2426,6 @@ files = [ name = "yarl" version = "1.8.2" description = "Yet another URL library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2598,7 +2513,6 @@ multidict = ">=4.0" name = "zipp" version = "3.8.1" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2613,4 +2527,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c6c217033f50430c31b0979b74db222e6bab2301abd8b9f0cce5a9d5bccc578f" +content-hash = "e16a65d8fdff4e2173610e552e0e7306e301de2c640ae6082ef6cc5755f566d2" diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index a5f50cc7c1..849af47cfc 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,6 +5,7 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; +use futures::future::Either; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; @@ -109,20 +110,25 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); - let main = proxy::flatten_err(tokio::spawn(task_main( + let main = tokio::spawn(task_main( Arc::new(destination), tls_config, proxy_listener, cancellation_token.clone(), - ))); - let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token))); + )); + let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token)); - tokio::select! { - res = main => { res?; }, - res = signals_task => { res?; }, - } + // the signal task cant ever succeed. + // the main task can error, or can succeed on cancellation. 
+ // we want to immediately exit on either of these cases + let signal = match futures::future::select(signals_task, main).await { + Either::Left((res, _)) => proxy::flatten_err(res)?, + Either::Right((res, _)) => return proxy::flatten_err(res), + }; - Ok(()) + // maintenance tasks return `Infallible` success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match signal {} } async fn task_main( diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 28e6e25317..6b46eaddfa 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,13 +1,15 @@ +use futures::future::Either; use proxy::auth; use proxy::console; use proxy::http; use proxy::metrics; use anyhow::bail; -use clap::{self, Arg}; use proxy::config::{self, ProxyConfig}; +use std::pin::pin; use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; @@ -15,6 +17,70 @@ use utils::{project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); +use clap::{Parser, ValueEnum}; + +#[derive(Clone, Debug, ValueEnum)] +enum AuthBackend { + Console, + Postgres, + Link, +} + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct ProxyCliArgs { + /// listen for incoming client connections on ip:port + #[clap(short, long, default_value = "127.0.0.1:4432")] + proxy: String, + #[clap(value_enum, long, default_value_t = AuthBackend::Link)] + auth_backend: AuthBackend, + /// listen for management callback connection on ip:port + #[clap(short, long, default_value = "127.0.0.1:7000")] + mgmt: String, + /// listen for incoming http connections (metrics, etc) on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + http: String, + /// listen for incoming wss connections on ip:port + #[clap(long)] + wss: Option, + /// redirect unauthenticated users to the given uri in case of link auth + #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] + uri: String, + /// cloud API endpoint for authenticating users + #[clap( + short, + long, + default_value = "http://localhost:3000/authenticate_proxy_request/" + )] + auth_endpoint: String, + /// path to TLS key for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'k', long, alias = "ssl-key")] + tls_key: Option, + /// path to TLS cert for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'c', long, alias = "ssl-cert")] + tls_cert: Option, + /// path to directory with TLS certificates for client postgres connections + #[clap(long)] + certs_dir: Option, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option, + /// cache for `wake_compute` api method (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + wake_compute_cache: String, + /// Allow self-signed certificates for compute nodes (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + allow_self_signed_compute: bool, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let 
_logging_guard = proxy::logging::init().await?; @@ -24,90 +90,99 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); - let args = cli().get_matches(); + let args = ProxyCliArgs::parse(); let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.get_one::("http").unwrap().parse()?; + let http_address: SocketAddr = args.http.parse()?; info!("Starting http on {http_address}"); let http_listener = TcpListener::bind(http_address).await?.into_std()?; - let mgmt_address: SocketAddr = args.get_one::("mgmt").unwrap().parse()?; + let mgmt_address: SocketAddr = args.mgmt.parse()?; info!("Starting mgmt on {mgmt_address}"); let mgmt_listener = TcpListener::bind(mgmt_address).await?; - let proxy_address: SocketAddr = args.get_one::("proxy").unwrap().parse()?; + let proxy_address: SocketAddr = args.proxy.parse()?; info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main( + // client facing tasks. these will exit on error or on cancellation + // cancellation returns Ok(()) + let mut client_tasks = JoinSet::new(); + client_tasks.spawn(proxy::proxy::task_main( config, proxy_listener, cancellation_token.clone(), - ))]; + )); - if let Some(wss_address) = args.get_one::("wss") { + if let Some(wss_address) = args.wss { let wss_address: SocketAddr = wss_address.parse()?; info!("Starting wss on {wss_address}"); let wss_listener = TcpListener::bind(wss_address).await?; - client_tasks.push(tokio::spawn(http::websocket::task_main( + client_tasks.spawn(http::websocket::task_main( config, wss_listener, cancellation_token.clone(), - ))); + )); } - let mut tasks = vec![ - tokio::spawn(proxy::handle_signals(cancellation_token)), - tokio::spawn(http::server::task_main(http_listener)), - tokio::spawn(console::mgmt::task_main(mgmt_listener)), - ]; + // maintenance tasks. these never return unless there's an error + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); + maintenance_tasks.spawn(http::server::task_main(http_listener)); + maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { - tasks.push(tokio::spawn(metrics::task_main(metrics_config))); + maintenance_tasks.spawn(metrics::task_main(metrics_config)); } - let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err)); - let client_tasks = - futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err)); - tokio::select! { - // We are only expecting an error from these forever tasks - res = tasks => { res?; }, - res = client_tasks => { res?; }, - } - Ok(()) + let maintenance = loop { + // get one complete task + match futures::future::select( + pin!(maintenance_tasks.join_next()), + pin!(client_tasks.join_next()), + ) + .await + { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => break proxy::flatten_err(res)?, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. 
invalid state"), + // exit immediately on client task error + Either::Right((Some(res), _)) => proxy::flatten_err(res)?, + // exit if all our client tasks have shutdown gracefully + Either::Right((None, _)) => return Ok(()), + } + }; + + // maintenance tasks return Infallible success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match maintenance {} } /// ProxyConfig is created at proxy startup, and lives forever. -fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> { - let tls_config = match ( - args.get_one::("tls-key"), - args.get_one::("tls-cert"), - ) { +fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let tls_config = match (&args.tls_key, &args.tls_cert) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, cert_path, - args.get_one::("certs-dir"), + args.certs_dir.as_ref(), )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; - let allow_self_signed_compute: bool = args - .get_one::("allow-self-signed-compute") - .unwrap() - .parse()?; - if allow_self_signed_compute { + if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } let metric_collection = match ( - args.get_one::("metric-collection-endpoint"), - args.get_one::("metric-collection-interval"), + &args.metric_collection_endpoint, + &args.metric_collection_interval, ) { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, @@ -120,145 +195,38 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> ), }; - let auth_backend = match args.get_one::("auth-backend").unwrap().as_str() { - "console" => { - let config::CacheOptions { size, ttl } = args - .get_one::("wake-compute-cache") - .unwrap() - .parse()?; + let auth_backend = match &args.auth_backend { + AuthBackend::Console => { + let config::CacheOptions { size, ttl } = args.wake_compute_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with size={size} ttl={ttl:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches { node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl), })); - let url = args.get_one::("auth-endpoint").unwrap().parse()?; + let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches); auth::BackendType::Console(Cow::Owned(api), ()) } - "postgres" => { - let url = args.get_one::("auth-endpoint").unwrap().parse()?; + AuthBackend::Postgres => { + let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); auth::BackendType::Postgres(Cow::Owned(api), ()) } - "link" => { - let url = args.get_one::("uri").unwrap().parse()?; + AuthBackend::Link => { + let url = args.uri.parse()?; auth::BackendType::Link(Cow::Owned(url)) } - other => bail!("unsupported auth backend: {other}"), }; let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, metric_collection, - allow_self_signed_compute, + allow_self_signed_compute: args.allow_self_signed_compute, })); Ok(config) } - -fn cli() -> clap::Command { - clap::Command::new("Neon proxy/router") - .disable_help_flag(true) - .version(GIT_VERSION) - .arg( - Arg::new("proxy") - .short('p') - .long("proxy") - .help("listen for incoming client connections on ip:port") - .default_value("127.0.0.1:4432"), - ) - .arg( - 
Arg::new("auth-backend") - .long("auth-backend") - .value_parser(["console", "postgres", "link"]) - .default_value("link"), - ) - .arg( - Arg::new("mgmt") - .short('m') - .long("mgmt") - .help("listen for management callback connection on ip:port") - .default_value("127.0.0.1:7000"), - ) - .arg( - Arg::new("http") - .long("http") - .help("listen for incoming http connections (metrics, etc) on ip:port") - .default_value("127.0.0.1:7001"), - ) - .arg( - Arg::new("wss") - .long("wss") - .help("listen for incoming wss connections on ip:port"), - ) - .arg( - Arg::new("uri") - .short('u') - .long("uri") - .help("redirect unauthenticated users to the given uri in case of link auth") - .default_value("http://localhost:3000/psql_session/"), - ) - .arg( - Arg::new("auth-endpoint") - .short('a') - .long("auth-endpoint") - .help("cloud API endpoint for authenticating users") - .default_value("http://localhost:3000/authenticate_proxy_request/"), - ) - .arg( - Arg::new("tls-key") - .short('k') - .long("tls-key") - .alias("ssl-key") // backwards compatibility - .help("path to TLS key for client postgres connections"), - ) - .arg( - Arg::new("tls-cert") - .short('c') - .long("tls-cert") - .alias("ssl-cert") // backwards compatibility - .help("path to TLS cert for client postgres connections"), - ) - // tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - .arg( - Arg::new("certs-dir") - .long("certs-dir") - .help("path to directory with TLS certificates for client postgres connections"), - ) - .arg( - Arg::new("metric-collection-endpoint") - .long("metric-collection-endpoint") - .help("http endpoint to receive periodic metric updates"), - ) - .arg( - Arg::new("metric-collection-interval") - .long("metric-collection-interval") - .help("how often metrics should be sent to a collection endpoint"), - ) - .arg( - Arg::new("wake-compute-cache") - .long("wake-compute-cache") - .help("cache for `wake_compute` api method (use `size=0` to disable)") - .default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO), - ) - .arg( - Arg::new("allow-self-signed-compute") - .long("allow-self-signed-compute") - .help("Allow self-signed certificates for compute nodes (for testing)") - .default_value("false"), - ) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn verify_cli() { - cli().debug_assert(); - } -} diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index 4e16cc39ec..a9d6793bbd 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -262,24 +262,21 @@ pub mod timed_lru { token: Option<(C, C::LookupInfo)>, /// The value itself. - pub value: C::Value, + value: C::Value, } impl Cached { /// Place any entry into this wrapper; invalidation will be a no-op. - /// Unfortunately, rust doesn't let us implement [`From`] or [`Into`]. - pub fn new_uncached(value: impl Into) -> Self { - Self { - token: None, - value: value.into(), - } + pub fn new_uncached(value: C::Value) -> Self { + Self { token: None, value } } /// Drop this entry from a cache if it's still there. - pub fn invalidate(&self) { + pub fn invalidate(self) -> C::Value { if let Some((cache, info)) = &self.token { cache.invalidate(info); } + self.value } /// Tell if this entry is actually cached. diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c8c0727471..8d16f202e9 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -110,7 +110,7 @@ impl<'a> Session<'a> { impl Session<'_> { /// Store the cancel token for the given session. 
- /// This enables query cancellation in [`crate::proxy::handshake`]. + /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); self.cancel_map diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 70b29679b9..b1cf2a8559 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,4 +1,9 @@ -use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError}; +use crate::{ + auth::parse_endpoint_param, + cancellation::CancelClosure, + console::errors::WakeComputeError, + error::{io_error, UserFacingError}, +}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use pq_proto::StartupMessageParams; @@ -13,7 +18,7 @@ const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, - /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. + /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] Postgres(#[from] tokio_postgres::Error), @@ -24,6 +29,12 @@ pub enum ConnectionError { TlsError(#[from] native_tls::Error), } +impl From for ConnectionError { + fn from(value: WakeComputeError) -> Self { + io_error(value).into() + } +} + impl UserFacingError for ConnectionError { fn to_string_client(&self) -> String { use ConnectionError::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 00f561fcf2..989027f03f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -211,7 +211,7 @@ pub struct CacheOptions { } impl CacheOptions { - /// Default options for [`crate::auth::caches::NodeInfoCache`]. + /// Default options for [`crate::console::provider::NodeInfoCache`]. pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 35d1ff59b7..f0e084b679 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -6,7 +6,7 @@ use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::future; +use std::{convert::Infallible, future}; use tokio::net::{TcpListener, TcpStream}; use tracing::{error, info, info_span, Instrument}; @@ -31,7 +31,7 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N /// Console management API listener task. /// It spawns console response handlers needed for the link auth. -pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(listener: TcpListener) -> anyhow::Result { scopeguard::defer! { info!("mgmt has shut down"); } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 44e23e0adf..3eaed1b82b 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -186,18 +186,18 @@ pub trait Api { async fn get_auth_info( &self, extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + creds: &ClientCredentials, ) -> Result, errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( &self, extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + creds: &ClientCredentials, ) -> Result; } -/// Various caches for [`console`]. +/// Various caches for [`console`](super). pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 3b42c73a34..282567269d 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -106,7 +106,7 @@ impl super::Api for Api { async fn get_auth_info( &self, _extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + creds: &ClientCredentials, ) -> Result, GetAuthInfoError> { self.do_get_auth_info(creds).await } @@ -115,7 +115,7 @@ impl super::Api for Api { async fn wake_compute( &self, _extra: &ConsoleReqExtra<'_>, - _creds: &ClientCredentials<'_>, + _creds: &ClientCredentials, ) -> Result { self.do_wake_compute() .map_ok(CachedNodeInfo::new_uncached) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index a8e855b2c8..22e766b5f1 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -123,7 +123,7 @@ impl super::Api for Api { async fn get_auth_info( &self, extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + creds: &ClientCredentials, ) -> Result, GetAuthInfoError> { self.do_get_auth_info(extra, creds).await } @@ -132,7 +132,7 @@ impl super::Api for Api { async fn wake_compute( &self, extra: &ConsoleReqExtra<'_>, - creds: &ClientCredentials<'_>, + creds: &ClientCredentials, ) -> Result { let key = creds.project().expect("impossible"); diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index fb53c663c8..703632a511 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -1,16 +1,17 @@ +use anyhow::Context; +use async_trait::async_trait; use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; -use std::ops::ControlFlow; use std::{collections::HashMap, sync::Arc}; use tokio::time; -use crate::config; use crate::{auth, console}; +use crate::{compute, config}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE}; +use crate::proxy::ConnectMechanism; use tracing::error; use tracing::info; @@ -184,6 +185,27 @@ impl GlobalConnPool { } } +struct TokioMechanism<'a> { + conn_info: &'a ConnInfo, +} + +#[async_trait] +impl ConnectMechanism for TokioMechanism<'_> { + type Connection = tokio_postgres::Client; + type ConnectError = tokio_postgres::Error; + type Error = anyhow::Error; + + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result { + connect_to_compute_once(node_info, self.conn_info, timeout).await + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} + // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. @@ -217,80 +239,18 @@ async fn connect_to_compute( application_name: Some(APP_NAME), }; - let node_info = &mut creds.wake_compute(&extra).await?.expect("msg"); + let node_info = creds + .wake_compute(&extra) + .await? 
+ .context("missing cache entry from wake_compute")?; - let mut num_retries = 0; - let mut wait_duration = time::Duration::ZERO; - let mut should_wake_with_error = None; - loop { - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - - // try wake the compute node if we have determined it's sensible to do so - if let Some(err) = should_wake_with_error.take() { - match try_wake(node_info, &extra, &creds).await { - // we can't wake up the compute node - Ok(None) => return Err(err), - // there was an error communicating with the control plane - Err(e) => return Err(e.into()), - // failed to wake up but we can continue to retry - Ok(Some(ControlFlow::Continue(()))) => { - wait_duration = retry_after(num_retries); - should_wake_with_error = Some(err); - - num_retries += 1; - info!(num_retries, "retrying wake compute"); - continue; - } - // successfully woke up a compute node and can break the wakeup loop - Ok(Some(ControlFlow::Break(()))) => {} - } - } - - match connect_to_compute_once(node_info, conn_info).await { - Ok(res) => return Ok(res), - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - if !can_retry_error(&e, num_retries) { - return Err(e.into()); - } - wait_duration = retry_after(num_retries); - - // after the first connect failure, - // we should invalidate the cache and wake up a new compute node - if num_retries == 0 { - invalidate_cache(node_info); - should_wake_with_error = Some(e.into()); - } - } - } - - num_retries += 1; - info!(num_retries, "retrying connect"); - } -} - -fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool { - use tokio_postgres::error::SqlState; - match err.code() { - // retry all errors at least once - _ if num_retries == 0 => true, - // keep retrying connection errors - Some( - &SqlState::CONNECTION_FAILURE - | &SqlState::CONNECTION_EXCEPTION - | &SqlState::CONNECTION_DOES_NOT_EXIST - | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, - ) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true, - // otherwise, don't retry - _ => false, - } + crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await } async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, conn_info: &ConnInfo, + timeout: time::Duration, ) -> Result { let mut config = (*node_info.config).clone(); @@ -299,6 +259,7 @@ async fn connect_to_compute_once( .password(&conn_info.password) .dbname(&conn_info.dbname) .max_backend_message_size(MAX_RESPONSE_SIZE) + .connect_timeout(timeout) .connect(tokio_postgres::NoTls) .await?; diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs index f35f4f9a62..6186ddde0d 100644 --- a/proxy/src/http/server.rs +++ b/proxy/src/http/server.rs @@ -1,6 +1,6 @@ -use anyhow::anyhow; +use anyhow::{anyhow, bail}; use hyper::{Body, Request, Response, StatusCode}; -use std::net::TcpListener; +use std::{convert::Infallible, net::TcpListener}; use tracing::info; use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; @@ -12,7 +12,7 @@ fn make_router() -> RouterBuilder { endpoint::make_router().get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } @@ -23,5 +23,5 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<()> { .serve(service().map_err(|e| anyhow!(e))?) 
.await?; - Ok(()) + bail!("hyper server without shutdown handling cannot shutdown successfully"); } diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 83ba034e57..5b7a87bc11 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -1,5 +1,8 @@ use crate::{ - cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client, + cancellation::CancelMap, + config::ProxyConfig, + error::io_error, + proxy::{handle_client, ClientMode}, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; @@ -150,12 +153,12 @@ async fn serve_websocket( hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; - handle_ws_client( + handle_client( config, cancel_map, session_id, WebSocketRw::new(websocket), - hostname, + ClientMode::Websockets { hostname }, ) .await?; Ok(()) @@ -221,6 +224,18 @@ async fn ws_handler( ); r }) + } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + Response::builder() + .header("Allow", "OPTIONS, POST") + .header("Access-Control-Allow-Origin", "*") + .header( + "Access-Control-Allow-Headers", + "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In", + ) + .header("Access-Control-Max-Age", "86400" /* 24 hours */) + .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code + .body(Body::empty()) + .map_err(|e| ApiError::BadRequest(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 148ee67d90..1e1e216bb7 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,5 +1,6 @@ +use std::convert::Infallible; + use anyhow::{bail, Context}; -use futures::{Future, FutureExt}; use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -23,7 +24,7 @@ pub mod url; pub mod waiters; /// Handle unix signals appropriately. -pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> { +pub async fn handle_signals(token: CancellationToken) -> anyhow::Result { use tokio::signal::unix::{signal, SignalKind}; let mut hangup = signal(SignalKind::hangup())?; @@ -50,8 +51,6 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> { } /// Flattens `Result>` into `Result`. -pub async fn flatten_err( - f: impl Future, JoinError>>, -) -> anyhow::Result<()> { - f.map(|r| r.context("join error").and_then(|x| x)).await +pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { + r.context("join error").and_then(|x| x) } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 00fd7f0405..c4be7e1f08 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,7 +4,7 @@ use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::{collections::HashMap, time::Duration}; +use std::{collections::HashMap, convert::Infallible, time::Duration}; use tracing::{error, info, instrument, trace, warn}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -26,7 +26,7 @@ pub struct Ids { pub branch_id: String, } -pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { +pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result { info!("metrics collector config: {config:?}"); scopeguard::defer! 
{ info!("metrics collector has shut down"); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 12ca9c5187..d317d382a7 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -11,16 +11,16 @@ use crate::{ errors::{ApiError, WakeComputeError}, messages::MetricsAuxInfo, }, - error::io_error, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; +use async_trait::async_trait; use futures::TryFutureExt; use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use std::{ops::ControlFlow, sync::Arc}; +use std::{error::Error, io, ops::ControlFlow, sync::Arc}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, time, @@ -31,7 +31,8 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n -pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; +const NUM_RETRIES_CONNECT: u32 = 10; +const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; @@ -103,7 +104,8 @@ pub async fn task_main( .set_nodelay(true) .context("failed to set socket option")?; - handle_client(config, &cancel_map, session_id, socket).await + handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp) + .await } .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. @@ -128,14 +130,50 @@ pub async fn task_main( Ok(()) } -// TODO(tech debt): unite this with its twin below. +pub enum ClientMode { + Tcp, + Websockets { hostname: Option }, +} + +/// Abstracts the logic of handling TCP vs WS clients +impl ClientMode { + fn allow_cleartext(&self) -> bool { + match self { + ClientMode::Tcp => false, + ClientMode::Websockets { .. } => true, + } + } + + fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + match self { + ClientMode::Tcp => config.allow_self_signed_compute, + ClientMode::Websockets { .. } => false, + } + } + + fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { + match self { + ClientMode::Tcp => s.sni_hostname(), + ClientMode::Websockets { hostname } => hostname.as_deref(), + } + } + + fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> { + match self { + ClientMode::Tcp => tls, + // TLS is None here if using websockets, because the connection is already encrypted. + ClientMode::Websockets { .. } => None, + } + } +} + #[tracing::instrument(fields(session_id = ?session_id), skip_all)] -pub async fn handle_ws_client( +pub async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, session_id: uuid::Uuid, - stream: impl AsyncRead + AsyncWrite + Unpin, - hostname: Option, + stream: S, + mode: ClientMode, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); @@ -144,10 +182,8 @@ pub async fn handle_ws_client( } let tls = config.tls_config.as_ref(); - let hostname = hostname.as_deref(); - // TLS is None here, because the connection is already encrypted. - let do_handshake = handshake(stream, None, cancel_map); + let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map); let (mut stream, params) = match do_handshake.await? 
{ Some(x) => x, None => return Ok(()), // it's a cancellation request @@ -155,6 +191,7 @@ pub async fn handle_ws_client( // Extract credentials which we're going to use for auth. let creds = { + let hostname = mode.hostname(stream.get_ref()); let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend @@ -162,59 +199,21 @@ pub async fn handle_ws_client( .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names)) .transpose(); - async { result }.or_else(|e| stream.throw_error(e)).await? + match result { + Ok(creds) => creds, + Err(e) => stream.throw_error(e).await?, + } }; - let client = Client::new(stream, creds, ¶ms, session_id, false); - cancel_map - .with_session(|session| client.connect_to_db(session, true)) - .await -} - -#[tracing::instrument(fields(session_id = ?session_id), skip_all)] -async fn handle_client( - config: &'static ProxyConfig, - cancel_map: &CancelMap, - session_id: uuid::Uuid, - stream: impl AsyncRead + AsyncWrite + Unpin, -) -> anyhow::Result<()> { - // The `closed` counter will increase when this future is destroyed. - NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); - scopeguard::defer! { - NUM_CONNECTIONS_CLOSED_COUNTER.inc(); - } - - let tls = config.tls_config.as_ref(); - let do_handshake = handshake(stream, tls, cancel_map); - let (mut stream, params) = match do_handshake.await? { - Some(x) => x, - None => return Ok(()), // it's a cancellation request - }; - - // Extract credentials which we're going to use for auth. - let creds = { - let sni = stream.get_ref().sni_hostname(); - let common_names = tls.and_then(|tls| tls.common_names.clone()); - let result = config - .auth_backend - .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_names)) - .transpose(); - - async { result }.or_else(|e| stream.throw_error(e)).await? - }; - - let allow_self_signed_compute = config.allow_self_signed_compute; - let client = Client::new( stream, creds, ¶ms, session_id, - allow_self_signed_compute, + mode.allow_self_signed_compute(config), ); cancel_map - .with_session(|session| client.connect_to_db(session, false)) + .with_session(|session| client.connect_to_db(session, mode.allow_cleartext())) .await } @@ -297,18 +296,18 @@ async fn handshake( /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. #[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: &console::CachedNodeInfo) { +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); - node_info.invalidate(); } - let label = match is_cached { true => "compute_cached", false => "compute_uncached", }; NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + + node_info.invalidate().config } /// Try to connect to the compute node once. @@ -325,89 +324,118 @@ async fn connect_to_compute_once( .await } +enum ConnectionState { + Cached(console::CachedNodeInfo), + Invalid(compute::ConnCfg, E), +} + +#[async_trait] +pub trait ConnectMechanism { + type Connection; + type ConnectError; + type Error: From; + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result; + + fn update_connect_config(&self, conf: &mut compute::ConnCfg); +} + +pub struct TcpMechanism<'a> { + /// KV-dictionary with PostgreSQL connection params. 
+ pub params: &'a StartupMessageParams, +} + +#[async_trait] +impl ConnectMechanism for TcpMechanism<'_> { + type Connection = PostgresConnection; + type ConnectError = compute::ConnectionError; + type Error = compute::ConnectionError; + + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result { + connect_to_compute_once(node_info, timeout).await + } + + fn update_connect_config(&self, config: &mut compute::ConnCfg) { + config.set_startup_params(self.params); + } +} + /// Try to connect to the compute node, retrying if necessary. /// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] -async fn connect_to_compute( - node_info: &mut console::CachedNodeInfo, - params: &StartupMessageParams, +pub async fn connect_to_compute( + mechanism: &M, + mut node_info: console::CachedNodeInfo, extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result { +) -> Result +where + M::ConnectError: ShouldRetry + std::fmt::Debug, + M::Error: From, +{ + mechanism.update_connect_config(&mut node_info.config); + let mut num_retries = 0; - let mut wait_duration = time::Duration::ZERO; - let mut should_wake_with_error = None; + let mut state = ConnectionState::::Cached(node_info); + loop { - // Apply startup params to the (possibly, cached) compute node info. - node_info.config.set_startup_params(params); + match state { + ConnectionState::Invalid(config, err) => { + match try_wake(&config, extra, creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err.into()), + // there was an error communicating with the control plane + Err(e) => return Err(e.into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + state = ConnectionState::Invalid(config, err); + let wait_duration = retry_after(num_retries); + num_retries += 1; - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - - // try wake the compute node if we have determined it's sensible to do so - if let Some(err) = should_wake_with_error.take() { - match try_wake(node_info, extra, creds).await { - // we can't wake up the compute node - Ok(None) => return Err(err), - // there was an error communicating with the control plane - Err(e) => return Err(io_error(e).into()), - // failed to wake up but we can continue to retry - Ok(Some(ControlFlow::Continue(()))) => { - wait_duration = retry_after(num_retries); - should_wake_with_error = Some(err); - - num_retries += 1; - info!(num_retries, "retrying wake compute"); - continue; + info!(num_retries, "retrying wake compute"); + time::sleep(wait_duration).await; + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(mut node_info))) => { + mechanism.update_connect_config(&mut node_info.config); + state = ConnectionState::Cached(node_info) + } } - // successfully woke up a compute node and can break the wakeup loop - Ok(Some(ControlFlow::Break(()))) => {} } - } + ConnectionState::Cached(node_info) => { + match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !e.should_retry(num_retries) { + return Err(e.into()); + } - // Set a shorter timeout for the initial connection attempt. 
- // - // In case we try to connect to an outdated address that is no longer valid, the - // default behavior of Kubernetes is to drop the packets, causing us to wait for - // the entire timeout period. We want to fail fast in such cases. - // - // A specific case to consider is when we have cached compute node information - // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API - // call, resulting in the nonexistence of the compute node. - // - // We only use caching in case of scram proxy backed by the console, so reduce - // the timeout only in that case. - let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); - let timeout = if is_scram_proxy && num_retries == 0 { - time::Duration::from_secs(2) - } else { - time::Duration::from_secs(10) - }; + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + state = ConnectionState::Invalid(invalidate_cache(node_info), e); + } else { + state = ConnectionState::Cached(node_info); + } - // do this again to ensure we have username? - node_info.config.set_startup_params(params); + let wait_duration = retry_after(num_retries); + num_retries += 1; - match connect_to_compute_once(node_info, timeout).await { - Ok(res) => return Ok(res), - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - if !can_retry_error(&e, num_retries) { - return Err(e); - } - wait_duration = retry_after(num_retries); - - // after the first connect failure, - // we should invalidate the cache and wake up a new compute node - if num_retries == 0 { - invalidate_cache(node_info); - should_wake_with_error = Some(e); + info!(num_retries, "retrying wake compute"); + time::sleep(wait_duration).await; + } } } } - - num_retries += 1; - info!(num_retries, "retrying connect"); } } @@ -415,11 +443,11 @@ async fn connect_to_compute( /// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable /// * Returns Ok(Some(false)) if the wakeup succeeded /// * Returns Ok(None) or Err(e) if there was an error -pub async fn try_wake( - node_info: &mut console::CachedNodeInfo, +async fn try_wake( + config: &compute::ConnCfg, extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result>, WakeComputeError> { +) -> Result>, WakeComputeError> { info!("compute node's state has likely changed; requesting a wake-up"); match creds.wake_compute(extra).await { // retry wake if the compute was in an invalid state @@ -429,31 +457,68 @@ pub async fn try_wake( })) => Ok(Some(ControlFlow::Continue(()))), // Update `node_info` and try again. 
Ok(Some(mut new)) => { - new.config.reuse_password(&node_info.config); - *node_info = new; - Ok(Some(ControlFlow::Break(()))) + new.config.reuse_password(config); + Ok(Some(ControlFlow::Break(new))) } Err(e) => Err(e), Ok(None) => Ok(None), } } -fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool { - use std::io::ErrorKind; - match err { - // retry all errors at least once - _ if num_retries == 0 => true, - // keep retrying connection errors - compute::ConnectionError::CouldNotConnect(io_err) - if num_retries < NUM_RETRIES_WAKE_COMPUTE => - { - matches!( - io_err.kind(), - ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable - ) +pub trait ShouldRetry { + fn could_retry(&self) -> bool; + fn should_retry(&self, num_retries: u32) -> bool { + match self { + // retry all errors at least once + _ if num_retries == 0 => true, + _ if num_retries >= NUM_RETRIES_CONNECT => false, + err => err.could_retry(), + } + } +} + +impl ShouldRetry for io::Error { + fn could_retry(&self) -> bool { + use std::io::ErrorKind; + matches!( + self.kind(), + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut + ) + } +} + +impl ShouldRetry for tokio_postgres::error::DbError { + fn could_retry(&self) -> bool { + use tokio_postgres::error::SqlState; + matches!( + self.code(), + &SqlState::CONNECTION_FAILURE + | &SqlState::CONNECTION_EXCEPTION + | &SqlState::CONNECTION_DOES_NOT_EXIST + | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ) + } +} + +impl ShouldRetry for tokio_postgres::Error { + fn could_retry(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::could_retry(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::could_retry(db_err) + } else { + false + } + } +} + +impl ShouldRetry for compute::ConnectionError { + fn could_retry(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.could_retry(), + compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), + _ => false, } - // otherwise, don't retry - _ => false, } } @@ -595,15 +660,13 @@ impl Client<'_, S> { application_name: params.get("application_name"), }; - let auth_result = async { - // `&mut stream` doesn't let us merge those 2 lines. - let res = creds - .authenticate(&extra, &mut stream, allow_cleartext) - .await; - - async { res }.or_else(|e| stream.throw_error(e)).await - } - .await?; + let auth_result = match creds + .authenticate(&extra, &mut stream, allow_cleartext) + .await + { + Ok(auth_result) => auth_result, + Err(e) => return stream.throw_error(e).await, + }; let AuthSuccess { reported_auth_ok, @@ -612,7 +675,8 @@ impl Client<'_, S> { node_info.allow_self_signed_compute = allow_self_signed_compute; - let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) + let aux = node_info.aux.clone(); + let mut node = connect_to_compute(&TcpMechanism { params }, node_info, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; @@ -623,6 +687,6 @@ impl Client<'_, S> { // immediately after opening the connection. 
let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; - proxy_pass(stream, node.stream, &node_info.aux).await + proxy_pass(stream, node.stream, &aux).await } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 85854427ed..07822e8da5 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -12,7 +12,7 @@ mod messages; mod secret; mod signature; -#[cfg(test)] +#[cfg(any(test, doc))] mod password; pub use exchange::Exchange; diff --git a/pyproject.toml b/pyproject.toml index ac4e8fa2dd..726e04ea4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ pytest-httpserver = "^1.0.8" aiohttp = "3.7.4" pytest-rerunfailures = "^11.1.2" types-pytest-lazy-fixture = "^0.6.3.3" +pytest-split = "^0.8.1" [tool.poetry.group.dev.dependencies] black = "^23.3.0" @@ -78,6 +79,7 @@ module = [ ignore_missing_imports = true [tool.ruff] +target-version = "py39" extend-exclude = ["vendor/"] ignore = ["E501"] select = [ @@ -85,4 +87,5 @@ select = [ "F", # Pyflakes "I", # isort "W", # pycodestyle + "B", # bugbear ] diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 6abb435018..0ce368ff9d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.70.0" +channel = "1.71.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 6c4ad24323..504c2d355d 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -163,8 +163,9 @@ impl Deref for FileStorage { #[async_trait::async_trait] impl Storage for FileStorage { - /// persists state durably to underlying storage - /// for description see https://lwn.net/Articles/457667/ + /// Persists state durably to the underlying storage. + /// + /// For a description, see . async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); @@ -191,6 +192,12 @@ impl Storage for FileStorage { control_partial_path.display() ) })?; + control_partial.flush().await.with_context(|| { + format!( + "failed to flush safekeeper state into control file at: {}", + control_partial_path.display() + ) + })?; // fsync the file if !self.conf.no_sync { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 61ba37efaa..8d8ef6192c 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -188,6 +188,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result let mut response = client.get(&http_url).send().await?; while let Some(chunk) = response.chunk().await? 
{
            file.write_all(&chunk).await?;
+            file.flush().await?;
        }
    }
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index abca0a86b1..92a7bb703a 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -403,16 +403,18 @@ impl SafekeeperPostgresHandler {
         };
 
         // take the latest commit_lsn if don't have stop_pos
-        let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
+        let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
 
         if end_pos < start_pos {
-            warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos);
-            end_pos = start_pos;
+            warn!(
+                "requested start_pos {} is ahead of available WAL end_pos {}",
+                start_pos, end_pos
+            );
         }
 
         info!(
-            "starting streaming from {:?} till {:?}",
-            start_pos, stop_pos
+            "starting streaming from {:?} till {:?}, available WAL ends at {}",
+            start_pos, stop_pos, end_pos
         );
 
         // switch to copy
@@ -547,12 +549,14 @@ impl WalSender<'_, IO> {
             self.end_pos = *self.commit_lsn_watch_rx.borrow();
             if self.end_pos > self.start_pos {
                 // We have something to send.
+                trace!("got end_pos {:?}, streaming", self.end_pos);
                 return Ok(());
             }
 
             // Wait for WAL to appear, now self.end_pos == self.start_pos.
             if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
                 self.end_pos = lsn;
+                trace!("got end_pos {:?}, streaming", self.end_pos);
                 return Ok(());
             }
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index f2d5df8744..eda5b9044e 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -1,4 +1,4 @@
-//! This module contains global (tenant_id, timeline_id) -> Arc<Timeline> mapping.
+//! This module contains global `(tenant_id, timeline_id)` -> `Arc<Timeline>` mapping.
 //! All timelines should always be present in this map, this is done by loading them
 //! all from the disk on startup and keeping them in memory.
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 687e1ba6b6..d728312de4 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -106,13 +106,15 @@ pub struct PhysicalStorage {
     /// Imagine the following:
     /// - 000000010000000000000001
     /// - it was fully written, but the last record is split between 2 segments
-    /// - after restart, find_end_of_wal() returned 0/1FFFFF0, which is in the end of this segment
-    /// - write_lsn, write_record_lsn and flush_record_lsn were initialized to 0/1FFFFF0
+    /// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment
+    /// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0
     /// - 000000010000000000000002.partial
     /// - it has only 1 byte written, which is not enough to make a full WAL record
     ///
     /// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
     /// This flag will be set to true after the first truncate_wal() call.
+    ///
+    /// [`write_lsn`]: Self::write_lsn
     is_truncated_after_restart: bool,
 }
 
@@ -248,6 +250,10 @@ impl PhysicalStorage {
         };
 
         file.write_all(buf).await?;
+        // Note: flush just ensures the write above reaches the OS (this is not
+        // needed for sync IO, where Write::write calls the write syscall directly,
+        // but it is needed for async IO). It does *not* fsync the file.
+        file.flush().await?;
 
         if xlogoff + buf.len() == self.wal_seg_size {
             // If we reached the end of a WAL segment, flush and close it.
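The flush-vs-fsync distinction spelled out in the comments above (here and in control_file.rs) is easy to conflate. As a rough illustration only, in plain Python rather than the safekeeper code, the three durability levels look like this:

import os
import tempfile

# write() may leave data in a userspace buffer; flush() hands it to the OS
# page cache (survives a process crash); fsync() forces it to stable storage
# (survives a power loss). Only the last one is an actual durability barrier.
with tempfile.NamedTemporaryFile(mode="wb") as f:
    f.write(b"some WAL bytes")  # may still sit in a userspace buffer
    f.flush()                   # reaches the OS page cache; not yet durable
    os.fsync(f.fileno())        # reaches stable storage
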
@@ -716,6 +722,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
         count -= XLOG_BLCKSZ;
     }
     file.write_all(&ZERO_BLOCK[0..count]).await?;
+    file.flush().await?;
     Ok(())
 }
 
diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py
new file mode 100755
index 0000000000..37f8470038
--- /dev/null
+++ b/scripts/benchmark_durations.py
@@ -0,0 +1,177 @@
+#! /usr/bin/env python3
+
+import argparse
+import json
+import logging
+from typing import Dict
+
+import psycopg2
+import psycopg2.extras
+
+"""
+The script fetches the durations of benchmarks from the database and stores them in a file compatible with the pytest-split plugin.
+"""
+
+
+BENCHMARKS_DURATION_QUERY = """
+    SELECT
+        DISTINCT parent_suite, suite, test,
+        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
+    FROM
+        (
+            SELECT
+                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
+                (jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
+            FROM
+                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
+        ) data
+    WHERE
+        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        AND parent_suite = 'test_runner.performance'
+        AND status = 'passed'
+    GROUP BY
+        parent_suite, suite, test
+    ;
+"""
+
+# For our benchmarks, the default distribution for 4 workers produces pretty uneven chunks:
+# the total duration varies from 8 to 40 minutes.
+# We use some pre-collected durations as a fallback to have a better distribution.
+FALLBACK_DURATION = { + "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0, + "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0, + "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0, + "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0, + "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0, + "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0, + "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0, + "test_runner/performance/test_compaction.py::test_compaction": 77.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0, + "test_runner/performance/test_copy.py::test_copy[neon]": 12.0, + "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0, + "test_runner/performance/test_layer_map.py::test_layer_map": 44.0, + 
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0, + "test_runner/performance/test_startup.py::test_startup_simple": 2.0, + "test_runner/performance/test_startup.py::test_startup": 539.0, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0, + "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0, +} + + +def main(args: argparse.Namespace): + connstr = args.connstr + interval_days = args.days + output = args.output + percentile = args.percentile + + res: Dict[str, float] = {} + + try: + logging.info("connecting to the database...") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + logging.info("fetching benchmarks...") + cur.execute(BENCHMARKS_DURATION_QUERY, (percentile, interval_days)) + rows = cur.fetchall() + except psycopg2.OperationalError as exc: + logging.error("cannot fetch benchmarks duration from the DB due to an error", exc) + rows = [] + res = FALLBACK_DURATION + + for row in rows: + pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}" + duration = row["percentile_ms"] / 1000 + logging.info(f"\t{pytest_name}: {duration}") + res[pytest_name] = duration + + logging.info(f"saving results to {output.name}") + 
json.dump(res, output, indent=2) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Get of benchmarks duration for the last days" + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default=".test_durations", + help="path to output json file (default: .test_durations)", + ) + parser.add_argument( + "--percentile", + type=float, + default="0.99", + help="percentile (default: 0.99)", + ) + parser.add_argument( + "--days", + required=False, + default=10, + type=int, + help="how many days to look back for (default: 10)", + ) + parser.add_argument( + "connstr", + help="connection string to the test results database", + ) + args = parser.parse_args() + + level = logging.INFO + logging.basicConfig( + format="%(message)s", + level=level, + ) + + main(args) diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index dd60d42a37..b68df65c41 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -205,29 +205,25 @@ module.exports = async ({ github, context, fetch, report }) => { const {reportUrl, reportJsonUrl} = report - if (!reportUrl || !reportJsonUrl) { + if (reportUrl && reportJsonUrl) { + try { + const parsed = await parseReportJson({ reportJsonUrl, fetch }) + commentBody += await reportSummary({ ...parsed, reportUrl }) + } catch (error) { + commentBody += `### [full report](${reportUrl})\n___\n` + commentBody += `#### Failed to create a summary for the test run: \n` + commentBody += "```\n" + commentBody += `${error.stack}\n` + commentBody += "```\n" + commentBody += "\nTo reproduce and debug the error locally run:\n" + commentBody += "```\n" + commentBody += `scripts/comment-test-report.js ${reportJsonUrl}` + commentBody += "\n```\n" + } + } else { commentBody += `#### No tests were run or test report is not available\n` - commentBody += autoupdateNotice - return } - try { - const parsed = await parseReportJson({ reportJsonUrl, fetch }) - commentBody += await reportSummary({ ...parsed, reportUrl }) - } catch (error) { - commentBody += `### [full report](${reportUrl})\n___\n` - commentBody += `#### Failed to create a summary for the test run: \n` - commentBody += "```\n" - commentBody += `${error.stack}\n` - commentBody += "```\n" - commentBody += "\nTo reproduce and debug the error locally run:\n" - commentBody += "```\n" - commentBody += `scripts/comment-test-report.js ${reportJsonUrl}` - commentBody += "\n```\n" - } - - commentBody += autoupdateNotice - let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha if (isPullRequest) { createCommentFn = github.rest.issues.createComment diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index d95878b341..fca645078a 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -214,8 +214,7 @@ class VanillaPostgres(PgProtocol): assert not self.running self.running = True - if log_path is None: - log_path = os.path.join(self.pgdatadir, "pg.log") + log_path = log_path or os.path.join(self.pgdatadir, "pg.log") self.pg_bin.run_capture( ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] @@ -396,7 +395,7 @@ def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): query = "select relname, pg_relation_filepath(oid) from pg_class" result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for relname, filepath in result: + for _relname, filepath in result: if filepath is not None: if database == 
"template0copy": # Add all template0copy paths to template0 diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 99682caf80..297f2c6da7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -5,7 +5,6 @@ import json import os import re import timeit -import warnings from contextlib import contextmanager from datetime import datetime from pathlib import Path @@ -18,6 +17,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.terminal import TerminalReporter +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver from fixtures.types import TenantId, TimelineId @@ -385,7 +385,7 @@ class NeonBenchmarker: path = f"{repo_dir}/tenants/{tenant_id}/timelines/{timeline_id}" totalbytes = 0 - for root, dirs, files in os.walk(path): + for root, _dirs, files in os.walk(path): for name in files: totalbytes += os.path.getsize(os.path.join(root, name)) @@ -492,7 +492,7 @@ def pytest_terminal_summary( return if not result: - warnings.warn("no results to store (no passed test suites)") + log.warning("no results to store (no passed test suites)") return get_out_path(Path(out_dir), revision=revision).write_text( diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index a2bc2e28e5..3f87aa10a3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -61,6 +61,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_materialized_cache_hits_direct_total", "pageserver_page_cache_read_hits_total", "pageserver_page_cache_read_accesses_total", + "pageserver_page_cache_size_current_bytes", + "pageserver_page_cache_size_max_bytes", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c3e9853978..9e43a2bfdb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -213,7 +213,7 @@ def worker_base_port(worker_seq_no: int) -> int: def get_dir_size(path: str) -> int: """Return size in bytes.""" totalbytes = 0 - for root, dirs, files in os.walk(path): + for root, _dirs, files in os.walk(path): for name in files: totalbytes += os.path.getsize(os.path.join(root, name)) @@ -1231,7 +1231,7 @@ class AbstractNeonCli(abc.ABC): stderr: {res.stderr} """ log.info(msg) - raise Exception(msg) from subprocess.CalledProcessError( + raise RuntimeError(msg) from subprocess.CalledProcessError( res.returncode, res.args, res.stdout, res.stderr ) return res @@ -1255,10 +1255,8 @@ class NeonCli(AbstractNeonCli): """ Creates a new tenant, returns its id and its initial timeline's id. 
""" - if tenant_id is None: - tenant_id = TenantId.generate() - if timeline_id is None: - timeline_id = TimelineId.generate() + tenant_id = tenant_id or TenantId.generate() + timeline_id = timeline_id or TimelineId.generate() args = [ "tenant", @@ -1885,8 +1883,7 @@ class VanillaPostgres(PgProtocol): assert not self.running self.running = True - if log_path is None: - log_path = os.path.join(self.pgdatadir, "pg.log") + log_path = log_path or os.path.join(self.pgdatadir, "pg.log") self.pg_bin.run_capture( ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] @@ -2346,8 +2343,7 @@ class Endpoint(PgProtocol): if not config_lines: config_lines = [] - if endpoint_id is None: - endpoint_id = self.env.generate_endpoint_id() + endpoint_id = endpoint_id or self.env.generate_endpoint_id() self.endpoint_id = endpoint_id self.branch_name = branch_name @@ -2363,8 +2359,7 @@ class Endpoint(PgProtocol): path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) - if config_lines is None: - config_lines = [] + config_lines = config_lines or [] # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. @@ -2560,8 +2555,7 @@ class EndpointFactory: http_port=self.env.port_distributor.get_port(), ) - if endpoint_id is None: - endpoint_id = self.env.generate_endpoint_id() + endpoint_id = endpoint_id or self.env.generate_endpoint_id() self.num_instances += 1 self.endpoints.append(ep) @@ -2641,7 +2635,7 @@ class Safekeeper: if elapsed > 3: raise RuntimeError( f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}" - ) + ) from e time.sleep(0.5) else: break # success @@ -2721,7 +2715,8 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - def debug_dump(self, params: Dict[str, str] = {}) -> Dict[str, Any]: + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) res.raise_for_status() res_json = res.json() @@ -2861,7 +2856,7 @@ class NeonBroker: if elapsed > 5: raise RuntimeError( f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" - ) + ) from e time.sleep(0.5) else: break # success @@ -2977,7 +2972,7 @@ def should_skip_file(filename: str) -> bool: # def list_files_to_compare(pgdata_dir: Path) -> List[str]: pgdata_files = [] - for root, _file, filenames in os.walk(pgdata_dir): + for root, _dirs, filenames in os.walk(pgdata_dir): for filename in filenames: rel_dir = os.path.relpath(root, pgdata_dir) # Skip some dirs and files we don't want to compare @@ -3109,3 +3104,18 @@ def last_flush_lsn_upload( ps_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) return last_flush_lsn + + +def parse_project_git_version_output(s: str) -> str: + """ + Parses the git commit hash out of the --version output supported at least by neon_local. + + The information is generated by utils::project_git_version! 
+ """ + import re + + res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s) + if res and (commit := res.group(2)): + return commit + + raise ValueError(f"unable to parse --version output: '{s}'") diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 824d11cb17..8c053c8073 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -193,8 +193,7 @@ class PageserverHttpClient(requests.Session): body = "null" else: # null-config is prohibited by the API - if config is None: - config = {} + config = config or {} body = json.dumps({"config": config}) res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 30acd3f637..2c8b4f4303 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -95,7 +95,7 @@ def query_scalar(cur: cursor, query: str) -> Any: def get_dir_size(path: str) -> int: """Return size in bytes.""" totalbytes = 0 - for root, dirs, files in os.walk(path): + for root, _dirs, files in os.walk(path): for name in files: try: totalbytes += os.path.getsize(os.path.join(root, name)) diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index f93b560d8e..cf9e4808fc 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -47,7 +47,7 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma # without modifying the earlier parts of the table. for step in range(n_steps): cur.execute(f"INSERT INTO t (step) SELECT {step} FROM generate_series(1, {step_size})") - for i in range(n_update_iters): + for _ in range(n_update_iters): cur.execute(f"UPDATE t set count=count+1 where step = {step}") cur.execute("vacuum t") diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index a133aca8ce..5fcffc8afb 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -33,6 +33,6 @@ def test_hot_table(env: PgCompare): # Read the table with env.record_duration("read"): - for i in range(num_reads): + for _ in range(num_reads): cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 18308e1077..6bd0d85fa2 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -28,7 +28,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant) cur = endpoint.connect().cursor() cur.execute("create table t(x integer)") - for i in range(n_iters): + for _ in range(n_iters): cur.execute(f"insert into t values (generate_series(1,{n_records}))") time.sleep(1) diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index 746c1b73dd..9a0b7723ac 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -6,7 +6,7 @@ from fixtures.neon_fixtures import PgProtocol async def repeat_bytes(buf, repetitions: int): - for i in range(repetitions): + for _ in range(repetitions): yield buf diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index df766d52da..c1a59ebb31 100644 --- 
a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -77,8 +77,8 @@ def test_random_writes(neon_with_baseline: PgCompare): # Update random keys with env.record_duration("run"): - for it in range(n_iterations): - for i in range(n_writes): + for _ in range(n_iterations): + for _ in range(n_writes): key = random.randint(1, n_rows) cur.execute(f"update Big set count=count+1 where pk={key}") env.flush() diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 409b30a909..67d4f3ae9b 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -61,5 +61,5 @@ def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: in cur.execute(f"set max_parallel_workers_per_gather = {workers}") with env.record_duration("run"): - for i in range(iters): + for _ in range(iters): cur.execute("select count(*) from t;") diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 4744c1ed2e..d897df1bcb 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -60,6 +60,11 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc value = metrics[key] zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + # Check basebackup size makes sense + basebackup_bytes = metrics["basebackup_bytes"] + if i > 0: + assert basebackup_bytes < 100 * 1024 + # Stop so we can restart endpoint.stop() diff --git a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py index 4d9dfb09c1..de86fe482d 100755 --- a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py +++ b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py @@ -19,7 +19,7 @@ async def run(**kwargs) -> asyncpg.Record: if __name__ == "__main__": kwargs = { - k.lstrip("NEON_").lower(): v + k.removeprefix("NEON_").lower(): v for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") if (v := os.environ.get(k, None)) is not None } diff --git a/test_runner/pg_clients/python/pg8000/pg8000_example.py b/test_runner/pg_clients/python/pg8000/pg8000_example.py index b1d77af5bb..840ed97c97 100755 --- a/test_runner/pg_clients/python/pg8000/pg8000_example.py +++ b/test_runner/pg_clients/python/pg8000/pg8000_example.py @@ -6,7 +6,7 @@ import pg8000.dbapi if __name__ == "__main__": kwargs = { - k.lstrip("NEON_").lower(): v + k.removeprefix("NEON_").lower(): v for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") if (v := os.environ.get(k, None)) is not None } diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index fb79748832..76b75c1caf 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,5 +1,6 @@ from contextlib import closing +import psycopg2 import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol from fixtures.pageserver.http import PageserverApiException @@ -106,7 +107,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): if expect_success: op() else: - with pytest.raises(Exception): + with pytest.raises(psycopg2.Error): op() def check_pageserver(expect_success: bool, **conn_kwargs): diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index 352e149171..b14974279e 100644 --- a/test_runner/regress/test_backpressure.py +++ 
b/test_runner/regress/test_backpressure.py @@ -141,13 +141,13 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): log.info("stopping check thread") check_stop_event.set() check_thread.join() - assert ( - False - ), f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" + raise AssertionError( + f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" + ) from e else: - assert ( - False - ), f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." + raise AssertionError( + f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." + ) from e log.info(f"inserted {rows_inserted} rows") @@ -157,9 +157,9 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): check_thread.join() log.info("check thread stopped") else: - assert ( - False - ), "WAL lag overflowed configured threshold. That means backpressure doesn't work." + raise AssertionError( + "WAL lag overflowed configured threshold. That means backpressure doesn't work." + ) # TODO test_backpressure_disk_consistent_lsn_lag. Play with pageserver's checkpoint settings diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 0fb3b4f262..57e9413aa3 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -26,7 +26,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = [] - for n in range(4): + for _ in range(4): tenant_id, timeline_id = env.neon_cli.create_tenant() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) diff --git a/test_runner/regress/test_build_info_metric.py b/test_runner/regress/test_build_info_metric.py index c622d562fd..4e53928d14 100644 --- a/test_runner/regress/test_build_info_metric.py +++ b/test_runner/regress/test_build_info_metric.py @@ -12,7 +12,7 @@ def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonPro parsed_metrics["safekeeper"] = parse_metrics(env.safekeepers[0].http_client().get_metrics_str()) parsed_metrics["proxy"] = parse_metrics(link_proxy.get_metrics()) - for component, metrics in parsed_metrics.items(): + for _component, metrics in parsed_metrics.items(): sample = metrics.query_one("libmetrics_build_info") assert "revision" in sample.labels diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 51e7b01eba..a3d02c3f5a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, PortDistributor, + parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -72,9 +73,9 @@ def test_create_snapshot( ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) - pg_bin.run(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) - pg_bin.run(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) - pg_bin.run( + pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) + pg_bin.run_capture( 
["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -352,7 +353,7 @@ def prepare_snapshot( # get git SHA of neon binary def get_neon_version(neon_binpath: Path): out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8") - return out.split("git:", 1)[1].rstrip() + return parse_project_git_version_output(out) def check_neon_works( @@ -404,7 +405,9 @@ def check_neon_works( request.addfinalizer(lambda: cli_current.endpoint_stop("main")) connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" - pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) + pg_bin.run_capture( + ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] + ) initial_dump_differs = dump_differs( repo_dir.parent / "dump.sql", test_output_dir / "dump.sql", @@ -424,7 +427,7 @@ def check_neon_works( shutil.rmtree(repo_dir / "local_fs_remote_storage") timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) pageserver_http.timeline_create(pg_version, tenant_id, timeline_id) - pg_bin.run( + pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) # The assert itself deferred to the end of the test @@ -436,7 +439,7 @@ def check_neon_works( ) # Check that we can interract with the data - pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr]) + pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr]) assert not dump_from_wal_differs, "dump from WAL differs" assert not initial_dump_differs, "initial dump differs" diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py new file mode 100644 index 0000000000..c1832a2063 --- /dev/null +++ b/test_runner/regress/test_duplicate_layers.py @@ -0,0 +1,36 @@ +import time + +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin + + +# Test duplicate layer detection +# +# This test sets fail point at the end of first compaction phase: +# after flushing new L1 layers but before deletion of L0 layers +# it should cause generation of duplicate L1 layer by compaction after restart. 
+@pytest.mark.timeout(600) +def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Use aggressive compaction and checkpoint settings + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + "compaction_period": "5 s", + "compaction_threshold": "3", + } + ) + + pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return")) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + connstr = endpoint.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) + + time.sleep(10) # let compaction to be performed + assert env.pageserver.log_contains("compact-level0-phase1-return-same") + + pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index d38be057d3..18f506cfce 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -54,7 +54,7 @@ async def gc(env: NeonEnv, timeline: TimelineId): # At the same time, run UPDATEs and GC async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): workers = [] - for worker_id in range(num_connections): + for _ in range(num_connections): workers.append(asyncio.create_task(update_table(endpoint))) workers.append(asyncio.create_task(gc(env, timeline))) diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 79453c1bdc..6e2a0622f1 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -1,3 +1,5 @@ +import subprocess + import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin @@ -38,7 +40,7 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) for _ in range(5): - with pytest.raises(Exception): + with pytest.raises(subprocess.SubprocessError): pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) env.pageserver.stop() env.pageserver.start() diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 141c69b230..d35366b467 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -135,11 +135,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Importing empty file fails empty_file = os.path.join(test_output_dir, "empty_file") with open(empty_file, "w") as _: - with pytest.raises(Exception): + with pytest.raises(RuntimeError): import_tar(empty_file, empty_file) # Importing corrupt backup fails - with pytest.raises(Exception): + with pytest.raises(RuntimeError): import_tar(corrupt_base_tar, wal_tar) # A tar with trailing garbage is currently accepted. 
It prints a warnings @@ -149,12 +149,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ".*WARN.*ignored .* unexpected bytes after the tar archive.*" ) - # NOTE: delete can easily come before upload operations are completed - # https://github.com/neondatabase/neon/issues/4326 - env.pageserver.allowed_errors.append( - ".*files not bound to index_file.json, proceeding with their deletion.*" - ) - timeline_delete_wait_completed(client, tenant, timeline) # Importing correct backup works diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py index d2d85a43e0..5ffc12b5b3 100644 --- a/test_runner/regress/test_layer_writers_fail.py +++ b/test_runner/regress/test_layer_writers_fail.py @@ -1,5 +1,6 @@ import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserver +from fixtures.pageserver.http import PageserverApiException @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2703") @@ -77,7 +78,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): pageserver_http.configure_failpoints(("delta-layer-writer-fail-before-finish", "return")) # Note: we cannot test whether the exception is exactly 'delta-layer-writer-fail-before-finish' # since our code does it in loop, we cannot get this exact error for our request. - with pytest.raises(Exception): + with pytest.raises(PageserverApiException): pageserver_http.timeline_checkpoint(tenant_id, timeline_id) new_temp_layer_files = list( diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index fe50969a0a..78635576f1 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -30,7 +30,7 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): # Lock entries using parallel connections in a round-robin fashion. nclients = 20 connections = [] - for i in range(nclients): + for _ in range(nclients): # Do not turn on autocommit. We want to hold the key-share locks. 
conn = endpoint.connect(autocommit=False) connections.append(conn) diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index cd481e69eb..9d24594cb6 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,12 +1,18 @@ +import os +import subprocess +from pathlib import Path from typing import cast +import pytest import requests from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, + parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.types import TenantId, TimelineId @@ -131,3 +137,66 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): # Default stop res = env.neon_cli.raw_cli(["stop"]) res.check_returncode() + + +@skip_on_postgres(PgVersion.V14, reason="does not use postgres") +@pytest.mark.skipif( + os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" +) +def test_parse_project_git_version_output_positive(): + commit = "b6f77b5816cf1dba12a3bc8747941182ce220846" + + positive = [ + # most likely when developing locally + f"Neon CLI git:{commit}-modified", + # when developing locally + f"Neon CLI git:{commit}", + # this is not produced in practice, but the impl supports it + f"Neon CLI git-env:{commit}-modified", + # most likely from CI or docker build + f"Neon CLI git-env:{commit}", + ] + + for example in positive: + assert parse_project_git_version_output(example) == commit + + +@skip_on_postgres(PgVersion.V14, reason="does not use postgres") +@pytest.mark.skipif( + os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" +) +def test_parse_project_git_version_output_local_docker(): + """ + Makes sure the tests don't accept the default version in Dockerfile one gets without providing + a commit lookalike in --build-arg GIT_VERSION=XXX + """ + input = "Neon CLI git-env:local" + + with pytest.raises(ValueError) as e: + parse_project_git_version_output(input) + + assert input in str(e) + + +@skip_on_postgres(PgVersion.V14, reason="does not use postgres") +@pytest.mark.skipif( + os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works" +) +def test_binaries_version_parses(neon_binpath: Path): + """ + Ensures that we can parse the actual outputs of --version from a set of binaries. + + The list is not meant to be exhaustive, and compute_ctl has a different way for example. + """ + + binaries = [ + "neon_local", + "pageserver", + "safekeeper", + "proxy", + "pg_sni_router", + "storage_broker", + ] + for bin in binaries: + out = subprocess.check_output([neon_binpath / bin, "--version"]).decode("utf-8") + parse_project_git_version_output(out) diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 814b9f3de0..9b0bab5125 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -58,12 +58,12 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. 
- for i in range(10): + for _ in range(10): pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) - for j in range(100): + for _ in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") # All (or at least most of) the updates should've been on the same page, so diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index fc93dcffbb..65569f3bac 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -25,7 +25,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() - for i in range(n_restarts): + for _ in range(n_restarts): # Stop the pageserver gracefully and restart it. time.sleep(1) env.pageserver.stop() diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index 577bbc21bf..6f74d50b92 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -6,7 +6,7 @@ from fixtures.neon_fixtures import Endpoint, NeonEnv async def repeat_bytes(buf, repetitions: int): - for i in range(repetitions): + for _ in range(repetitions): yield buf diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 24c5b42b5a..1f6dcd39e9 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,6 +1,6 @@ import json import subprocess -from typing import Any, List +from typing import Any, List, Optional import psycopg2 import pytest @@ -179,7 +179,8 @@ def test_close_on_connections_exit(static_proxy: NeonProxy): def test_sql_over_http(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") - def q(sql: str, params: List[Any] = []) -> Any: + def q(sql: str, params: Optional[List[Any]] = None) -> Any: + params = params or [] connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", @@ -229,7 +230,8 @@ def test_sql_over_http(static_proxy: NeonProxy): def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") - def q(sql: str, raw_text: bool, array_mode: bool, params: List[Any] = []) -> Any: + def q(sql: str, raw_text: bool, array_mode: bool, params: Optional[List[Any]] = None) -> Any: + params = params or [] connstr = ( f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" ) diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 47a06359bb..d695410efc 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -133,7 +133,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Validation page inspect won't allow reading pages of dropped relations") try: c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") - assert False, "query should have failed" + raise AssertionError("query should have failed") except UndefinedTable as e: log.info("Caught an expected failure: {}".format(e)) @@ -157,7 +157,7 @@ def 
test_read_validation_neg(neon_simple_env: NeonEnv):
         c.execute(
             "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))"
         )
-        assert False, "query should have failed"
+        raise AssertionError("query should have failed")
     except UndefinedTable as e:
         log.info("Caught an expected failure: {}".format(e))
 
@@ -169,7 +169,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
             c.execute(
                 "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))"
             )
-            assert False, "query should have failed"
+            raise AssertionError("query should have failed")
         except IoError as e:
             log.info("Caught an expected failure: {}".format(e))
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 9a70266314..13bc01f609 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -598,9 +598,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
         ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
     )
 
-    env.pageserver.allowed_errors.append(
-        ".*files not bound to index_file.json, proceeding with their deletion.*"
-    )
     timeline_delete_wait_completed(client, tenant_id, timeline_id)
 
     assert not timeline_path.exists()
@@ -777,6 +774,95 @@ def test_empty_branch_remote_storage_upload_on_restart(
     create_thread.join()
 
 
+# Regression test for a race condition where files are compacted away before the upload,
+# resulting in the upload complaining about the file not being found
+# https://github.com/neondatabase/neon/issues/4526
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_compaction_delete_before_upload(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_delete_before_upload",
+    )
+
+    env = neon_env_builder.init_start()
+
+    # Create a tenant with a config that will deterministically allow
+    # compaction and disable GC
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # Set a small compaction threshold
+            "compaction_threshold": "3",
+            # Disable GC
+            "gc_period": "0s",
+            # disable PITR
+            "pitr_interval": "0s",
+        }
+    )
+
+    client = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        # Build two tables with some data inside
+        endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+        client.timeline_checkpoint(tenant_id, timeline_id)
+
+        endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)")
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+        # Now make the flushing hang and update one small piece of data
+        client.configure_failpoints(("flush-frozen-before-sync", "pause"))
+
+        endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
+
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+        q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+        barrier = threading.Barrier(2)
+
+        def checkpoint_in_background():
+            barrier.wait()
+            try:
+                client.timeline_checkpoint(tenant_id, timeline_id)
+                q.put(None)
+            except PageserverApiException as e:
+                q.put(e)
+
+        create_thread = threading.Thread(target=checkpoint_in_background)
+        create_thread.start()
+
+        try:
+            barrier.wait()
+
+            time.sleep(4)
client.timeline_compact(tenant_id, timeline_id) + + client.configure_failpoints(("flush-frozen-before-sync", "off")) + + conflict = q.get() + + assert conflict is None + finally: + create_thread.join() + + # Add a delay for the uploads to run into either the file not found or the + time.sleep(4) + + # Ensure that this actually terminates + wait_upload_queue_empty(client, tenant_id, timeline_id) + + # For now we are hitting this message. + # Maybe in the future the underlying race condition will be fixed, + # but until then, ensure that this message is hit instead. + assert env.pageserver.log_contains( + "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." + ) + + def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): diff --git a/test_runner/regress/test_setup.py b/test_runner/regress/test_setup.py index 3d1471621b..02710fc807 100644 --- a/test_runner/regress/test_setup.py +++ b/test_runner/regress/test_setup.py @@ -8,10 +8,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_fixture_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - for i in range(3): + for _ in range(3): env.pageserver.stop() env.pageserver.start() - for i in range(3): + for _ in range(3): env.safekeepers[0].stop() env.safekeepers[0].start() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2ded79954e..6803f6dbb1 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -167,7 +167,7 @@ async def reattach_while_busy( env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId ): workers = [] - for worker_id in range(num_connections): + for _ in range(num_connections): pg_conn = await endpoint.connect_async() workers.append(asyncio.create_task(update_table(pg_conn))) @@ -791,7 +791,7 @@ def test_ignore_while_attaching( pageserver_http.tenant_attach(tenant_id) # Run ignore on the task, thereby cancelling the attach. # XXX This should take priority over attach, i.e., it should cancel the attach task. - # But neither the failpoint, nor the proper storage_sync download functions, + # But neither the failpoint, nor the proper remote_timeline_client download functions, # are sensitive to task_mgr::shutdown. # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . # So, for now, effectively, this ignore here will block until attach task completes. 
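Several test changes in this patch replace blanket `pytest.raises(Exception)` and `assert False, ...` with specific exception types and explicit `raise AssertionError(...) from e`, matching the intent of the bugbear ("B") ruff rules newly enabled in pyproject.toml. Below is a small self-contained sketch of why the narrower form catches more bugs; the `divide` helper is hypothetical and not from this repo:

import pytest


def divide(a: int, b: int) -> float:
    # Hypothetical helper, used only for this illustration.
    return a / b


def test_divide_by_zero_specific():
    # Narrow form: passes only if the expected error type is raised.
    with pytest.raises(ZeroDivisionError):
        divide(1, 0)


def test_divide_by_zero_blind():
    # Broad form: a typo such as `divid(1, 0)` raises NameError, which
    # `Exception` also matches, so the test keeps passing and hides the bug.
    with pytest.raises(Exception):
        divide(1, 0)
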
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9043c29060..2805d56c98 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -80,7 +80,7 @@ def new_pageserver_service( except Exception as e: log.error(e) pageserver_process.kill() - raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") + raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") from e log.info("new pageserver started") try: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 98f9e94276..498563325b 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -94,7 +94,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem # Wait for the remote storage uploads to finish pageserver_http = env.pageserver.http_client() - for tenant, endpoint in tenants_endpoints: + for _tenant, endpoint in tenants_endpoints: res = endpoint.safe_psql_many( ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7c3424cf32..a4c5bf626a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -144,7 +144,7 @@ def test_delete_timeline_post_rm_failure( ps_http.configure_failpoints((failpoint_name, "return")) ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) - timeline_info = wait_until_timeline_state( + wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=env.initial_timeline, @@ -152,7 +152,8 @@ def test_delete_timeline_post_rm_failure( iterations=2, # effectively try immediately and retry once in one second ) - timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" + # FIXME: #4719 + # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*" env.pageserver.allowed_errors.append(at_failpoint_log_message) @@ -326,7 +327,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) - timeline_info = wait_until_timeline_state( + wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=leaf_timeline_id, @@ -334,7 +335,8 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild iterations=2, # effectively try immediately and retry once in one second ) - timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" + # FIXME: #4719 + # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" assert leaf_timeline_path.exists(), "the failpoint didn't work" diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6338f4ca77..cb993c93d2 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -189,7 +189,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): # If we get here, the timeline size limit failed log.error("Query unexpectedly succeeded") - assert False + raise AssertionError() except psycopg2.errors.DiskFull as err: 
log.info(f"Query expectedly failed with: {err}") @@ -284,9 +284,9 @@ def test_timeline_initial_logical_size_calculation_cancellation( # give it some time to settle in the state where it waits for size computation task time.sleep(5) if not delete_timeline_success.empty(): - assert ( - False - ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" + raise AssertionError( + f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" + ) log.info( "resume the size calculation. The failpoint checks that the timeline directory still exists." diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index b1ddd93a40..52f125ce0b 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -32,7 +32,7 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): cur.execute("create table t1(x integer)") cur.execute(f"insert into t1 values (generate_series(1,{n_records}))") cur.execute("vacuum t1") - for i in range(n_iter): + for _ in range(n_iter): cur.execute(f"delete from t1 where x>{n_records//2}") cur.execute("vacuum t1") time.sleep(1) # let pageserver a chance to create image layers diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 5828d4306c..f3a6d09398 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -46,8 +46,9 @@ def wait_lsn_force_checkpoint( timeline_id: TimelineId, endpoint: Endpoint, ps: NeonPageserver, - pageserver_conn_options={}, + pageserver_conn_options=None, ): + pageserver_conn_options = pageserver_conn_options or {} lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") @@ -944,7 +945,7 @@ class SafekeeperEnv: except Exception as e: log.error(e) safekeeper_process.kill() - raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") + raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") from e def get_safekeeper_connstrs(self): assert self.safekeepers is not None, "safekeepers are not initialized" @@ -1137,7 +1138,7 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): collect_stats(endpoint, cur) # generate WAL to simulate normal workload - for i in range(5): + for _ in range(5): generate_wal(cur) collect_stats(endpoint, cur) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index ce33975a0e..bb8ee8f52c 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -392,7 +392,7 @@ async def run_concurrent_computes( break await asyncio.sleep(0.1) else: - assert False, "Timed out while waiting for another query by computes[0]" + raise AssertionError("Timed out while waiting for another query by computes[0]") computes[0].stopped = True await asyncio.gather(background_tasks[0]) @@ -545,7 +545,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat # invalid, to make them unavailable to the endpoint. We use # ports 10, 11 and 12 to simulate unavailable safekeepers. 
config = toml.load(test_output_dir / "repo" / "config") - for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)): + for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk)): if active: config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg else: diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 1144aee166..12c5dc8281 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 1144aee1661c79eec65e784a8dad8bd450d9df79 +Subproject commit 12c5dc8281d20b5bd636e1097eea80a7bc609591 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 1984832c74..e3fbfc4d14 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 1984832c740a7fa0e468bb720f40c525b652835d +Subproject commit e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e diff --git a/vendor/revisions.json b/vendor/revisions.json new file mode 100644 index 0000000000..18da5900a8 --- /dev/null +++ b/vendor/revisions.json @@ -0,0 +1,4 @@ +{ + "postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e", + "postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591" +}
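The new vendor/revisions.json pins the same commits that the postgres-v14 and postgres-v15 submodule updates above point at. Below is a minimal sketch of how such a file could be cross-checked against the actual submodule checkouts; this is a hypothetical helper, not the repository's CI logic:

import json
import subprocess
from pathlib import Path


def check_vendor_revisions(repo_root: Path) -> None:
    # Compare each pinned revision against the commit the submodule is checked out at.
    pinned = json.loads((repo_root / "vendor" / "revisions.json").read_text())
    for name, expected_sha in pinned.items():
        actual_sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"],
            cwd=repo_root / "vendor" / name,
            text=True,
        ).strip()
        assert actual_sha == expected_sha, f"{name}: {actual_sha} != {expected_sha}"


if __name__ == "__main__":
    check_vendor_revisions(Path("."))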