diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml index 415b3d9cc6..c304172ff7 100644 --- a/.github/workflows/_push-to-acr.yml +++ b/.github/workflows/_push-to-acr.yml @@ -52,5 +52,5 @@ jobs: for image in ${images}; do docker buildx imagetools create \ -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ - neondatabase/${image}:${{ inputs.image_tag }} + neondatabase/${image}:${{ inputs.image_tag }} done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7c06fd9ab8..c1ec3f207b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,8 +54,8 @@ jobs: build-tag: ${{steps.build-tag.outputs.tag}} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -159,6 +159,10 @@ jobs: # This will catch compiler & clippy warnings in all feature combinations. # TODO: use cargo hack for build and test as well, but, that's quite expensive. # NB: keep clippy args in sync with ./run_clippy.sh + # + # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, + # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second + # time just for that, so skip "clippy --release". - run: | CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" if [ "$CLIPPY_COMMON_ARGS" = "" ]; then @@ -168,8 +172,6 @@ jobs: echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - name: Run cargo clippy (debug) run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation run: cargo doc --workspace --no-deps --document-private-items @@ -357,6 +359,7 @@ jobs: }) coverage-report: + if: ${{ !startsWith(github.ref_name, 'release') }} needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: @@ -373,8 +376,8 @@ jobs: coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }} coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` for differential coverage (to get diff between two commits) + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -475,11 +478,9 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - name: Checkout - uses: actions/checkout@v4 + - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 0 - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 @@ -554,11 +555,9 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - name: Checkout - uses: actions/checkout@v4 + - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 0 - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 @@ -705,10 +704,7 @@ jobs: VM_BUILDER_VERSION: v0.29.3 steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - name: Downloading vm-builder run: | @@ -748,10 +744,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 @@ -957,6 +950,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() runs-on: [ self-hosted, small ] @@ -976,10 +970,7 @@ jobs: git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - name: Trigger deploy workflow env: @@ -1058,7 +1049,8 @@ jobs: # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] - if: github.ref_name == 'release' + # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` + if: github.ref_name == 'release' && !failure() && !cancelled() runs-on: ubuntu-22.04 steps: diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 6fbe785c56..b299cf9b99 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -34,8 +34,8 @@ jobs: build-tag: ${{ steps.build-tag.outputs.tag }} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/Cargo.lock b/Cargo.lock index 3ca6acbc3e..136f07956f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1209,7 +1209,6 @@ dependencies = [ "remote_storage", "serde", "serde_json", - "serde_with", "utils", ] @@ -1218,7 +1217,6 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "bytes", "cfg-if", "chrono", @@ -1237,7 +1235,6 @@ dependencies = [ "reqwest 0.12.4", "rlimit", "rust-ini", - "serde", "serde_json", "signal-hook", "tar", @@ -1246,7 +1243,6 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1317,12 +1313,9 @@ dependencies = [ name = "consumption_metrics" version = "0.1.0" dependencies = [ - "anyhow", "chrono", "rand 0.8.5", "serde", - "serde_with", - "utils", ] [[package]] @@ -1334,9 +1327,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", - "futures", "git-version", - "hex", "humantime", "humantime-serde", "hyper 0.14.26", @@ -1344,7 +1335,6 @@ dependencies = [ "once_cell", "pageserver_api", "pageserver_client", - "postgres", "postgres_backend", "postgres_connection", "regex", @@ -1353,9 +1343,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "serde_with", "storage_broker", - "tar", "thiserror", "tokio", "tokio-postgres", @@ -1663,7 +1651,6 @@ dependencies = [ "hex", "parking_lot 0.12.1", "rand 0.8.5", - "scopeguard", "smallvec", "tracing", "utils", @@ -2233,24 +2220,22 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git-version" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899" +checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19" dependencies = [ "git-version-macro", - "proc-macro-hack", ] [[package]] name = "git-version-macro" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f" +checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ - "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -2744,19 +2729,6 @@ dependencies = [ "libc", ] -[[package]] -name = "inotify" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" -dependencies = [ - "bitflags 1.3.2", - "futures-core", - "inotify-sys", - "libc", - "tokio", -] - [[package]] name = "inotify-sys" version = "0.1.5" @@ -3251,7 +3223,7 @@ dependencies = [ "crossbeam-channel", "filetime", "fsevent-sys", - "inotify 0.9.6", + "inotify", "kqueue", "libc", "log", @@ -3642,7 +3614,6 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", - "bytes", "camino", "clap", "git-version", @@ -3651,7 +3622,6 @@ dependencies = [ "pageserver_api", "postgres_ffi", "remote_storage", - "serde", "serde_json", "svg_fmt", "thiserror", @@ -3670,7 +3640,6 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", - "async-trait", "bit_field", "byteorder", "bytes", @@ -3678,16 +3647,13 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "const_format", "consumption_metrics", "crc32c", "criterion", - "crossbeam-utils", "either", "enum-map", "enumset", "fail", - "flate2", "futures", "git-version", "hex", @@ -3726,13 +3692,9 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_with", - "signal-hook", - "smallvec", "storage_broker", "strum", "strum_macros", - "svg_fmt", - "sync_wrapper", "sysinfo", "tenant_size_model", "thiserror", @@ -3746,7 +3708,6 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", - "twox-hash", "url", "utils", "walkdir", @@ -3810,44 +3771,22 @@ name = "pageserver_compaction" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "async-stream", - "byteorder", - "bytes", - "chrono", "clap", - "const_format", - "consumption_metrics", "criterion", - "crossbeam-utils", - "either", - "fail", - "flate2", "futures", "git-version", - "hex", "hex-literal", - "humantime", - "humantime-serde", "itertools 0.10.5", - "metrics", "once_cell", "pageserver_api", "pin-project-lite", "rand 0.8.5", - "smallvec", "svg_fmt", - "sync_wrapper", - "thiserror", "tokio", - "tokio-io-timeout", - "tokio-util", "tracing", - "tracing-error", "tracing-subscriber", - "url", "utils", - "walkdir", "workspace_hack", ] @@ -4164,9 +4103,7 @@ name = "postgres_backend" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", - "futures", "once_cell", "pq_proto", "rustls 0.22.4", @@ -4199,16 +4136,13 @@ version = "0.1.0" dependencies = [ "anyhow", "bindgen", - "byteorder", "bytes", "crc32c", "env_logger", - "hex", "log", "memoffset 0.8.0", "once_cell", "postgres", - "rand 0.8.5", "regex", "serde", "thiserror", @@ -4243,13 +4177,11 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "pin-project-lite", "postgres-protocol", "rand 0.8.5", "serde", "thiserror", "tokio", - "tracing", ] [[package]] @@ -4281,12 +4213,6 @@ dependencies = [ "elliptic-curve 0.13.8", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" version = "1.0.78" @@ -4405,7 +4331,6 @@ dependencies = [ "aws-config", "aws-sdk-iam", "aws-sigv4", - "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -4414,7 +4339,6 @@ dependencies = [ "chrono", "clap", "consumption_metrics", - "crossbeam-deque", "dashmap", "ecdsa 0.16.9", "env_logger", @@ -4440,11 +4364,9 @@ dependencies = [ "jose-jwa", "jose-jwk", "lasso", - "md5", "measured", "metrics", "once_cell", - "opentelemetry", "p256 0.13.2", "parking_lot 0.12.1", "parquet", @@ -4465,7 +4387,6 @@ dependencies = [ "reqwest-middleware", "reqwest-retry", "reqwest-tracing", - "routerify", "rsa", "rstest", "rustc-hash", @@ -4481,7 +4402,6 @@ dependencies = [ "smol_str", "socket2 0.5.5", "subtle", - "task-local-extensions", "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -4491,7 +4411,6 @@ dependencies = [ "tokio-rustls 0.25.0", "tokio-tungstenite", "tokio-util", - "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4781,7 +4700,6 @@ dependencies = [ "async-stream", "async-trait", "aws-config", - "aws-credential-types", "aws-sdk-s3", "aws-smithy-async", "aws-smithy-types", @@ -4795,7 +4713,6 @@ dependencies = [ "futures", "futures-util", "http-types", - "humantime", "humantime-serde", "hyper 0.14.26", "itertools 0.10.5", @@ -5275,14 +5192,12 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", - "async-trait", "byteorder", "bytes", "camino", "camino-tempfile", "chrono", "clap", - "const_format", "crc32c", "desim", "fail", @@ -5308,9 +5223,7 @@ dependencies = [ "sd-notify", "serde", "serde_json", - "serde_with", "sha2", - "signal-hook", "storage_broker", "strum", "strum_macros", @@ -5321,7 +5234,6 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", "tracing", "tracing-subscriber", "url", @@ -5336,7 +5248,6 @@ version = "0.1.0" dependencies = [ "const_format", "serde", - "serde_with", "utils", ] @@ -5865,7 +5776,6 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", - "bytes", "clap", "const_format", "futures", @@ -5879,7 +5789,6 @@ dependencies = [ "parking_lot 0.12.1", "prost", "tokio", - "tokio-stream", "tonic", "tonic-build", "tracing", @@ -5892,9 +5801,7 @@ name = "storage_controller" version = "0.1.0" dependencies = [ "anyhow", - "aws-config", "bytes", - "camino", "chrono", "clap", "control_plane", @@ -5935,20 +5842,9 @@ dependencies = [ name = "storage_controller_client" version = "0.1.0" dependencies = [ - "anyhow", - "bytes", - "futures", - "pageserver_api", "pageserver_client", - "postgres", "reqwest 0.12.4", "serde", - "thiserror", - "tokio", - "tokio-postgres", - "tokio-stream", - "tokio-util", - "utils", "workspace_hack", ] @@ -5960,13 +5856,9 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", "camino", "chrono", "clap", - "crc32c", "either", "futures", "futures-util", @@ -5978,20 +5870,16 @@ dependencies = [ "pageserver", "pageserver_api", "postgres_ffi", - "rand 0.8.5", "remote_storage", "reqwest 0.12.4", "rustls 0.22.4", "rustls-native-certs 0.7.0", "serde", "serde_json", - "serde_with", "storage_controller_client", - "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", "tokio-stream", "tokio-util", "tracing", @@ -6010,14 +5898,11 @@ dependencies = [ "comfy-table", "futures", "humantime", - "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest 0.12.4", - "serde", "serde_json", "storage_controller_client", - "thiserror", "tokio", "tracing", "utils", @@ -6140,15 +6025,6 @@ dependencies = [ "xattr", ] -[[package]] -name = "task-local-extensions" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" -dependencies = [ - "pin-utils", -] - [[package]] name = "tempfile" version = "3.9.0" @@ -6739,7 +6615,6 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6943,7 +6818,6 @@ dependencies = [ "serde_assert", "serde_json", "serde_path_to_error", - "serde_with", "signal-hook", "strum", "strum_macros", @@ -6999,13 +6873,11 @@ dependencies = [ "cgroups-rs", "clap", "futures", - "inotify 0.10.2", "serde", "serde_json", "sysinfo", "tokio", "tokio-postgres", - "tokio-stream", "tokio-util", "tracing", "tracing-subscriber", @@ -7032,7 +6904,6 @@ dependencies = [ "clap", "env_logger", "log", - "once_cell", "postgres", "postgres_ffi", "regex", @@ -7555,6 +7426,7 @@ dependencies = [ "digest", "either", "fail", + "futures", "futures-channel", "futures-executor", "futures-io", @@ -7610,6 +7482,8 @@ dependencies = [ "tower", "tracing", "tracing-core", + "tracing-log", + "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 6e2510fe60..6bf6fb650f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -55,22 +55,27 @@ RUN cd postgres && \ # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it here. - old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ - # the first loop is for pg_stat_statement extension version <= 1.6 for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ - if echo "$old_list" | grep -q -F "$filename"; then \ + # Note that there are no downgrade scripts for pg_stat_statements, so we \ + # don't have to modify any downgrade paths or (much) older versions: we only \ + # have to make sure every creation of the pg_stat_statements_reset function \ + # also adds execute permissions to the neon_superuser. + case $filename in \ + pg_stat_statements--1.4.sql) \ + # pg_stat_statements_reset is first created with 1.4 echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ - fi; \ - done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, - # where pg_stat_statement_reset() got 3 additional arguments - for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ - filename=$(basename "$file"); \ - if ! echo "$old_list" | grep -q -F "$filename"; then \ + ;; \ + pg_stat_statements--1.6--1.7.sql) \ + # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ - fi; \ - done + ;; \ + pg_stat_statements--1.10--1.11.sql) \ + # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \ + ;; \ + esac; \ + done; ######################################################################################### # diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8af0ed43ce..00a82e4be6 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,7 +11,6 @@ testing = [] [dependencies] anyhow.workspace = true -async-compression.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true @@ -24,7 +23,6 @@ num_cpus.workspace = true opentelemetry.workspace = true postgres.workspace = true regex.workspace = true -serde.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true @@ -43,7 +41,6 @@ url.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true -toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql new file mode 100644 index 0000000000..425ed8cd3d --- /dev/null +++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 6a87263821..aa9405d28d 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!( "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), + include_str!( + "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" + ), ]; MigrationRunner::new(client, &migrations).run_migrations()?; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 6fca59b368..c185d20484 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,13 +9,10 @@ anyhow.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true -futures.workspace = true git-version.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true -postgres.workspace = true -hex.workspace = true humantime-serde.workspace = true hyper.workspace = true regex.workspace = true @@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] } scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true -tar.workspace = true thiserror.workspace = true toml.workspace = true toml_edit.workspace = true diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 619c5bce3e..94a072e394 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -151,7 +151,7 @@ where print!("."); io::stdout().flush().unwrap(); } - thread::sleep(RETRY_INTERVAL); + tokio::time::sleep(RETRY_INTERVAL).await; } Err(e) => { println!("error starting process {process_name:?}: {e:#}"); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 144cd647c9..92f609761a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -34,12 +34,14 @@ use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; +use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; use std::time::Duration; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; +use tokio::task::JoinSet; use url::Host; use utils::{ auth::{Claims, Scope}, @@ -87,34 +89,35 @@ fn main() -> Result<()> { // Check for 'neon init' command first. let subcommand_result = if sub_name == "init" { - handle_init(sub_args).map(Some) + handle_init(sub_args).map(|env| Some(Cow::Owned(env))) } else { // all other commands need an existing config - let mut env = - LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; - let original_env = env.clone(); + let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; + let original_env = env.clone(); + let env = Box::leak(Box::new(env)); let rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap(); let subcommand_result = match sub_name { - "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), - "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))), - "stop" => rt.block_on(handle_stop_all(sub_args, &env)), - "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), - "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), - "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), - "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), - "mappings" => handle_mappings(sub_args, &mut env), + "tenant" => rt.block_on(handle_tenant(sub_args, env)), + "timeline" => rt.block_on(handle_timeline(sub_args, env)), + "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))), + "stop" => rt.block_on(handle_stop_all(sub_args, env)), + "pageserver" => rt.block_on(handle_pageserver(sub_args, env)), + "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)), + "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)), + "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)), + "endpoint" => rt.block_on(handle_endpoint(sub_args, env)), + "mappings" => handle_mappings(sub_args, env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), }; - if original_env != env { - subcommand_result.map(|()| Some(env)) + if &original_env != env { + subcommand_result.map(|()| Some(Cow::Borrowed(env))) } else { subcommand_result.map(|()| None) } @@ -1245,49 +1248,122 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } +async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match sub_match.subcommand() { + Some(broker_command_data) => broker_command_data, + None => bail!("no broker subcommand provided"), + }; + + match sub_name { + "start" => { + if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await { + eprintln!("broker start failed: {e}"); + exit(1); + } + } + + "stop" => { + if let Err(e) = broker::stop_broker_process(env) { + eprintln!("broker stop failed: {e}"); + exit(1); + } + } + + _ => bail!("Unexpected broker subcommand '{}'", sub_name), + } + Ok(()) +} + async fn handle_start_all( - env: &local_env::LocalEnv, + env: &'static local_env::LocalEnv, retry_timeout: &Duration, ) -> anyhow::Result<()> { + let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else { + neon_start_status_check(env, retry_timeout) + .await + .context("status check after successful startup of all services")?; + return Ok(()); + }; + + eprintln!("startup failed because one or more services could not be started"); + + for e in errors { + eprintln!("{e}"); + let debug_repr = format!("{e:?}"); + for line in debug_repr.lines() { + eprintln!(" {line}"); + } + } + + try_stop_all(env, true).await; + + exit(2); +} + +/// Returns Ok() if and only if all services could be started successfully. +/// Otherwise, returns the list of errors that occurred during startup. +async fn handle_start_all_impl( + env: &'static local_env::LocalEnv, + retry_timeout: Duration, +) -> Result<(), Vec> { // Endpoints are not started automatically - broker::start_broker_process(env, retry_timeout).await?; + let mut js = JoinSet::new(); - // Only start the storage controller if the pageserver is configured to need it - if env.control_plane_api.is_some() { - let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller - .start(NeonStorageControllerStartArgs::with_default_instance_id( - (*retry_timeout).into(), - )) - .await - { - eprintln!("storage_controller start failed: {:#}", e); - try_stop_all(env, true).await; - exit(1); + // force infalliblity through closure + #[allow(clippy::redundant_closure_call)] + (|| { + js.spawn(async move { + let retry_timeout = retry_timeout; + broker::start_broker_process(env, &retry_timeout).await + }); + + // Only start the storage controller if the pageserver is configured to need it + if env.control_plane_api.is_some() { + js.spawn(async move { + let storage_controller = StorageController::from_env(env); + storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + retry_timeout.into(), + )) + .await + .map_err(|e| e.context("start storage_controller")) + }); + } + + for ps_conf in &env.pageservers { + js.spawn(async move { + let pageserver = PageServerNode::from_env(env, ps_conf); + pageserver + .start(&retry_timeout) + .await + .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id))) + }); + } + + for node in env.safekeepers.iter() { + js.spawn(async move { + let safekeeper = SafekeeperNode::from_env(env, node); + safekeeper + .start(vec![], &retry_timeout) + .await + .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) + }); + } + })(); + + let mut errors = Vec::new(); + while let Some(result) = js.join_next().await { + let result = result.expect("we don't panic or cancel the tasks"); + if let Err(e) = result { + errors.push(e); } } - for ps_conf in &env.pageservers { - let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start(retry_timeout).await { - eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true).await; - exit(1); - } + if !errors.is_empty() { + return Err(errors); } - for node in env.safekeepers.iter() { - let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![], retry_timeout).await { - eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false).await; - exit(1); - } - } - - neon_start_status_check(env, retry_timeout).await?; - Ok(()) } @@ -1672,6 +1748,19 @@ fn cli() -> Command { .arg(stop_mode_arg.clone()) .arg(instance_id)) ) + .subcommand( + Command::new("storage_broker") + .arg_required_else_help(true) + .about("Manage broker") + .subcommand(Command::new("start") + .about("Start broker") + .arg(timeout_arg.clone()) + ) + .subcommand(Command::new("stop") + .about("Stop broker") + .arg(stop_mode_arg.clone()) + ) + ) .subcommand( Command::new("safekeeper") .arg_required_else_help(true) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 9f879c4b08..7554a03a68 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -702,7 +702,7 @@ impl Endpoint { } } } - std::thread::sleep(ATTEMPT_INTERVAL); + tokio::time::sleep(ATTEMPT_INTERVAL).await; } // disarm the scopeguard, let the child outlive this function (and neon_local invoction) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 33ca70af96..cae9416af6 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,9 +17,7 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{ - self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, -}; +use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; @@ -324,22 +322,6 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub async fn page_server_psql_client( - &self, - ) -> anyhow::Result<( - tokio_postgres::Client, - tokio_postgres::Connection, - )> { - let mut config = self.pg_connection_config.clone(); - if self.conf.pg_auth_type == AuthType::NeonJWT { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - config = config.set_password(Some(token)); - } - Ok(config.connect_no_tls().await?) - } - pub async fn check_status(&self) -> mgmt_api::Result<()> { self.http_client.status().await } @@ -540,19 +522,6 @@ impl PageServerNode { Ok(()) } - pub async fn location_config( - &self, - tenant_shard_id: TenantShardId, - config: LocationConfig, - flush_ms: Option, - lazy: bool, - ) -> anyhow::Result<()> { - Ok(self - .http_client - .location_config(tenant_shard_id, config, flush_ms, lazy) - .await?) - } - pub async fn timeline_list( &self, tenant_shard_id: &TenantShardId, @@ -636,14 +605,4 @@ impl PageServerNode { Ok(()) } - - pub async fn tenant_synthetic_size( - &self, - tenant_shard_id: TenantShardId, - ) -> anyhow::Result { - Ok(self - .http_client - .tenant_synthetic_size(tenant_shard_id) - .await?) - } } diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 638575eb82..5aee12dc97 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -4,13 +4,10 @@ /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just /// enough to extract a few settings we need in Neon, assuming you don't do /// funny stuff like include-directives or funny escaping. -use anyhow::{bail, Context, Result}; use once_cell::sync::Lazy; use regex::Regex; use std::collections::HashMap; use std::fmt; -use std::io::BufRead; -use std::str::FromStr; /// In-memory representation of a postgresql.conf file #[derive(Default, Debug)] @@ -19,84 +16,16 @@ pub struct PostgresConf { hash: HashMap, } -static CONF_LINE_RE: Lazy = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap()); - impl PostgresConf { pub fn new() -> PostgresConf { PostgresConf::default() } - /// Read file into memory - pub fn read(read: impl std::io::Read) -> Result { - let mut result = Self::new(); - - for line in std::io::BufReader::new(read).lines() { - let line = line?; - - // Store each line in a vector, in original format - result.lines.push(line.clone()); - - // Also parse each line and insert key=value lines into a hash map. - // - // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL. - // But it's close enough for our usage. - let line = line.trim(); - if line.starts_with('#') { - // comment, ignore - continue; - } else if let Some(caps) = CONF_LINE_RE.captures(line) { - let name = caps.get(1).unwrap().as_str(); - let raw_val = caps.get(2).unwrap().as_str(); - - if let Ok(val) = deescape_str(raw_val) { - // Note: if there's already an entry in the hash map for - // this key, this will replace it. That's the behavior what - // we want; when PostgreSQL reads the file, each line - // overrides any previous value for the same setting. - result.hash.insert(name.to_string(), val.to_string()); - } - } - } - Ok(result) - } - /// Return the current value of 'option' pub fn get(&self, option: &str) -> Option<&str> { self.hash.get(option).map(|x| x.as_ref()) } - /// Return the current value of a field, parsed to the right datatype. - /// - /// This calls the FromStr::parse() function on the value of the field. If - /// the field does not exist, or parsing fails, returns an error. - /// - pub fn parse_field(&self, field_name: &str, context: &str) -> Result - where - T: FromStr, - ::Err: std::error::Error + Send + Sync + 'static, - { - self.get(field_name) - .with_context(|| format!("could not find '{}' option {}", field_name, context))? - .parse::() - .with_context(|| format!("could not parse '{}' option {}", field_name, context)) - } - - pub fn parse_field_optional(&self, field_name: &str, context: &str) -> Result> - where - T: FromStr, - ::Err: std::error::Error + Send + Sync + 'static, - { - if let Some(val) = self.get(field_name) { - let result = val - .parse::() - .with_context(|| format!("could not parse '{}' option {}", field_name, context))?; - - Ok(Some(result)) - } else { - Ok(None) - } - } - /// /// Note: if you call this multiple times for the same option, the config /// file will a line for each call. It would be nice to have a function @@ -154,48 +83,8 @@ fn escape_str(s: &str) -> String { } } -/// De-escape a possibly-quoted value. -/// -/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL -/// does this. -fn deescape_str(s: &str) -> Result { - // If the string has a quote at the beginning and end, strip them out. - if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') { - let mut result = String::new(); - - let mut iter = s[1..(s.len() - 1)].chars().peekable(); - while let Some(c) = iter.next() { - let newc = if c == '\\' { - match iter.next() { - Some('b') => '\x08', - Some('f') => '\x0c', - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('0'..='7') => { - // TODO - bail!("octal escapes not supported"); - } - Some(n) => n, - None => break, - } - } else if c == '\'' && iter.peek() == Some(&'\'') { - // doubled quote becomes just one quote - iter.next().unwrap() - } else { - c - }; - - result.push(newc); - } - Ok(result) - } else { - Ok(s.to_string()) - } -} - #[test] -fn test_postgresql_conf_escapes() -> Result<()> { +fn test_postgresql_conf_escapes() -> anyhow::Result<()> { assert_eq!(escape_str("foo bar"), "'foo bar'"); // these don't need to be quoted assert_eq!(escape_str("foo"), "foo"); @@ -214,13 +103,5 @@ fn test_postgresql_conf_escapes() -> Result<()> { assert_eq!(escape_str("fo\\o"), "'fo\\\\o'"); assert_eq!(escape_str("10 cats"), "'10 cats'"); - // Test de-escaping - assert_eq!(deescape_str(&escape_str("foo"))?, "foo"); - assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r"); - assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t"); - - // octal-escapes are currently not supported - assert!(deescape_str("'foo\\7\\07\\007'").is_err()); - Ok(()) } diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index be69208d0d..ce89116691 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -11,14 +11,11 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true humantime.workspace = true -hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true reqwest.workspace = true -serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } storage_controller_client.workspace = true -thiserror.workspace = true tokio.workspace = true tracing.workspace = true utils.workspace = true diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 8aaa481f8c..c0ec40a6c2 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true anyhow.workspace = true chrono.workspace = true serde.workspace = true -serde_with.workspace = true serde_json.workspace = true regex.workspace = true diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index a40b74b952..0e517e3856 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -5,9 +5,6 @@ edition = "2021" license = "Apache-2.0" [dependencies] -anyhow.workspace = true chrono = { workspace = true, features = ["serde"] } rand.workspace = true serde.workspace = true -serde_with.workspace = true -utils.workspace = true diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 810196aff6..fbe2e6830f 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -5,7 +5,7 @@ use chrono::{DateTime, Utc}; use rand::Rng; use serde::{Deserialize, Serialize}; -#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] #[serde(tag = "type")] pub enum EventType { #[serde(rename = "absolute")] @@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000; // Just a wrapper around a slice of events // to serialize it as `{"events" : [ ] } -#[derive(serde::Serialize, serde::Deserialize)] +#[derive(serde::Serialize, Deserialize)] pub struct EventChunk<'a, T: Clone> { pub events: std::borrow::Cow<'a, [T]>, } diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml index 0c4be90267..473f3a2a13 100644 --- a/libs/desim/Cargo.toml +++ b/libs/desim/Cargo.toml @@ -12,5 +12,4 @@ bytes.workspace = true utils.workspace = true parking_lot.workspace = true hex.workspace = true -scopeguard.workspace = true smallvec = { workspace = true, features = ["write"] } diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1194ee93ef..61e32bc9ab 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -173,40 +173,6 @@ impl Default for EvictionOrder { } } -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetVectoredImpl { - Sequential, - Vectored, -} - -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetImpl { - Legacy, - Vectored, -} - #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); @@ -338,8 +304,6 @@ pub mod defaults { pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Zstd { level: Some(1) }; - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; - pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; @@ -376,7 +340,10 @@ impl Default for ConfigToml { concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(), + concurrent_tenant_size_logical_size_queries: NonZeroUsize::new( + DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES, + ) + .unwrap(), metric_collection_interval: (humantime::parse_duration( DEFAULT_METRIC_COLLECTION_INTERVAL, ) @@ -467,8 +434,6 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } impl Default for TenantConfigToml { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 45e84baa1f..c9be53f0b0 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, } -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] pub enum L0FlushConfig { #[serde(rename_all = "snake_case")] diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index f6854328fc..a0c87263ed 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -5,10 +5,8 @@ edition.workspace = true license.workspace = true [dependencies] -async-trait.workspace = true anyhow.workspace = true bytes.workspace = true -futures.workspace = true rustls.workspace = true serde.workspace = true thiserror.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8ea4b93fb1..e274d24585 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -280,16 +280,6 @@ pub struct PostgresBackend { pub type PostgresBackendTCP = PostgresBackend; -pub fn query_from_cstring(query_string: Bytes) -> Vec { - let mut query_string = query_string.to_vec(); - if let Some(ch) = query_string.last() { - if *ch == 0 { - query_string.pop(); - } - } - query_string -} - /// Cast a byte slice to a string slice, dropping null terminator if there's one. fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> { let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index ee69878f69..ef17833a48 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -5,13 +5,10 @@ edition.workspace = true license.workspace = true [dependencies] -rand.workspace = true regex.workspace = true bytes.workspace = true -byteorder.workspace = true anyhow.workspace = true crc32c.workspace = true -hex.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 61b49a634d..497d011d7a 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -9,8 +9,8 @@ //! comments on them. //! +use crate::PageHeaderData; use crate::BLCKSZ; -use crate::{PageHeaderData, XLogRecord}; // // From pg_tablespace_d.h @@ -194,8 +194,6 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = size_of::() as u32; - // // from xlogrecord.h // @@ -219,8 +217,6 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; pub const INVALID_TRANSACTION_ID: u32 = 0; -pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; -pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; /* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0cfd56962e..a636bd2a97 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -26,11 +26,12 @@ use bytes::{Buf, Bytes}; use log::*; use serde::Serialize; +use std::ffi::OsStr; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; use std::io::SeekFrom; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::time::SystemTime; use utils::bin_ser::DeserializeError; use utils::bin_ser::SerializeError; @@ -78,19 +79,34 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize ) } -pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { - let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); - let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; - let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; - (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) +pub fn XLogFromFileName( + fname: &OsStr, + wal_seg_size: usize, +) -> anyhow::Result<(XLogSegNo, TimeLineID)> { + if let Some(fname_str) = fname.to_str() { + let tli = u32::from_str_radix(&fname_str[0..8], 16)?; + let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo; + let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo; + Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)) + } else { + anyhow::bail!("non-ut8 filename: {:?}", fname); + } } -pub fn IsXLogFileName(fname: &str) -> bool { - return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); +pub fn IsXLogFileName(fname: &OsStr) -> bool { + if let Some(fname) = fname.to_str() { + fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()) + } else { + false + } } -pub fn IsPartialXLogFileName(fname: &str) -> bool { - fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) +pub fn IsPartialXLogFileName(fname: &OsStr) -> bool { + if let Some(fname) = fname.to_str() { + fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8])) + } else { + false + } } /// If LSN points to the beginning of the page, then shift it to first record, @@ -260,13 +276,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result> { } } -pub fn main() { - let mut data_dir = PathBuf::new(); - data_dir.push("."); - let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap(); - println!("wal_end={:?}", wal_end); -} - impl XLogRecord { pub fn from_slice(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 29dd01a936..14c7d2e340 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -9,7 +9,6 @@ anyhow.workspace = true clap.workspace = true env_logger.workspace = true log.workspace = true -once_cell.workspace = true postgres.workspace = true postgres_ffi.workspace = true camino-tempfile.workspace = true diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 949e3f4251..5c0abda522 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -7,6 +7,7 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use postgres_ffi::{ XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; +use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -26,7 +27,6 @@ macro_rules! xlog_utils_test { postgres_ffi::for_all_postgres_versions! { xlog_utils_test } -#[derive(Debug, Clone, PartialEq, Eq)] pub struct Conf { pub pg_version: u32, pub pg_distrib_dir: PathBuf, @@ -136,8 +136,8 @@ impl Conf { pub fn pg_waldump( &self, - first_segment_name: &str, - last_segment_name: &str, + first_segment_name: &OsStr, + last_segment_name: &OsStr, ) -> anyhow::Result { let first_segment_file = self.datadir.join(first_segment_name); let last_segment_file = self.datadir.join(last_segment_name); diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 79d45de67a..9eb3f0e95a 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -4,6 +4,7 @@ use super::*; use crate::{error, info}; use regex::Regex; use std::cmp::min; +use std::ffi::OsStr; use std::fs::{self, File}; use std::io::Write; use std::{env, str::FromStr}; @@ -54,7 +55,7 @@ fn test_end_of_wal(test_name: &str) { .wal_dir() .read_dir() .unwrap() - .map(|f| f.unwrap().file_name().into_string().unwrap()) + .map(|f| f.unwrap().file_name()) .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); @@ -70,11 +71,11 @@ fn test_end_of_wal(test_name: &str) { start_lsn ); for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { - let fname = file.file_name().into_string().unwrap(); + let fname = file.file_name(); if !IsXLogFileName(&fname) { continue; } - let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); + let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap(); let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); if seg_start_lsn > u64::from(*start_lsn) { continue; @@ -93,10 +94,10 @@ fn test_end_of_wal(test_name: &str) { } } -fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { +fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg - .pg_waldump("000000010000000000000001", last_segment) + .pg_waldump(OsStr::new("000000010000000000000001"), last_segment) .unwrap() .stderr; let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); @@ -117,7 +118,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { fn check_end_of_wal( cfg: &crate::Conf, - last_segment: &str, + last_segment: &OsStr, start_lsn: Lsn, expected_end_of_wal: Lsn, ) { @@ -132,7 +133,8 @@ fn check_end_of_wal( // Rename file to partial to actually find last valid lsn, then rename it back. fs::rename( cfg.wal_dir().join(last_segment), - cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir() + .join(format!("{}.partial", last_segment.to_str().unwrap())), ) .unwrap(); let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); @@ -142,7 +144,8 @@ fn check_end_of_wal( ); assert_eq!(wal_end, expected_end_of_wal); fs::rename( - cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir() + .join(format!("{}.partial", last_segment.to_str().unwrap())), cfg.wal_dir().join(last_segment), ) .unwrap(); diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 66bbe03ebc..9524a1490d 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -8,10 +8,8 @@ license.workspace = true bytes.workspace = true byteorder.workspace = true itertools.workspace = true -pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true tokio = { workspace = true, features = ["io-util"] } -tracing.workspace = true thiserror.workspace = true serde.workspace = true diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 02adee058f..f48f1801a4 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,14 +13,11 @@ aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true -aws-credential-types.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } -humantime.workspace = true humantime-serde.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true -rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index b5b69c9faf..45267ccda9 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -127,10 +127,6 @@ impl RemotePath { &self.0 } - pub fn extension(&self) -> Option<&str> { - self.0.extension() - } - pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index e1f4bcca46..14811232d3 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -6,6 +6,5 @@ license.workspace = true [dependencies] serde.workspace = true -serde_with.workspace = true const_format.workspace = true utils.workspace = true diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 5ea8db6b42..05eb538d42 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -9,8 +9,9 @@ hyper.workspace = true opentelemetry = { workspace = true, features=["rt-tokio"] } opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true -reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true -tracing-subscriber.workspace = true + +[dev-dependencies] +tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 19deaab63f..f199b15554 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -42,7 +42,6 @@ tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true -serde_with.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true diff --git a/libs/utils/src/accum.rs b/libs/utils/src/accum.rs deleted file mode 100644 index 0fb0190a92..0000000000 --- a/libs/utils/src/accum.rs +++ /dev/null @@ -1,33 +0,0 @@ -/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you -/// feed the accumulated values by calling the 'accum' function, instead of having an -/// iterator. -/// -/// For example, to calculate the smallest value among some integers: -/// -/// ``` -/// use utils::accum::Accum; -/// -/// let values = [1, 2, 3]; -/// -/// let mut min_value: Accum = Accum(None); -/// for new_value in &values { -/// min_value.accum(std::cmp::min, *new_value); -/// } -/// -/// assert_eq!(min_value.0.unwrap(), 1); -/// ``` -pub struct Accum(pub Option); -impl Accum { - pub fn accum(&mut self, func: F, new_value: T) - where - F: FnOnce(T, T) -> T, - { - // If there is no previous value, just store the new value. - // Otherwise call the function to decide which one to keep. - self.0 = Some(if let Some(accum) = self.0 { - func(accum, new_value) - } else { - new_value - }); - } -} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 3d863a6518..5e05e4e713 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -82,7 +82,7 @@ impl ApiError { StatusCode::INTERNAL_SERVER_ERROR, ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( - err.to_string(), + format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace StatusCode::INTERNAL_SERVER_ERROR, ), } diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 2cda899b15..eb91839504 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id { } impl Id { - pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { - let mut arr = [0u8; 16]; - buf.copy_to_slice(&mut arr); - Id::from(arr) - } - pub fn from_slice(src: &[u8]) -> Result { if src.len() != 16 { return Err(IdError::SliceParseError(src.len())); @@ -179,10 +173,6 @@ impl fmt::Debug for Id { macro_rules! id_newtype { ($t:ident) => { impl $t { - pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t { - $t(Id::get_from_buf(buf)) - } - pub fn from_slice(src: &[u8]) -> Result<$t, IdError> { Ok($t(Id::from_slice(src)?)) } diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs index a120dc0ac5..0cc58738c0 100644 --- a/libs/utils/src/leaky_bucket.rs +++ b/libs/utils/src/leaky_bucket.rs @@ -21,7 +21,13 @@ //! //! Another explaination can be found here: -use std::{sync::Mutex, time::Duration}; +use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Mutex, + }, + time::Duration, +}; use tokio::{sync::Notify, time::Instant}; @@ -128,6 +134,7 @@ impl LeakyBucketState { pub struct RateLimiter { pub config: LeakyBucketConfig, + pub sleep_counter: AtomicU64, pub state: Mutex, /// a queue to provide this fair ordering. pub queue: Notify, @@ -144,6 +151,7 @@ impl Drop for Requeue<'_> { impl RateLimiter { pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { RateLimiter { + sleep_counter: AtomicU64::new(0), state: Mutex::new(LeakyBucketState::with_initial_tokens( &config, initial_tokens, @@ -163,15 +171,16 @@ impl RateLimiter { /// returns true if we did throttle pub async fn acquire(&self, count: usize) -> bool { - let mut throttled = false; - let start = tokio::time::Instant::now(); + let start_count = self.sleep_counter.load(Ordering::Acquire); + let mut end_count = start_count; + // wait until we are the first in the queue let mut notified = std::pin::pin!(self.queue.notified()); if !notified.as_mut().enable() { - throttled = true; notified.await; + end_count = self.sleep_counter.load(Ordering::Acquire); } // notify the next waiter in the queue when we are done. @@ -184,9 +193,22 @@ impl RateLimiter { .unwrap() .add_tokens(&self.config, start, count as f64); match res { - Ok(()) => return throttled, + Ok(()) => return end_count > start_count, Err(ready_at) => { - throttled = true; + struct Increment<'a>(&'a AtomicU64); + + impl Drop for Increment<'_> { + fn drop(&mut self) { + self.0.fetch_add(1, Ordering::AcqRel); + } + } + + // increment the counter after we finish sleeping (or cancel this task). + // this ensures that tasks that have already started the acquire will observe + // the new sleep count when they are allowed to resume on the notify. + let _inc = Increment(&self.sleep_counter); + end_count += 1; + tokio::time::sleep_until(ready_at).await; } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 218dd468b1..03fb36caf8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -43,16 +43,9 @@ pub mod logging; pub mod lock_file; pub mod pid_file; -// Misc -pub mod accum; -pub mod shutdown; - // Utility for binding TcpListeners with proper socket options. pub mod tcp_listener; -// Utility for putting a raw file descriptor into non-blocking mode -pub mod nonblock; - // Default signal handling pub mod sentry_init; pub mod signals; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 1aebe91428..06d5c27ebf 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -1,6 +1,5 @@ #![warn(missing_docs)] -use camino::Utf8Path; use serde::{de::Visitor, Deserialize, Serialize}; use std::fmt; use std::ops::{Add, AddAssign}; @@ -145,14 +144,6 @@ impl Lsn { i128::from(self.0) - i128::from(other) } - /// Parse an LSN from a filename in the form `0000000000000000` - pub fn from_filename(filename: F) -> Result - where - F: AsRef, - { - Lsn::from_hex(filename.as_ref().as_str()) - } - /// Parse an LSN from a string in the form `0000000000000000` pub fn from_hex(s: S) -> Result where diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs deleted file mode 100644 index 05e2e3af4c..0000000000 --- a/libs/utils/src/nonblock.rs +++ /dev/null @@ -1,17 +0,0 @@ -use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL}; -use std::os::unix::io::RawFd; - -/// Put a file descriptor into non-blocking mode -pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> { - let bits = fcntl(fd, F_GETFL)?; - - // If F_GETFL returns some unknown bits, they should be valid - // for passing back to F_SETFL, too. If we left them out, the F_SETFL - // would effectively clear them, which is not what we want. - let mut flags = OFlag::from_bits_retain(bits); - flags |= OFlag::O_NONBLOCK; - - fcntl(fd, F_SETFL(flags))?; - - Ok(()) -} diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs deleted file mode 100644 index cb5a44d664..0000000000 --- a/libs/utils/src/shutdown.rs +++ /dev/null @@ -1,7 +0,0 @@ -/// Immediately terminate the calling process without calling -/// atexit callbacks, C runtime destructors etc. We mainly use -/// this to protect coverage data from concurrent writes. -pub fn exit_now(code: u8) -> ! { - // SAFETY: exiting is safe, the ffi is not safe - unsafe { nix::libc::_exit(code as _) }; -} diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 5f0028bacd..1fe048c6f0 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -120,32 +120,6 @@ impl VecMap { Ok((None, delta_size)) } - /// Split the map into two. - /// - /// The left map contains everything before `cutoff` (exclusive). - /// Right map contains `cutoff` and everything after (inclusive). - pub fn split_at(&self, cutoff: &K) -> (Self, Self) - where - K: Clone, - V: Clone, - { - let split_idx = self - .data - .binary_search_by_key(&cutoff, extract_key) - .unwrap_or_else(std::convert::identity); - - ( - VecMap { - data: self.data[..split_idx].to_vec(), - ordering: self.ordering, - }, - VecMap { - data: self.data[split_idx..].to_vec(), - ordering: self.ordering, - }, - ) - } - /// Move items from `other` to the end of `self`, leaving `other` empty. /// If the `other` ordering is different from `self` ordering /// `ExtendOrderingError` error will be returned. diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index 46e9f880a1..ba73902d38 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -15,13 +15,11 @@ anyhow.workspace = true axum.workspace = true clap.workspace = true futures.workspace = true -inotify.workspace = true serde.workspace = true serde_json.workspace = true sysinfo.workspace = true tokio = { workspace = true, features = ["rt-multi-thread"] } tokio-postgres.workspace = true -tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 24373afca3..0eb48d6823 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -15,7 +15,6 @@ anyhow.workspace = true arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true bit_field.workspace = true byteorder.workspace = true bytes.workspace = true @@ -23,12 +22,9 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true -crossbeam-utils.workspace = true either.workspace = true -flate2.workspace = true fail.workspace = true futures.workspace = true git-version.workspace = true @@ -57,10 +53,6 @@ serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true serde_with.workspace = true -signal-hook.workspace = true -smallvec = { workspace = true, features = ["write"] } -svg_fmt.workspace = true -sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true @@ -73,7 +65,6 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true -twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 0fd1d81845..52b58fc298 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -9,41 +9,19 @@ default = [] [dependencies] anyhow.workspace = true -async-compression.workspace = true async-stream.workspace = true -byteorder.workspace = true -bytes.workspace = true -chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -const_format.workspace = true -consumption_metrics.workspace = true -crossbeam-utils.workspace = true -either.workspace = true -flate2.workspace = true -fail.workspace = true futures.workspace = true git-version.workspace = true -hex.workspace = true -humantime.workspace = true -humantime-serde.workspace = true itertools.workspace = true once_cell.workspace = true pageserver_api.workspace = true pin-project-lite.workspace = true rand.workspace = true -smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true -sync_wrapper.workspace = true -thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } -tokio-io-timeout.workspace = true -tokio-util.workspace = true tracing.workspace = true -tracing-error.workspace = true tracing-subscriber.workspace = true -url.workspace = true -walkdir.workspace = true -metrics.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index be5626040b..9592002de1 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true [dependencies] anyhow.workspace = true -bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true @@ -24,5 +23,4 @@ toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true -serde.workspace = true serde_json.workspace = true diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e9f197ec2d..8567c6aa52 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -13,7 +13,6 @@ use pageserver_api::{ use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; use storage_broker::Uri; -use utils::crashsafe::path_with_suffix_extension; use utils::logging::SecretString; use once_cell::sync::OnceCell; @@ -33,7 +32,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; /// Global state of pageserver. /// @@ -257,17 +256,6 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub(crate) fn timeline_delete_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_DELETE_MARK_SUFFIX, - ) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -491,11 +479,6 @@ pub struct ConfigurableSemaphore { } impl ConfigurableSemaphore { - pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) { - Some(x) => x, - None => panic!("const unwrap is not yet stable"), - }; - /// Initializse using a non-zero amount of permits. /// /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a @@ -516,12 +499,6 @@ impl ConfigurableSemaphore { } } -impl Default for ConfigurableSemaphore { - fn default() -> Self { - Self::new(Self::DEFAULT_INITIAL) - } -} - impl PartialEq for ConfigurableSemaphore { fn eq(&self, other: &Self) -> bool { // the number of permits can be increased at runtime, so we cannot really fulfill the diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 64a267e0e4..0c7630edca 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -178,7 +178,7 @@ async fn collect_metrics( ) .await; if let Err(e) = res { - tracing::error!("failed to upload to S3: {e:#}"); + tracing::error!("failed to upload to remote storage: {e:#}"); } } }; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9197505876..078d12f934 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine { } struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - global_metric: &'a Histogram, + global_latency_histo: &'a Histogram, // Optional because not all op types are tracked per-timeline - timeline_metric: Option<&'a Histogram>, + per_timeline_latency_histo: Option<&'a Histogram>, ctx: &'c RequestContext, start: std::time::Instant, @@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.global_metric.observe(ex_throttled.as_secs_f64()); - if let Some(timeline_metric) = self.timeline_metric { - timeline_metric.observe(ex_throttled.as_secs_f64()); + self.global_latency_histo + .observe(ex_throttled.as_secs_f64()); + if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); } } } @@ -1240,10 +1241,32 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - global_metrics: [Histogram; SmgrQueryType::COUNT], - per_timeline_getpage: Histogram, + global_started: [IntCounter; SmgrQueryType::COUNT], + global_latency: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage_started: IntCounter, + per_timeline_getpage_latency: Histogram, } +static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + // it's a counter, but, name is prepared to extend it to a histogram of queue depth + "pageserver_smgr_query_started_global_count", + "Number of smgr queries started, aggregated by query type.", + &["smgr_query_type"], + ) + .expect("failed to define a metric") +}); + +static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + // it's a counter, but, name is prepared to extend it to a histogram of queue depth + "pageserver_smgr_query_started_count", + "Number of smgr queries started, aggregated by query type and tenant/timeline.", + &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", @@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let global_metrics = std::array::from_fn(|i| { + let global_started = std::array::from_fn(|i| { + let op = SmgrQueryType::from_repr(i).unwrap(); + SMGR_QUERY_STARTED_GLOBAL + .get_metric_with_label_values(&[op.into()]) + .unwrap() + }); + let global_latency = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) .unwrap() }); - let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE .get_metric_with_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), &tenant_id, @@ -1334,9 +1363,20 @@ impl SmgrQueryTimePerTimeline { &timeline_id, ]) .unwrap(); + let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { - global_metrics, - per_timeline_getpage, + global_started, + global_latency, + per_timeline_getpage_latency, + per_timeline_getpage_started, } } pub(crate) fn start_timer<'c: 'a, 'a>( @@ -1344,8 +1384,11 @@ impl SmgrQueryTimePerTimeline { op: SmgrQueryType, ctx: &'c RequestContext, ) -> Option { - let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); + + self.global_started[op as usize].inc(); + + // We subtract time spent throttled from the observed latency. match ctx.micros_spent_throttled.open() { Ok(()) => (), Err(error) => { @@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline { } } - let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { - Some(&self.per_timeline_getpage) + let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { + self.per_timeline_getpage_started.inc(); + Some(&self.per_timeline_getpage_latency) } else { None }; Some(GlobalAndPerTimelineHistogramTimer { - global_metric, - timeline_metric, + global_latency_histo: &self.global_latency[op as usize], + per_timeline_latency_histo, ctx, start, op, @@ -1423,9 +1467,12 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) + .map(|op| metrics.global_latency[*op as usize].get_sample_count()) .sum(); - (global, metrics.per_timeline_getpage.get_sample_count()) + ( + global, + metrics.per_timeline_getpage_latency.get_sample_count(), + ) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1777,7 +1824,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { .expect("failed to define a metric"), upload_heatmap_duration: register_histogram!( "pageserver_secondary_upload_heatmap_duration", - "Time to build and upload a heatmap, including any waiting inside the S3 client" + "Time to build and upload a heatmap, including any waiting inside the remote storage client" ) .expect("failed to define a metric"), download_heatmap: register_int_counter!( @@ -2576,6 +2623,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } + let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -2592,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } + tenant_throttling::remove_tenant_metrics(tenant_shard_id); + // we leave the BROKEN_TENANTS_SET entry if any } @@ -3055,41 +3110,180 @@ pub mod tokio_epoll_uring { pub(crate) mod tenant_throttling { use metrics::{register_int_counter_vec, IntCounter}; use once_cell::sync::Lazy; + use utils::shard::TenantShardId; use crate::tenant::{self, throttle::Metric}; - pub(crate) struct TimelineGet { - wait_time: IntCounter, - count: IntCounter, + struct GlobalAndPerTenantIntCounter { + global: IntCounter, + per_tenant: IntCounter, } - pub(crate) static TIMELINE_GET: Lazy = Lazy::new(|| { - static WAIT_USECS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_tenant_throttling_wait_usecs_sum_global", - "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.", + impl GlobalAndPerTenantIntCounter { + #[inline(always)] + pub(crate) fn inc(&self) { + self.inc_by(1) + } + #[inline(always)] + pub(crate) fn inc_by(&self, n: u64) { + self.global.inc_by(n); + self.per_tenant.inc_by(n); + } + } + + pub(crate) struct TimelineGet { + count_accounted_start: GlobalAndPerTenantIntCounter, + count_accounted_finish: GlobalAndPerTenantIntCounter, + wait_time: GlobalAndPerTenantIntCounter, + count_throttled: GlobalAndPerTenantIntCounter, + } + + static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_start_global", + "Count of tenant throttling starts, by kind of throttle.", &["kind"] ) - .unwrap() - }); - - static WAIT_COUNT: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_tenant_throttling_count_global", - "Count of tenant throttlings, by kind of throttle.", - &["kind"] - ) - .unwrap() - }); - - let kind = "timeline_get"; - TimelineGet { - wait_time: WAIT_USECS.with_label_values(&[kind]), - count: WAIT_COUNT.with_label_values(&[kind]), - } + .unwrap() + }); + static COUNT_ACCOUNTED_START_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_start", + "Count of tenant throttling starts, by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); + static COUNT_ACCOUNTED_FINISH: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_finish_global", + "Count of tenant throttling finishes, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_finish", + "Count of tenant throttling finishes, by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); + static WAIT_USECS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum_global", + "Sum of microseconds that spent waiting throttle by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + static WAIT_USECS_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum", + "Sum of microseconds that spent waiting throttle by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() }); - impl Metric for &'static TimelineGet { + static WAIT_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_global", + "Count of tenant throttlings, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + static WAIT_COUNT_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count", + "Count of tenant throttlings, by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); + + const KIND: &str = "timeline_get"; + + impl TimelineGet { + pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { + TimelineGet { + count_accounted_start: { + GlobalAndPerTenantIntCounter { + global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), + per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + count_accounted_finish: { + GlobalAndPerTenantIntCounter { + global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), + per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + wait_time: { + GlobalAndPerTenantIntCounter { + global: WAIT_USECS.with_label_values(&[KIND]), + per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + count_throttled: { + GlobalAndPerTenantIntCounter { + global: WAIT_COUNT.with_label_values(&[KIND]), + per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + } + } + } + + pub(crate) fn preinitialize_global_metrics() { + Lazy::force(&COUNT_ACCOUNTED_START); + Lazy::force(&COUNT_ACCOUNTED_FINISH); + Lazy::force(&WAIT_USECS); + Lazy::force(&WAIT_COUNT); + } + + pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { + for m in &[ + &COUNT_ACCOUNTED_START_PER_TENANT, + &COUNT_ACCOUNTED_FINISH_PER_TENANT, + &WAIT_USECS_PER_TENANT, + &WAIT_COUNT_PER_TENANT, + ] { + let _ = m.remove_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]); + } + } + + impl Metric for TimelineGet { + #[inline(always)] + fn accounting_start(&self) { + self.count_accounted_start.inc(); + } + #[inline(always)] + fn accounting_finish(&self) { + self.count_accounted_finish.inc(); + } #[inline(always)] fn observe_throttling( &self, @@ -3097,7 +3291,7 @@ pub(crate) mod tenant_throttling { ) { let val = u64::try_from(wait_time.as_micros()).unwrap(); self.wait_time.inc_by(val); - self.count.inc(); + self.count_throttled.inc(); } } } @@ -3227,11 +3421,14 @@ pub fn preinitialize_metrics() { } // countervecs - [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] - .into_iter() - .for_each(|c| { - Lazy::force(c); - }); + [ + &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT, + &SMGR_QUERY_STARTED_GLOBAL, + ] + .into_iter() + .for_each(|c| { + Lazy::force(c); + }); // gauges WALRECEIVER_ACTIVE_MANAGERS.get(); @@ -3253,7 +3450,8 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); - Lazy::force(&tenant_throttling::TIMELINE_GET); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); + + tenant_throttling::preinitialize_global_metrics(); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c6f0e48101..e328cd2044 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -140,6 +140,7 @@ pub mod metadata; pub mod remote_timeline_client; pub mod storage_layer; +pub mod checks; pub mod config; pub mod mgr; pub mod secondary; @@ -301,7 +302,7 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. pub(crate) timeline_get_throttle: - Arc>, + Arc>, /// An ongoing timeline detach concurrency limiter. /// @@ -1573,6 +1574,9 @@ impl Tenant { image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { + use checks::check_valid_layermap; + use itertools::Itertools; + let tline = self .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) .await?; @@ -1587,6 +1591,18 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) .await?; } + let layer_names = tline + .layers + .read() + .await + .layer_map() + .unwrap() + .iter_historic_layers() + .map(|layer| layer.layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("invalid layermap: {err}"); + } Ok(tline) } @@ -2815,7 +2831,7 @@ impl Tenant { gate: Gate::default(), timeline_get_throttle: Arc::new(throttle::Throttle::new( Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), - &crate::metrics::tenant_throttling::TIMELINE_GET, + crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id), )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), @@ -3197,6 +3213,9 @@ impl Tenant { image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { + use checks::check_valid_layermap; + use itertools::Itertools; + let tline = self .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) .await?; @@ -3217,6 +3236,18 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) .await?; } + let layer_names = tline + .layers + .read() + .await + .layer_map() + .unwrap() + .iter_historic_layers() + .map(|layer| layer.layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("invalid layermap: {err}"); + } Ok(tline) } @@ -4164,9 +4195,18 @@ pub(crate) mod harness { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. - let base_img = base_img.expect("Neon WAL redo requires base image").1; - let mut page = BytesMut::new(); - page.extend_from_slice(&base_img); + let mut page = match (base_img, records.first()) { + (Some((_lsn, img)), _) => { + let mut page = BytesMut::new(); + page.extend_from_slice(&img); + page + } + (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(), + _ => { + panic!("Neon WAL redo requires base image or will init record"); + } + }; + for (record_lsn, record) in records { apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?; } @@ -8470,4 +8510,135 @@ mod tests { Ok(()) } + + // Regression test for https://github.com/neondatabase/neon/issues/9012 + // Create an image arrangement where we have to read at different LSN ranges + // from a delta layer. This is achieved by overlapping an image layer on top of + // a delta layer. Like so: + // + // A B + // +----------------+ -> delta_layer + // | | ^ lsn + // | =========|-> nested_image_layer | + // | C | | + // +----------------+ | + // ======== -> baseline_image_layer +-------> key + // + // + // When querying the key range [A, B) we need to read at different LSN ranges + // for [A, C) and [C, B). This test checks that the described edge case is handled correctly. + #[tokio::test] + async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?; + let (tenant, ctx) = harness.load().await; + + let will_init_keys = [2, 6]; + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let mut expected_key_values = HashMap::new(); + + let baseline_image_layer_lsn = Lsn(0x10); + let mut baseline_img_layer = Vec::new(); + for i in 0..5 { + let key = get_key(i); + let value = format!("value {i}@{baseline_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + baseline_img_layer.push((key, Bytes::from(value))); + } + + let nested_image_layer_lsn = Lsn(0x50); + let mut nested_img_layer = Vec::new(); + for i in 5..10 { + let key = get_key(i); + let value = format!("value {i}@{nested_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + nested_img_layer.push((key, Bytes::from(value))); + } + + let mut delta_layer_spec = Vec::default(); + let delta_layer_start_lsn = Lsn(0x20); + let mut delta_layer_end_lsn = delta_layer_start_lsn; + + for i in 0..10 { + let key = get_key(i); + let key_in_nested = nested_img_layer + .iter() + .any(|(key_with_img, _)| *key_with_img == key); + let lsn = { + if key_in_nested { + Lsn(nested_image_layer_lsn.0 + 0x10) + } else { + delta_layer_start_lsn + } + }; + + let will_init = will_init_keys.contains(&i); + if will_init { + delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init()))); + + expected_key_values.insert(key, "".to_string()); + } else { + let delta = format!("@{lsn}"); + delta_layer_spec.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + } + delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn); + } + + delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1); + + assert!( + nested_image_layer_lsn > delta_layer_start_lsn + && nested_image_layer_lsn < delta_layer_end_lsn + ); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + baseline_image_layer_lsn, + DEFAULT_PG_VERSION, + &ctx, + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + delta_layer_start_lsn..delta_layer_end_lsn, + delta_layer_spec, + )], // delta layers + vec![ + (baseline_image_layer_lsn, baseline_img_layer), + (nested_image_layer_lsn, nested_img_layer), + ], // image layers + delta_layer_end_lsn, + ) + .await?; + + let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let results = tline + .get_vectored(keyspace, delta_layer_end_lsn, &ctx) + .await + .expect("No vectored errors"); + for (key, res) in results { + let value = res.expect("No key errors"); + let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); + assert_eq!(value, Bytes::from(expected_value)); + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs new file mode 100644 index 0000000000..8eaa8a001c --- /dev/null +++ b/pageserver/src/tenant/checks.rs @@ -0,0 +1,55 @@ +use std::collections::BTreeSet; + +use itertools::Itertools; + +use super::storage_layer::LayerName; + +/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). +/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// +/// ```plain +/// | | | | +/// | 1 | | 2 | | 3 | +/// | | | | | | +/// ``` +/// +/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have +/// the same LSN range. +/// +/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, +/// +/// ```plain +/// | | | 2 | | | +/// | 1 | |-------| | 3 | +/// | | | 4 | | | +/// +/// If layer 2 and 4 contain the same single key, this is also a valid layer map. +pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) + let mut all_delta_layers = Vec::new(); + for name in metadata { + if let LayerName::Delta(layer) = name { + if layer.key_range.start.next() != layer.key_range.end { + all_delta_layers.push(layer.clone()); + } + } + } + for layer in &all_delta_layers { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } + for layer in &all_delta_layers { + let lsn_range = layer.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + layer, + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + return Some(err); + } + } + None +} diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index 8b41ba1746..1271d25b76 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,11 +1,29 @@ -use std::collections::HashMap; - -use utils::id::TimelineId; +use std::{collections::HashMap, time::Duration}; use super::remote_timeline_client::index::GcBlockingReason; +use tokio::time::Instant; +use utils::id::TimelineId; -type Storage = HashMap>; +type TimelinesBlocked = HashMap>; +#[derive(Default)] +struct Storage { + timelines_blocked: TimelinesBlocked, + /// The deadline before which we are blocked from GC so that + /// leases have a chance to be renewed. + lsn_lease_deadline: Option, +} + +impl Storage { + fn is_blocked_by_lsn_lease_deadline(&self) -> bool { + self.lsn_lease_deadline + .map(|d| Instant::now() < d) + .unwrap_or(false) + } +} + +/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc +/// blocking. #[derive(Default)] pub(crate) struct GcBlock { /// The timelines which have current reasons to block gc. @@ -13,6 +31,12 @@ pub(crate) struct GcBlock { /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. reasons: std::sync::Mutex, + + /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this. + /// + /// Do not add any more features taking and forbidding taking this lock. It should be + /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`] + /// synchronizes with gc attempts by locking and unlocking this mutex. blocking: tokio::sync::Mutex<()>, } @@ -42,6 +66,20 @@ impl GcBlock { } } + /// Sets a deadline before which we cannot proceed to GC due to lsn lease. + /// + /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease + /// length, we guarantee that all the leases we granted before will have a chance to renew + /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. + pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) { + let deadline = Instant::now() + lsn_lease_length; + let mut g = self.reasons.lock().unwrap(); + g.lsn_lease_deadline = Some(deadline); + } + + /// Describe the current gc blocking reasons. + /// + /// TODO: make this json serializable. pub(crate) fn summary(&self) -> Option { let g = self.reasons.lock().unwrap(); @@ -64,7 +102,7 @@ impl GcBlock { ) -> anyhow::Result { let (added, uploaded) = { let mut g = self.reasons.lock().unwrap(); - let set = g.entry(timeline.timeline_id).or_default(); + let set = g.timelines_blocked.entry(timeline.timeline_id).or_default(); let added = set.insert(reason); // LOCK ORDER: intentionally hold the lock, see self.reasons. @@ -95,7 +133,7 @@ impl GcBlock { let (remaining_blocks, uploaded) = { let mut g = self.reasons.lock().unwrap(); - match g.entry(timeline.timeline_id) { + match g.timelines_blocked.entry(timeline.timeline_id) { Entry::Occupied(mut oe) => { let set = oe.get_mut(); set.remove(reason); @@ -109,7 +147,7 @@ impl GcBlock { } } - let remaining_blocks = g.len(); + let remaining_blocks = g.timelines_blocked.len(); // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons let uploaded = timeline @@ -134,11 +172,11 @@ impl GcBlock { pub(crate) fn before_delete(&self, timeline: &super::Timeline) { let unblocked = { let mut g = self.reasons.lock().unwrap(); - if g.is_empty() { + if g.timelines_blocked.is_empty() { return; } - g.remove(&timeline.timeline_id); + g.timelines_blocked.remove(&timeline.timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; @@ -149,10 +187,11 @@ impl GcBlock { } /// Initialize with the non-deleted timelines of this tenant. - pub(crate) fn set_scanned(&self, scanned: Storage) { + pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) { let mut g = self.reasons.lock().unwrap(); - assert!(g.is_empty()); - g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + assert!(g.timelines_blocked.is_empty()); + g.timelines_blocked + .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { tracing::info!(summary=?reasons, "initialized with gc blocked"); @@ -166,6 +205,7 @@ pub(super) struct Guard<'a> { #[derive(Debug)] pub(crate) struct BlockingReasons { + tenant_blocked_by_lsn_lease_deadline: bool, timelines: usize, reasons: enumset::EnumSet, } @@ -174,8 +214,8 @@ impl std::fmt::Display for BlockingReasons { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "{} timelines block for {:?}", - self.timelines, self.reasons + "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}", + self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons ) } } @@ -183,13 +223,15 @@ impl std::fmt::Display for BlockingReasons { impl BlockingReasons { fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { let mut reasons = enumset::EnumSet::empty(); - g.retain(|_key, value| { + g.timelines_blocked.retain(|_key, value| { reasons = reasons.union(*value); !value.is_empty() }); - if !g.is_empty() { + let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); + if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline { Some(BlockingReasons { - timelines: g.len(), + tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, + timelines: g.timelines_blocked.len(), reasons, }) } else { @@ -198,14 +240,17 @@ impl BlockingReasons { } fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { - if g.is_empty() { + let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); + if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline { None } else { let reasons = g + .timelines_blocked .values() .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next)); Some(BlockingReasons { - timelines: g.len(), + tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, + timelines: g.timelines_blocked.len(), reasons, }) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2104f41531..1e7c1e10a5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -949,6 +949,12 @@ impl TenantManager { (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { match attach_conf.generation.cmp(&tenant.generation) { Ordering::Equal => { + if attach_conf.attach_mode == AttachmentMode::Single { + tenant + .gc_block + .set_lsn_lease_deadline(tenant.get_lsn_lease_length()); + } + // A transition from Attached to Attached in the same generation, we may // take our fast path and just provide the updated configuration // to the tenant. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index dac6b2f893..cd252aa371 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -276,6 +276,16 @@ pub(crate) enum LayerId { InMemoryLayerId(InMemoryLayerFileId), } +/// Uniquely identify a layer visit by the layer +/// and LSN floor (or start LSN) of the reads. +/// The layer itself is not enough since we may +/// have different LSN lower bounds for delta layer reads. +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +struct LayerToVisitId { + layer_id: LayerId, + lsn_floor: Lsn, +} + /// Layer wrapper for the read path. Note that it is valid /// to use these layers even after external operations have /// been performed on them (compaction, freeze, etc.). @@ -287,9 +297,9 @@ pub(crate) enum ReadableLayer { /// A partial description of a read to be done. #[derive(Debug, Clone)] -struct ReadDesc { +struct LayerVisit { /// An id used to resolve the readable layer within the fringe - layer_id: LayerId, + layer_to_visit_id: LayerToVisitId, /// Lsn range for the read, used for selecting the next read lsn_range: Range, } @@ -303,12 +313,12 @@ struct ReadDesc { /// a two layer indexing scheme. #[derive(Debug)] pub(crate) struct LayerFringe { - planned_reads_by_lsn: BinaryHeap, - layers: HashMap, + planned_visits_by_lsn: BinaryHeap, + visit_reads: HashMap, } #[derive(Debug)] -struct LayerKeyspace { +struct LayerVisitReads { layer: ReadableLayer, target_keyspace: KeySpaceRandomAccum, } @@ -316,23 +326,23 @@ struct LayerKeyspace { impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - planned_reads_by_lsn: BinaryHeap::new(), - layers: HashMap::new(), + planned_visits_by_lsn: BinaryHeap::new(), + visit_reads: HashMap::new(), } } pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { - let read_desc = match self.planned_reads_by_lsn.pop() { + let read_desc = match self.planned_visits_by_lsn.pop() { Some(desc) => desc, None => return None, }; - let removed = self.layers.remove_entry(&read_desc.layer_id); + let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id); match removed { Some(( _, - LayerKeyspace { + LayerVisitReads { layer, mut target_keyspace, }, @@ -351,20 +361,24 @@ impl LayerFringe { keyspace: KeySpace, lsn_range: Range, ) { - let layer_id = layer.id(); - let entry = self.layers.entry(layer_id.clone()); + let layer_to_visit_id = LayerToVisitId { + layer_id: layer.id(), + lsn_floor: lsn_range.start, + }; + + let entry = self.visit_reads.entry(layer_to_visit_id.clone()); match entry { Entry::Occupied(mut entry) => { entry.get_mut().target_keyspace.add_keyspace(keyspace); } Entry::Vacant(entry) => { - self.planned_reads_by_lsn.push(ReadDesc { + self.planned_visits_by_lsn.push(LayerVisit { lsn_range, - layer_id: layer_id.clone(), + layer_to_visit_id: layer_to_visit_id.clone(), }); let mut accum = KeySpaceRandomAccum::new(); accum.add_keyspace(keyspace); - entry.insert(LayerKeyspace { + entry.insert(LayerVisitReads { layer, target_keyspace: accum, }); @@ -379,7 +393,7 @@ impl Default for LayerFringe { } } -impl Ord for ReadDesc { +impl Ord for LayerVisit { fn cmp(&self, other: &Self) -> Ordering { let ord = self.lsn_range.end.cmp(&other.lsn_range.end); if ord == std::cmp::Ordering::Equal { @@ -390,19 +404,19 @@ impl Ord for ReadDesc { } } -impl PartialOrd for ReadDesc { +impl PartialOrd for LayerVisit { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl PartialEq for ReadDesc { +impl PartialEq for LayerVisit { fn eq(&self, other: &Self) -> bool { self.lsn_range == other.lsn_range } } -impl Eq for ReadDesc {} +impl Eq for LayerVisit {} impl ReadableLayer { pub(crate) fn id(&self) -> LayerId { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 875e223c9c..5de2582ab7 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; @@ -58,7 +58,6 @@ use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; -use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tracing::*; @@ -70,9 +69,7 @@ use utils::{ }; use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -800,10 +797,9 @@ impl ImageLayerWriterInner { /// async fn finish( self, - timeline: &Arc, ctx: &RequestContext, end_key: Option, - ) -> anyhow::Result { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -879,12 +875,9 @@ impl ImageLayerWriterInner { // fsync the file file.sync_all().await?; - // FIXME: why not carry the virtualfile here, it supports renaming? - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created image layer {}", self.path); - info!("created image layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -963,24 +956,18 @@ impl ImageLayerWriter { /// pub(crate) async fn finish( mut self, - timeline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline, ctx, None).await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(ctx, None).await } /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. pub(super) async fn finish_with_end_key( mut self, - timeline: &Arc, end_key: Key, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(timeline, ctx, Some(end_key)) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(ctx, Some(end_key)).await } } @@ -1084,7 +1071,7 @@ mod test { tenant::{ config::TenantConf, harness::{TenantHarness, TIMELINE_ID}, - storage_layer::ResidentLayer, + storage_layer::{Layer, ResidentLayer}, vectored_blob_io::StreamingVectoredReadPlanner, Tenant, Timeline, }, @@ -1155,7 +1142,8 @@ mod test { key = key.next(); } - writer.finish(&timeline, &ctx).await.unwrap() + let (desc, path) = writer.finish(&ctx).await.unwrap(); + Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap() }; let original_size = resident.metadata().file_size; @@ -1217,7 +1205,9 @@ mod test { .await .unwrap(); let replacement = if wrote_keys > 0 { - Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + let (desc, path) = filtered_writer.finish(&ctx).await.unwrap(); + let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap(); + Some(resident) } else { None }; @@ -1290,7 +1280,8 @@ mod test { for (key, img) in images { writer.put_image(key, img, ctx).await?; } - let img_layer = writer.finish(tline, ctx).await?; + let (desc, path) = writer.finish(ctx).await?; + let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, anyhow::Error>(img_layer) } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b15cd4da39..f0e2ca5c83 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -439,11 +439,30 @@ impl Layer { fn record_access(&self, ctx: &RequestContext) { if self.0.access_stats.record_access(ctx) { - // Visibility was modified to Visible - tracing::info!( - "Layer {} became visible as a result of access", - self.0.desc.key() - ); + // Visibility was modified to Visible: maybe log about this + match ctx.task_kind() { + TaskKind::CalculateSyntheticSize + | TaskKind::GarbageCollector + | TaskKind::MgmtRequest => { + // This situation is expected in code paths do binary searches of the LSN space to resolve + // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size, + // and on-demand for certain HTTP API requests. + } + _ => { + // In all other contexts, it is unusual to do I/O involving layers which are not visible at + // some branch tip, so we log the fact that we are accessing something that the visibility + // calculation thought should not be visible. + // + // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object + // which was covered by a concurrent compaction. + tracing::info!( + "Layer {} became visible as a result of access", + self.0.desc.key() + ); + } + } + + // Update the timeline's visible bytes count if let Some(tl) = self.0.timeline.upgrade() { tl.metrics .visible_physical_size_gauge diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 0b9bde4f57..9de70f14ee 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1025,6 +1025,15 @@ fn access_stats() { assert_eq!(access_stats.latest_activity(), lowres_time(atime)); access_stats.set_visibility(LayerVisibilityHint::Visible); assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Recording access implicitly makes layer visible, if it wasn't already + let atime = UNIX_EPOCH + Duration::from_secs(2200000000); + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + assert!(access_stats.record_access_at(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert!(!access_stats.record_access_at(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); } #[test] diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index 40a6a77a50..b499a0eef4 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -121,11 +121,11 @@ impl SplitImageLayerWriter { self.generated_layers .push(SplitWriterResult::Discarded(layer_key)); } else { - self.generated_layers.push(SplitWriterResult::Produced( - prev_image_writer - .finish_with_end_key(tline, key, ctx) - .await?, - )); + let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?; + + let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers + .push(SplitWriterResult::Produced(layer)); } } self.inner.put_image(key, img, ctx).await @@ -170,9 +170,9 @@ impl SplitImageLayerWriter { if discard(&layer_key).await { generated_layers.push(SplitWriterResult::Discarded(layer_key)); } else { - generated_layers.push(SplitWriterResult::Produced( - inner.finish_with_end_key(tline, end_key, ctx).await?, - )); + let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?; + let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(SplitWriterResult::Produced(layer)); } Ok(generated_layers) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 478e9bb4f0..341febb30a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // How many errors we have seen consequtively let mut error_run_count = 0; - let mut last_throttle_flag_reset_at = Instant::now(); - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); @@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } - - let sleep_duration; if period == Duration::ZERO { #[cfg(not(feature = "testing"))] @@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { }; // Run compaction - let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await; + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; match output { Ok(has_pending_task) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { Duration::ZERO } else { period }; + sleep_duration = if has_pending_task { + Duration::ZERO + } else { + period + }; } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( @@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); + tracing::debug!( + elapsed_ms = elapsed.as_millis(), + "compaction iteration complete" + ); }; - // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. + // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. if let Some(walredo_mgr) = &tenant.walredo_mgr { walredo_mgr.maybe_quiesce(period * 10); } - // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. - info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { - let now = Instant::now(); - let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats(); - if count_throttled == 0 { - return; - } - let allowed_rps = tenant.timeline_get_throttle.steady_rps(); - let delta = now - prev; - info!( - n_seconds=%format_args!("{:.3}", - delta.as_secs_f64()), - count_accounted, - count_throttled, - sum_throttled_usecs, - allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds" - ); - }); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await @@ -346,6 +330,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; + tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length()); loop { tokio::select! { _ = cancel.cancelled() => { @@ -363,7 +348,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { first = false; let delays = async { - delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; random_init_delay(period, &cancel).await?; Ok::<_, Cancelled>(()) }; @@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let mut last_throttle_flag_reset_at = Instant::now(); loop { tokio::select! { _ = cancel.cancelled() => { @@ -483,6 +468,29 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken kind: BackgroundLoopKind::IngestHouseKeeping, }; iteration.run(tenant.ingest_housekeeping()).await; + + // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. + // Or just spawn another background loop for this throttle, it's not like it's super costly. + info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", + delta.as_secs_f64()), + count_accounted = count_accounted_finish, // don't break existing log scraping + count_throttled, + sum_throttled_usecs, + count_accounted_start, // log after pre-existing fields to not break existing log scraping + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); } } .await; @@ -538,28 +546,12 @@ pub(crate) async fn random_init_delay( let mut rng = rand::thread_rng(); rng.gen_range(Duration::ZERO..=period) }; - match tokio::time::timeout(d, cancel.cancelled()).await { Ok(_) => Err(Cancelled), Err(_) => Ok(()), } } -/// Delays GC by defaul lease length at restart. -/// -/// We do this as the leases mapping are not persisted to disk. By delaying GC by default -/// length, we gurantees that all the leases we granted before the restart will expire -/// when we run GC for the first time after the restart. -pub(crate) async fn delay_by_lease_length( - length: Duration, - cancel: &CancellationToken, -) -> Result<(), Cancelled> { - match tokio::time::timeout(length, cancel.cancelled()).await { - Ok(_) => Err(Cancelled), - Err(_) => Ok(()), - } -} - struct Iteration { started_at: Instant, period: Duration, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index f222e708e1..6a80953901 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind}; pub struct Throttle { inner: ArcSwap, metric: M, - /// will be turned into [`Stats::count_accounted`] - count_accounted: AtomicU64, + /// will be turned into [`Stats::count_accounted_start`] + count_accounted_start: AtomicU64, + /// will be turned into [`Stats::count_accounted_finish`] + count_accounted_finish: AtomicU64, /// will be turned into [`Stats::count_throttled`] count_throttled: AtomicU64, /// will be turned into [`Stats::sum_throttled_usecs`] @@ -43,17 +45,21 @@ pub struct Observation { pub wait_time: Duration, } pub trait Metric { + fn accounting_start(&self); + fn accounting_finish(&self); fn observe_throttling(&self, observation: &Observation); } /// See [`Throttle::reset_stats`]. pub struct Stats { - // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`]. - pub count_accounted: u64, - // Subset of the `accounted` requests that were actually throttled. - // Note that the numbers are stored as two independent atomics, so, there might be a slight drift. + /// Number of requests that started [`Throttle::throttle`] calls. + pub count_accounted_start: u64, + /// Number of requests that finished [`Throttle::throttle`] calls. + pub count_accounted_finish: u64, + /// Subset of the `accounted` requests that were actually throttled. + /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift. pub count_throttled: u64, - // Sum of microseconds that throttled requests spent waiting for throttling. + /// Sum of microseconds that throttled requests spent waiting for throttling. pub sum_throttled_usecs: u64, } @@ -65,7 +71,8 @@ where Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), metric, - count_accounted: AtomicU64::new(0), + count_accounted_start: AtomicU64::new(0), + count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), sum_throttled_usecs: AtomicU64::new(0), } @@ -117,11 +124,13 @@ where /// This method allows retrieving & resetting that flag. /// Useful for periodic reporting. pub fn reset_stats(&self) -> Stats { - let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed); + let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed); + let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed); let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed); let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed); Stats { - count_accounted, + count_accounted_start, + count_accounted_finish, count_throttled, sum_throttled_usecs, } @@ -139,9 +148,12 @@ where }; let start = std::time::Instant::now(); + self.metric.accounting_start(); + self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; + self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); + self.metric.accounting_finish(); - self.count_accounted.fetch_add(1, Ordering::Relaxed); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); let now = Instant::now(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 262dccac7d..c98efd5f71 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -196,9 +196,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub timeline_get_throttle: Arc< - crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, - >, + pub timeline_get_throttle: + Arc>, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -406,9 +405,8 @@ pub struct Timeline { gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. - timeline_get_throttle: Arc< - crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, - >, + timeline_get_throttle: + Arc>, /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, @@ -4013,7 +4011,9 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; + info!("created image layer for rel {}", image_layer.local_path()); Ok(ImageLayerCreationOutcome { image: Some(image_layer), next_start_key: img_range.end, @@ -4101,7 +4101,12 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; + info!( + "created image layer for metadata {}", + image_layer.local_path() + ); Ok(ImageLayerCreationOutcome { image: Some(image_layer), next_start_key: img_range.end, @@ -4309,7 +4314,9 @@ impl Timeline { timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered - self.update_layer_visibility().await?; + if !image_layers.is_empty() { + self.update_layer_visibility().await?; + } Ok(image_layers) } @@ -5371,7 +5378,8 @@ impl Timeline { /// Force create an image layer and place it into the layer map. /// /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] - /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are + /// placed into the layer map in one run AND be validated. #[cfg(test)] pub(super) async fn force_create_image_layer( self: &Arc, @@ -5403,8 +5411,9 @@ impl Timeline { for (key, img) in images { image_layer_writer.put_image(key, img, ctx).await?; } - let image_layer = image_layer_writer.finish(self, ctx).await?; - + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; + info!("force created image layer {}", image_layer.local_path()); { let mut guard = self.layers.write().await; guard.open_mut().unwrap().force_insert_layer(image_layer); @@ -5416,7 +5425,8 @@ impl Timeline { /// Force create a delta layer and place it into the layer map. /// /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] - /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are + /// placed into the layer map in one run AND be validated. #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, @@ -5442,33 +5452,6 @@ impl Timeline { if let Some(check_start_lsn) = check_start_lsn { assert!(deltas.lsn_range.start >= check_start_lsn); } - // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of - // layers of the same start/end LSN, and so should the force inserted layer - { - /// Checks if a overlaps with b, assume a/b = [start, end). - pub fn overlaps_with(a: &Range, b: &Range) -> bool { - !(a.end <= b.start || b.end <= a.start) - } - - if deltas.key_range.start.next() != deltas.key_range.end { - let guard = self.layers.read().await; - let mut invalid_layers = - guard.layer_map()?.iter_historic_layers().filter(|layer| { - layer.is_delta() - && overlaps_with(&layer.lsn_range, &deltas.lsn_range) - && layer.lsn_range != deltas.lsn_range - // skip single-key layer files - && layer.key_range.start.next() != layer.key_range.end - }); - if let Some(layer) = invalid_layers.next() { - // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic - panic!( - "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", - deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end - ); - } - } - } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, @@ -5483,7 +5466,7 @@ impl Timeline { } let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?; let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?; - + info!("force created delta layer {}", delta_layer.local_path()); { let mut guard = self.layers.write().await; guard.open_mut().unwrap().force_insert_layer(delta_layer); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 0b5c520ba7..d1567b6b39 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::split_writer::{ @@ -563,10 +564,12 @@ impl Timeline { .await?; if keys_written > 0 { - let new_layer = image_layer_writer - .finish(self, ctx) + let (desc, path) = image_layer_writer + .finish(ctx) .await .map_err(CompactionError::Other)?; + let new_layer = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -1786,20 +1789,12 @@ impl Timeline { stat.visit_image_layer(desc.file_size()); } } - for layer in &layer_selection { - let desc = layer.layer_desc(); - let key_range = &desc.key_range; - if desc.is_delta() && key_range.start.next() != key_range.end { - let lsn_range = desc.lsn_range.clone(); - let intersects = lsn_split_point.range(lsn_range).collect_vec(); - if intersects.len() > 1 { - bail!( - "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - desc.key(), - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - } - } + let layer_names: Vec = layer_selection + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("cannot run gc-compaction because {}", err); } // The maximum LSN we are processing in this compaction loop let end_lsn = layer_selection diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index dc4118bb4a..90db08ea81 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -135,25 +135,6 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( .context("delete_all") } -// This function removs remaining traces of a timeline on disk. -// Namely: metadata file, timeline directory, delete mark. -// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir. -// delete mark should be present because it is the last step during deletion. -// (nothing can fail after its deletion) -async fn cleanup_remaining_timeline_fs_traces( - conf: &PageServerConf, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, -) -> anyhow::Result<()> { - // Remove delete mark - // TODO: once we are confident that no more exist in the field, remove this - // line. It cleans up a legacy marker file that might in rare cases be present. - tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("remove delete mark") -} - /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] async fn remove_timeline_from_tenant( @@ -194,12 +175,10 @@ async fn remove_timeline_from_tenant( /// 7. Delete mark file /// /// It is resumable from any step in case a crash/restart occurs. -/// There are three entrypoints to the process: +/// There are two entrypoints to the process: /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present /// and we possibly neeed to continue deletion of remote files. -/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote -/// index but still have local metadata, timeline directory and delete mark. /// /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. #[derive(Default)] @@ -311,18 +290,6 @@ impl DeleteTimelineFlow { Ok(()) } - #[instrument(skip_all, fields(%timeline_id))] - pub async fn cleanup_remaining_timeline_fs_traces( - tenant: &Tenant, - timeline_id: TimelineId, - ) -> anyhow::Result<()> { - let r = - cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id) - .await; - info!("Done"); - r - } - fn prepare( tenant: &Tenant, timeline_id: TimelineId, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a36955fa21..0fe7def8b0 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -35,6 +35,7 @@ use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; +use std::future::Future; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -296,6 +297,97 @@ impl PostgresRedoManager { } } + async fn do_with_walredo_process< + F: FnOnce(Arc) -> Fut, + Fut: Future>, + O, + >( + &self, + pg_version: u32, + closure: F, + ) -> Result { + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); + } + }, + Err(permit) => { + let start = Instant::now(); + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. + let _launched_processes_guard = match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + elapsed_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; + + // async closures are unstable, would support &Process + let result = closure(proc.clone()).await; + + if result.is_err() { + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // + // NB: there may still be other concurrent threads using `proc`. + // The last one will send SIGKILL when the underlying Arc reaches refcount 0. + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster + // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, + // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. + // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. + drop(proc); + } + + result + } + /// /// Process one request for WAL redo using wal-redo postgres /// @@ -319,130 +411,63 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = match self.redo_process.get_or_init_detached().await { - Ok(guard) => match &*guard { - ProcessOnceCell::Spawned(proc) => Arc::clone(proc), - ProcessOnceCell::ManagerShutDown => { - return Err(Error::Cancelled); - } - }, - Err(permit) => { - let start = Instant::now(); - // acquire guard before spawning process, so that we don't spawn new processes - // if the gate is already closed. - let _launched_processes_guard = match self.launched_processes.enter() { - Ok(guard) => guard, - Err(GateError::GateClosed) => unreachable!( - "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" - ), - }; - let proc = Arc::new(Process { - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - _launched_processes_guard, - }); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process - .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); - proc - } - }; + let base_img = &base_img; + let closure = |proc: Arc| async move { + let started_at = std::time::Instant::now(); - let started_at = std::time::Instant::now(); + // Relational WAL records are applied using wal-redo-postgres + let result = proc + .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await + .context("apply_wal_records"); - // Relational WAL records are applied using wal-redo-postgres - let result = proc - .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) - .await - .context("apply_wal_records"); + let duration = started_at.elapsed(); - let duration = started_at.elapsed(); - - let len = records.len(); - let nbytes = records.iter().fold(0, |acumulator, record| { - acumulator - + match &record.1 { - NeonWalRecord::Postgres { rec, .. } => rec.len(), - _ => unreachable!("Only PostgreSQL records are accepted in this batch"), - } - }); - - WAL_REDO_TIME.observe(duration.as_secs_f64()); - WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); - WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); - - debug!( - "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", - len, - nbytes, - duration.as_micros(), - lsn - ); - - // If something went wrong, don't try to reuse the process. Kill it, and - // next request will launch a new one. - if let Err(e) = result.as_ref() { - error!( - "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", - records.len(), - records.first().map(|p| p.0).unwrap_or(Lsn(0)), - records.last().map(|p| p.0).unwrap_or(Lsn(0)), - nbytes, - base_img_lsn, - lsn, - n_attempts, - e, - ); - // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. - // Note that there may be other tasks concurrent with us that also hold `proc`. - // We have to deal with that here. - // Also read the doc comment on field `self.redo_process`. - // - // NB: there may still be other concurrent threads using `proc`. - // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // - // NB: the drop impl blocks the dropping thread with a wait() system call for - // the child process. In some ways the blocking is actually good: if we - // deferred the waiting into the background / to tokio if we used `tokio::process`, - // it could happen that if walredo always fails immediately, we spawn processes faster - // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, - // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. - // This probably needs revisiting at some later point. - match self.redo_process.get() { - None => (), - Some(guard) => { - match &*guard { - ProcessOnceCell::ManagerShutDown => {} - ProcessOnceCell::Spawned(guard_proc) => { - if Arc::ptr_eq(&proc, guard_proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. Do nothing, our view of the world is behind. - } - } + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. } => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), } - } + }); + + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); + + debug!( + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, + duration.as_micros(), + lsn + ); + + if let Err(e) = result.as_ref() { + error!( + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), + nbytes, + base_img_lsn, + lsn, + n_attempts, + e, + ); } - // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. - drop(proc); - } else if n_attempts != 0 { + + result.map_err(Error::Other) + }; + let result = self.do_with_walredo_process(pg_version, closure).await; + + if result.is_ok() && n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result.map_err(Error::Other); + return result; } } } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 21d92abb20..6703eb06eb 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -18,7 +18,6 @@ atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true -aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -26,7 +25,6 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true -crossbeam-deque.workspace = true dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true @@ -48,11 +46,9 @@ indexmap.workspace = true ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } -md5.workspace = true measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true -opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -67,7 +63,6 @@ reqwest.workspace = true reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true -routerify.workspace = true rustc-hash.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -79,7 +74,6 @@ smol_str.workspace = true smallvec.workspace = true socket2.workspace = true subtle.workspace = true -task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } @@ -88,7 +82,6 @@ tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } -tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 5561c9c56d..5bc2f2ff65 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -163,6 +163,7 @@ impl ComputeUserInfo { } pub(crate) enum ComputeCredentialKeys { + #[cfg(any(test, feature = "testing"))] Password(Vec), AuthKeys(AuthKeys), None, @@ -293,16 +294,10 @@ async fn auth_quirks( // We now expect to see a very specific payload in the place of password. let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(ctx, info, client).await?; - - ctx.set_endpoint_id(res.info.endpoint.clone()); - let password = match res.keys { - ComputeCredentialKeys::Password(p) => p, - ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => { - unreachable!("password hack should return a password") - } - }; - (res.info, Some(password)) + let (info, password) = + hacks::password_hack_no_authentication(ctx, info, client).await?; + ctx.set_endpoint_id(info.endpoint.clone()); + (info, Some(password)) } Ok(info) => (info, None), }; diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index e9019ce2cf..15123a2623 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,6 +1,4 @@ -use super::{ - ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint, -}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::{ auth::{self, AuthFlow}, config::AuthenticationConfig, @@ -63,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication( ctx: &RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, -) -> auth::Result { +) -> auth::Result<(ComputeUserInfo, Vec)> { warn!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -79,12 +77,12 @@ pub(crate) async fn password_hack_no_authentication( info!(project = &*payload.endpoint, "received missing parameter"); // Report tentative success; compute node will check the password anyway. - Ok(ComputeCredentials { - info: ComputeUserInfo { + Ok(( + ComputeUserInfo { user: info.user, options: info.options, endpoint: payload.endpoint, }, - keys: ComputeCredentialKeys::Password(payload.password), - }) + payload.password, + )) } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 1f44e4af5d..94e5999a5f 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -25,6 +25,8 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024; pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, role_name: RoleName, ) -> impl Future>> + Send; } @@ -101,7 +103,9 @@ impl JwkCacheEntryLock { async fn renew_jwks( &self, _permit: JwkRenewalPermit<'_>, + ctx: &RequestMonitoring, client: &reqwest::Client, + endpoint: EndpointId, role_name: RoleName, auth_rules: &F, ) -> anyhow::Result> { @@ -115,7 +119,9 @@ impl JwkCacheEntryLock { } } - let rules = auth_rules.fetch_auth_rules(role_name).await?; + let rules = auth_rules + .fetch_auth_rules(ctx, endpoint, role_name) + .await?; let mut key_sets = ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new()); // TODO(conrad): run concurrently @@ -166,6 +172,7 @@ impl JwkCacheEntryLock { self: &Arc, ctx: &RequestMonitoring, client: &reqwest::Client, + endpoint: EndpointId, role_name: RoleName, fetch: &F, ) -> Result, anyhow::Error> { @@ -176,7 +183,9 @@ impl JwkCacheEntryLock { let Some(cached) = guard else { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let permit = self.acquire_permit().await; - return self.renew_jwks(permit, client, role_name, fetch).await; + return self + .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) + .await; }; let last_update = now.duration_since(cached.last_retrieved); @@ -187,7 +196,9 @@ impl JwkCacheEntryLock { let permit = self.acquire_permit().await; // it's been too long since we checked the keys. wait for them to update. - return self.renew_jwks(permit, client, role_name, fetch).await; + return self + .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) + .await; } // every 5 minutes we should spawn a job to eagerly update the token. @@ -198,8 +209,12 @@ impl JwkCacheEntryLock { let entry = self.clone(); let client = client.clone(); let fetch = fetch.clone(); + let ctx = ctx.clone(); tokio::spawn(async move { - if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await { + if let Err(e) = entry + .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch) + .await + { tracing::warn!(error=?e, "could not fetch JWKs in background job"); } }); @@ -216,6 +231,7 @@ impl JwkCacheEntryLock { ctx: &RequestMonitoring, jwt: &str, client: &reqwest::Client, + endpoint: EndpointId, role_name: RoleName, fetch: &F, ) -> Result<(), anyhow::Error> { @@ -242,7 +258,7 @@ impl JwkCacheEntryLock { let kid = header.key_id.context("missing key id")?; let mut guard = self - .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch) + .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch) .await?; // get the key from the JWKs if possible. If not, wait for the keys to update. @@ -254,7 +270,14 @@ impl JwkCacheEntryLock { let permit = self.acquire_permit().await; guard = self - .renew_jwks(permit, client, role_name.clone(), fetch) + .renew_jwks( + permit, + ctx, + client, + endpoint.clone(), + role_name.clone(), + fetch, + ) .await?; } _ => { @@ -318,7 +341,7 @@ impl JwkCache { jwt: &str, ) -> Result<(), anyhow::Error> { // try with just a read lock first - let key = (endpoint, role_name.clone()); + let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); let entry = entry.unwrap_or_else(|| { // acquire a write lock after to insert. @@ -327,7 +350,7 @@ impl JwkCache { }); entry - .check_jwt(ctx, jwt, &self.client, role_name, fetch) + .check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch) .await } } @@ -688,6 +711,8 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL impl FetchAuthRules for Fetch { async fn fetch_auth_rules( &self, + _ctx: &RequestMonitoring, + _endpoint: EndpointId, _role_name: RoleName, ) -> anyhow::Result> { Ok(vec![ @@ -706,6 +731,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL } let role_name = RoleName::from("user"); + let endpoint = EndpointId::from("ep"); let jwk_cache = Arc::new(JwkCacheEntryLock::default()); @@ -715,6 +741,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &RequestMonitoring::test(), &token, &client, + endpoint.clone(), role_name.clone(), &Fetch(addr), ) diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 8124f568cf..2ff2ca00f0 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -9,8 +9,9 @@ use crate::{ messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, NodeInfo, }, + context::RequestMonitoring, intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag}, - RoleName, + EndpointId, RoleName, }; use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; @@ -57,7 +58,12 @@ pub struct JwksRoleSettings { } impl FetchAuthRules for StaticAuthRules { - async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result> { + async fn fetch_auth_rules( + &self, + _ctx: &RequestMonitoring, + _endpoint: EndpointId, + role_name: RoleName, + ) -> anyhow::Result> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 6eba71df1b..94365ddf05 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -92,6 +92,12 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 16)] sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: u64, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, } #[tokio::main] @@ -208,6 +214,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }, cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; Ok(Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ca9aeb04d8..2ac66ffe8c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -62,12 +62,13 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[derive(Clone, Debug, ValueEnum)] enum AuthBackendType { Console, - #[cfg(feature = "testing")] - Postgres, // clap only shows the name, not the alias, in usage text. // TODO: swap name/alias and deprecate "link" #[value(name("link"), alias("web"))] Web, + + #[cfg(feature = "testing")] + Postgres, } /// Neon proxy/router @@ -268,6 +269,12 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 64)] sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: u64, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, } #[tokio::main] @@ -633,17 +640,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let api = console::provider::ConsoleBackend::Console(api); auth::Backend::Console(MaybeOwned::Owned(api), ()) } - #[cfg(feature = "testing")] - AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; - let api = console::provider::mock::Api::new(url); - let api = console::provider::ConsoleBackend::Postgres(api); - auth::Backend::Console(MaybeOwned::Owned(api), ()) - } + AuthBackendType::Web => { let url = args.uri.parse()?; auth::Backend::Web(MaybeOwned::Owned(url), ()) } + + #[cfg(feature = "testing")] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = console::provider::mock::Api::new(url, !args.is_private_access_proxy); + let api = console::provider::ConsoleBackend::Postgres(api); + auth::Backend::Console(MaybeOwned::Owned(api), ()) + } }; let config::ConcurrencyLockOptions { @@ -679,6 +688,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { }, cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; let authentication_config = AuthenticationConfig { thread_pool, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1cda6d200c..373e4cf650 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -56,6 +56,8 @@ pub struct HttpConfig { pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, + pub max_request_size_bytes: u64, + pub max_response_size_bytes: usize, } pub struct AuthenticationConfig { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 12a6e2f12a..16e8da605b 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -303,6 +303,7 @@ impl NodeInfo { pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { match keys { + #[cfg(any(test, feature = "testing"))] ComputeCredentialKeys::Password(password) => self.config.password(password), ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), ComputeCredentialKeys::None => &mut self.config, diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 08b87cd87a..1b77418de6 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -41,11 +41,15 @@ impl From for ApiError { #[derive(Clone)] pub struct Api { endpoint: ApiUrl, + ip_allowlist_check_enabled: bool, } impl Api { - pub fn new(endpoint: ApiUrl) -> Self { - Self { endpoint } + pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self { + Self { + endpoint, + ip_allowlist_check_enabled, + } } pub(crate) fn url(&self) -> &str { @@ -64,6 +68,7 @@ impl Api { tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; tokio::spawn(connection); + let secret = if let Some(entry) = get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", @@ -79,21 +84,26 @@ impl Api { warn!("user '{}' does not exist", user_info.user); None }; - let allowed_ips = match get_execute_postgres_query( - &client, - "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", - &[&user_info.endpoint.as_str()], - "allowed_ips", - ) - .await? - { - Some(s) => { - info!("got allowed_ips: {s}"); - s.split(',') - .map(|s| IpPattern::from_str(s).unwrap()) - .collect() + + let allowed_ips = if self.ip_allowlist_check_enabled { + match get_execute_postgres_query( + &client, + "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", + &[&user_info.endpoint.as_str()], + "allowed_ips", + ) + .await? + { + Some(s) => { + info!("got allowed_ips: {s}"); + s.split(',') + .map(|s| IpPattern::from_str(s).unwrap()) + .collect() + } + None => vec![], } - None => vec![], + } else { + vec![] }; Ok((secret, allowed_ips)) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index c013218ad9..021659e175 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -79,6 +79,40 @@ pub(crate) enum AuthMethod { Cleartext, } +impl Clone for RequestMonitoring { + fn clone(&self) -> Self { + let inner = self.0.try_lock().expect("should not deadlock"); + let new = RequestMonitoringInner { + peer_addr: inner.peer_addr, + session_id: inner.session_id, + protocol: inner.protocol, + first_packet: inner.first_packet, + region: inner.region, + span: info_span!("background_task"), + + project: inner.project, + branch: inner.branch, + endpoint_id: inner.endpoint_id.clone(), + dbname: inner.dbname.clone(), + user: inner.user.clone(), + application: inner.application.clone(), + error_kind: inner.error_kind, + auth_method: inner.auth_method.clone(), + success: inner.success, + rejected: inner.rejected, + cold_start_info: inner.cold_start_info, + pg_options: inner.pg_options.clone(), + + sender: None, + disconnect_sender: None, + latency_timer: LatencyTimer::noop(inner.protocol), + disconnect_timestamp: inner.disconnect_timestamp, + }; + + Self(TryLock::new(new)) + } +} + impl RequestMonitoring { pub fn new( session_id: Uuid, diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 2da7eac580..c2567e083a 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -397,6 +397,8 @@ pub struct LatencyTimer { protocol: Protocol, cold_start_info: ColdStartInfo, outcome: ConnectOutcome, + + skip_reporting: bool, } impl LatencyTimer { @@ -409,6 +411,20 @@ impl LatencyTimer { cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified outcome: ConnectOutcome::Failed, + skip_reporting: false, + } + } + + pub(crate) fn noop(protocol: Protocol) -> Self { + Self { + start: time::Instant::now(), + stop: None, + accumulated: Accumulated::default(), + protocol, + cold_start_info: ColdStartInfo::Unknown, + // assume failed unless otherwise specified + outcome: ConnectOutcome::Failed, + skip_reporting: true, } } @@ -443,6 +459,10 @@ pub enum ConnectOutcome { impl Drop for LatencyTimer { fn drop(&mut self) { + if self.skip_reporting { + return; + } + let duration = self .stop .unwrap_or_else(time::Instant::now) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index d163878528..aa236907db 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -27,7 +27,7 @@ use crate::{ Host, }; -use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub(crate) struct PoolingBackend { pub(crate) pool: Arc>, @@ -274,13 +274,6 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - match &self.conn_info.auth { - AuthData::Jwt(_) => {} - AuthData::Password(pw) => { - config.password(pw); - } - } - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index bea599e9b9..a850ecd2be 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -29,11 +29,16 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; +#[derive(Debug, Clone)] +pub(crate) struct ConnInfoWithAuth { + pub(crate) conn_info: ConnInfo, + pub(crate) auth: AuthData, +} + #[derive(Debug, Clone)] pub(crate) struct ConnInfo { pub(crate) user_info: ComputeUserInfo, pub(crate) dbname: DbName, - pub(crate) auth: AuthData, } #[derive(Debug, Clone)] @@ -776,6 +781,8 @@ mod tests { }, cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, + max_request_size_bytes: u64::MAX, + max_response_size_bytes: usize::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { @@ -785,7 +792,6 @@ mod tests { options: NeonOptions::default(), }, dbname: "dbname".into(), - auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), @@ -843,7 +849,6 @@ mod tests { options: NeonOptions::default(), }, dbname: "dbname".into(), - auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 2188edc8c5..7c78439a0a 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -60,6 +60,7 @@ use super::backend::PoolingBackend; use super::conn_pool::AuthData; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::conn_pool::ConnInfoWithAuth; use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; @@ -87,9 +88,6 @@ enum Payload { Batch(BatchQueryData), } -const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB -const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB - static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -151,7 +149,7 @@ fn get_conn_info( ctx: &RequestMonitoring, headers: &HeaderMap, tls: Option<&TlsConfig>, -) -> Result { +) -> Result { // HTTP only uses cleartext (for now and likely always) ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -238,11 +236,8 @@ fn get_conn_info( options: options.unwrap_or_default(), }; - Ok(ConnInfo { - user_info, - dbname, - auth, - }) + let conn_info = ConnInfo { user_info, dbname }; + Ok(ConnInfoWithAuth { conn_info, auth }) } // TODO: return different http error codes @@ -366,10 +361,10 @@ pub(crate) enum SqlOverHttpError { ConnectCompute(#[from] HttpConnError), #[error("{0}")] ConnInfo(#[from] ConnInfoError), - #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")] - RequestTooLarge, - #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")] - ResponseTooLarge, + #[error("request is too large (max is {0} bytes)")] + RequestTooLarge(u64), + #[error("response is too large (max is {0} bytes)")] + ResponseTooLarge(usize), #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] @@ -386,8 +381,8 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), - SqlOverHttpError::RequestTooLarge => ErrorKind::User, - SqlOverHttpError::ResponseTooLarge => ErrorKind::User, + SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User, + SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, SqlOverHttpError::Postgres(p) => p.get_error_kind(), SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, @@ -402,8 +397,8 @@ impl UserFacingError for SqlOverHttpError { SqlOverHttpError::ReadPayload(p) => p.to_string(), SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), SqlOverHttpError::ConnInfo(c) => c.to_string_client(), - SqlOverHttpError::RequestTooLarge => self.to_string(), - SqlOverHttpError::ResponseTooLarge => self.to_string(), + SqlOverHttpError::RequestTooLarge(_) => self.to_string(), + SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), @@ -526,7 +521,10 @@ async fn handle_inner( // TLS config should be there. let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?; - info!(user = conn_info.user_info.user.as_str(), "credentials"); + info!( + user = conn_info.conn_info.user_info.user.as_str(), + "credentials" + ); // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in @@ -537,7 +535,7 @@ async fn handle_inner( let request_content_length = match request.body().size_hint().upper() { Some(v) => v, - None => MAX_REQUEST_SIZE + 1, + None => config.http_config.max_request_size_bytes + 1, }; info!(request_content_length, "request size in bytes"); Metrics::get() @@ -547,8 +545,10 @@ async fn handle_inner( // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body - if request_content_length > MAX_REQUEST_SIZE { - return Err(SqlOverHttpError::RequestTooLarge); + if request_content_length > config.http_config.max_request_size_bytes { + return Err(SqlOverHttpError::RequestTooLarge( + config.http_config.max_request_size_bytes, + )); } let fetch_and_process_request = Box::pin( @@ -569,20 +569,20 @@ async fn handle_inner( .authenticate_with_password( ctx, &config.authentication_config, - &conn_info.user_info, + &conn_info.conn_info.user_info, pw, ) .await? } AuthData::Jwt(jwt) => { backend - .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) + .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt) .await? } }; let client = backend - .connect_to_compute(ctx, conn_info, keys, !allow_pool) + .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool) .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else @@ -612,7 +612,10 @@ async fn handle_inner( // Now execute the query and return the result. let json_output = match payload { - Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, + Payload::Single(stmt) => { + stmt.process(config, cancel, &mut client, parsed_headers) + .await? + } Payload::Batch(statements) => { if parsed_headers.txn_read_only { response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); @@ -628,7 +631,7 @@ async fn handle_inner( } statements - .process(cancel, &mut client, parsed_headers) + .process(config, cancel, &mut client, parsed_headers) .await? } }; @@ -656,6 +659,7 @@ async fn handle_inner( impl QueryData { async fn process( self, + config: &'static ProxyConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -664,7 +668,7 @@ impl QueryData { let cancel_token = inner.cancel_token(); let res = match select( - pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)), pin!(cancel.cancelled()), ) .await @@ -727,6 +731,7 @@ impl QueryData { impl BatchQueryData { async fn process( self, + config: &'static ProxyConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -751,44 +756,52 @@ impl BatchQueryData { discard.discard(); })?; - let json_output = - match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(json_output) => { - info!("commit"); - let status = transaction.commit().await.inspect_err(|_| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - })?; - discard.check_idle(status); - json_output - } - Err(SqlOverHttpError::Cancelled(_)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. + let json_output = match query_batch( + config, + cancel.child_token(), + &transaction, + self, + parsed_headers, + ) + .await + { + Ok(json_output) => { + info!("commit"); + let status = transaction.commit().await.inspect_err(|_| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error discard.discard(); + })?; + discard.check_idle(status); + json_output + } + Err(SqlOverHttpError::Cancelled(_)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. + discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(err) => { - info!("rollback"); - let status = transaction.rollback().await.inspect_err(|_| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - })?; - discard.check_idle(status); - return Err(err); - } - }; + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + Err(err) => { + info!("rollback"); + let status = transaction.rollback().await.inspect_err(|_| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + })?; + discard.check_idle(status); + return Err(err); + } + }; Ok(json_output) } } async fn query_batch( + config: &'static ProxyConfig, cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, @@ -798,6 +811,7 @@ async fn query_batch( let mut current_size = 0; for stmt in queries.queries { let query = pin!(query_to_json( + config, transaction, stmt, &mut current_size, @@ -826,6 +840,7 @@ async fn query_batch( } async fn query_to_json( + config: &'static ProxyConfig, client: &T, data: QueryData, current_size: &mut usize, @@ -846,8 +861,10 @@ async fn query_to_json( rows.push(row); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > MAX_RESPONSE_SIZE { - return Err(SqlOverHttpError::ResponseTooLarge); + if *current_size > config.http_config.max_response_size_bytes { + return Err(SqlOverHttpError::ResponseTooLarge( + config.http_config.max_response_size_bytes, + )); } } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0fdb3147bf..daf21c70b0 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -13,14 +13,12 @@ testing = ["fail/failpoints"] [dependencies] async-stream.workspace = true anyhow.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive"] } -const_format.workspace = true crc32c.workspace = true fail.workspace = true git-version.workspace = true @@ -38,8 +36,6 @@ scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true -serde_with.workspace = true -signal-hook.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true @@ -48,7 +44,6 @@ tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-tar.workspace = true -toml_edit.workspace = true tracing.workspace = true url.workspace = true metrics.workspace = true diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 15b0272cd9..589536c7a8 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -17,6 +17,7 @@ use postgres_ffi::MAX_SEND_SIZE; use serde::Deserialize; use serde::Serialize; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; use sha2::{Digest, Sha256}; use utils::id::NodeId; use utils::id::TenantTimelineId; @@ -51,6 +52,9 @@ pub struct Args { /// Dump full term history. True by default. pub dump_term_history: bool, + /// Dump last modified time of WAL segments. Uses value of `dump_all` by default. + pub dump_wal_last_modified: bool, + /// Filter timelines by tenant_id. pub tenant_id: Option, @@ -128,12 +132,19 @@ async fn build_from_tli_dump( None }; + let wal_last_modified = if args.dump_wal_last_modified { + get_wal_last_modified(timeline_dir).ok().flatten() + } else { + None + }; + Timeline { tenant_id: timeline.ttid.tenant_id, timeline_id: timeline.ttid.timeline_id, control_file, memory, disk_content, + wal_last_modified, } } @@ -156,6 +167,7 @@ pub struct Timeline { pub control_file: Option, pub memory: Option, pub disk_content: Option, + pub wal_last_modified: Option>, } #[derive(Debug, Serialize, Deserialize)] @@ -302,6 +314,27 @@ fn build_file_info(entry: DirEntry) -> Result { }) } +/// Get highest modified time of WAL segments in the directory. +fn get_wal_last_modified(path: &Utf8Path) -> Result>> { + let mut res = None; + for entry in fs::read_dir(path)? { + if entry.is_err() { + continue; + } + let entry = entry?; + /* Ignore files that are not XLOG segments */ + let fname = entry.file_name(); + if !IsXLogFileName(&fname) && !IsPartialXLogFileName(&fname) { + continue; + } + + let metadata = entry.metadata()?; + let modified: DateTime = DateTime::from(metadata.modified()?); + res = std::cmp::max(res, Some(modified)); + } + Ok(res) +} + /// Converts SafeKeeperConf to Config, filtering out the fields that are not /// supposed to be exposed. fn build_config(config: SafeKeeperConf) -> Config { diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index 70999853c2..3f14075345 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -1,7 +1,11 @@ openapi: "3.0.2" info: title: Safekeeper control API + description: Neon Safekeeper API version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE servers: @@ -386,6 +390,12 @@ components: msg: type: string + NotFoundError: + type: object + properties: + msg: + type: string + responses: # diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index e482edea55..b4590fe3e5 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -481,6 +481,7 @@ async fn dump_debug_handler(mut request: Request) -> Result let mut dump_memory: Option = None; let mut dump_disk_content: Option = None; let mut dump_term_history: Option = None; + let mut dump_wal_last_modified: Option = None; let mut tenant_id: Option = None; let mut timeline_id: Option = None; @@ -494,6 +495,7 @@ async fn dump_debug_handler(mut request: Request) -> Result "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?), "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?), "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?), + "dump_wal_last_modified" => dump_wal_last_modified = Some(parse_kv_str(&k, &v)?), "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?), "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?), _ => Err(ApiError::BadRequest(anyhow::anyhow!( @@ -508,6 +510,7 @@ async fn dump_debug_handler(mut request: Request) -> Result let dump_memory = dump_memory.unwrap_or(dump_all); let dump_disk_content = dump_disk_content.unwrap_or(dump_all); let dump_term_history = dump_term_history.unwrap_or(true); + let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all); let args = debug_dump::Args { dump_all, @@ -515,6 +518,7 @@ async fn dump_debug_handler(mut request: Request) -> Result dump_memory, dump_disk_content, dump_term_history, + dump_wal_last_modified, tenant_id, timeline_id, }; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 64585f5edc..c772ae6de7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -278,7 +278,7 @@ impl WalResidentTimeline { } /// pull_timeline request body. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Deserialize)] pub struct Request { pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -293,7 +293,7 @@ pub struct Response { } /// Response for debug dump request. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Deserialize)] pub struct DebugDumpResponse { pub start_time: DateTime, pub finish_time: DateTime, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 46c260901d..6e7da94973 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -539,20 +539,17 @@ async fn remove_segments_from_disk( while let Some(entry) = entries.next_entry().await? { let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); - - if let Some(fname_str) = fname.to_str() { - /* Ignore files that are not XLOG segments */ - if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) { - continue; - } - let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); - if remove_predicate(segno) { - remove_file(entry_path).await?; - n_removed += 1; - min_removed = min(min_removed, segno); - max_removed = max(max_removed, segno); - REMOVED_WAL_SEGMENTS.inc(); - } + /* Ignore files that are not XLOG segments */ + if !IsXLogFileName(fname) && !IsPartialXLogFileName(fname) { + continue; + } + let (segno, _) = XLogFromFileName(fname, wal_seg_size)?; + if remove_predicate(segno) { + remove_file(entry_path).await?; + n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); + REMOVED_WAL_SEGMENTS.inc(); } } diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index ac4b00669e..82ec0aa272 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -10,7 +10,6 @@ bench = [] [dependencies] anyhow.workspace = true async-stream.workspace = true -bytes.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true futures.workspace = true @@ -24,7 +23,6 @@ parking_lot.workspace = true prost.workspace = true tonic.workspace = true tokio = { workspace = true, features = ["rt-multi-thread"] } -tokio-stream.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index ecaac04915..a96d64e096 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -15,9 +15,7 @@ testing = [] [dependencies] anyhow.workspace = true -aws-config.workspace = true bytes.workspace = true -camino.workspace = true chrono.workspace = true clap.workspace = true fail.workspace = true diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml index e7a4264fd0..9fa89176af 100644 --- a/storage_controller/client/Cargo.toml +++ b/storage_controller/client/Cargo.toml @@ -5,18 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] -pageserver_api.workspace = true pageserver_client.workspace = true -thiserror.workspace = true reqwest.workspace = true -utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } -tokio-postgres.workspace = true -tokio-stream.workspace = true -tokio.workspace = true -futures.workspace = true -tokio-util.workspace = true -anyhow.workspace = true -postgres.workspace = true -bytes.workspace = true diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a6638f5191..1745bf5575 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,10 +1,11 @@ +use crate::http; use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; use crate::persistence::SafekeeperPersistence; use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; @@ -22,6 +23,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_client::{mgmt_api, BlockUnblock}; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -87,9 +89,16 @@ fn get_state(request: &Request) -> &HttpState { } /// Pageserver calls into this on startup, to learn which tenants it should attach -async fn handle_re_attach(mut req: Request) -> Result, ApiError> { +async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) @@ -97,9 +106,16 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE /// Pageserver calls into this before doing deletions, to confirm that it still /// holds the latest generation for the tenants with deletions enqueued -async fn handle_validate(mut req: Request) -> Result, ApiError> { +async fn handle_validate(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let validate_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.validate(validate_req).await?) @@ -108,9 +124,16 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr /// Call into this before attaching a tenant to a pageserver, to acquire a generation number /// (in the real control plane this is unnecessary, because the same program is managing /// generation numbers and doing attachments). -async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { +async fn handle_attach_hook(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let attach_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -124,9 +147,16 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap ) } -async fn handle_inspect(mut req: Request) -> Result, ApiError> { +async fn handle_inspect(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let inspect_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -136,10 +166,17 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr async fn handle_tenant_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( @@ -150,11 +187,18 @@ async fn handle_tenant_create( async fn handle_tenant_location_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, @@ -166,10 +210,17 @@ async fn handle_tenant_location_config( async fn handle_tenant_config_set( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) @@ -182,16 +233,30 @@ async fn handle_tenant_config_get( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) } async fn handle_tenant_time_travel_remote_storage( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let time_travel_req = json_request::(&mut req).await?; let timestamp_raw = must_get_query_param(&req, "travel_to")?; @@ -232,6 +297,13 @@ async fn handle_tenant_secondary_download( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; json_response(map_reqwest_hyper_status(status)?, progress) } @@ -243,6 +315,13 @@ async fn handle_tenant_delete( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let status_code = service .tenant_delete(tenant_id) .await @@ -258,11 +337,18 @@ async fn handle_tenant_delete( async fn handle_tenant_timeline_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( StatusCode::CREATED, @@ -277,9 +363,16 @@ async fn handle_tenant_timeline_delete( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; // For timeline deletions, which both implement an "initially return 202, then 404 once // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. @@ -337,12 +430,19 @@ async fn handle_tenant_timeline_delete( async fn handle_tenant_timeline_archival_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; let create_req = json_request::(&mut req).await?; @@ -358,9 +458,16 @@ async fn handle_tenant_timeline_detach_ancestor( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; let res = service .tenant_timeline_detach_ancestor(tenant_id, timeline_id) @@ -393,6 +500,13 @@ async fn handle_tenant_timeline_passthrough( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let Some(path) = req.uri().path_and_query() else { // This should never happen, our request router only calls us if there is a path return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); @@ -460,9 +574,17 @@ async fn handle_tenant_locate( service: Arc, req: Request, ) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Admin)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } @@ -473,6 +595,14 @@ async fn handle_tenant_describe( check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) } @@ -482,12 +612,26 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_list()) } -async fn handle_node_register(mut req: Request) -> Result, ApiError> { +async fn handle_node_register(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let register_req = json_request::(&mut req).await?; let state = get_state(&req); state.service.node_register(register_req).await?; @@ -497,6 +641,13 @@ async fn handle_node_register(mut req: Request) -> Result, async fn handle_node_list(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let nodes = state.service.node_list().await?; let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); @@ -507,6 +658,13 @@ async fn handle_node_list(req: Request) -> Result, ApiError async fn handle_node_drop(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_drop(node_id).await?) @@ -515,14 +673,28 @@ async fn handle_node_drop(req: Request) -> Result, ApiError async fn handle_node_delete(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_delete(node_id).await?) } -async fn handle_node_configure(mut req: Request) -> Result, ApiError> { +async fn handle_node_configure(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; if node_id != config_req.node_id { @@ -548,6 +720,13 @@ async fn handle_node_configure(mut req: Request) -> Result, async fn handle_node_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -570,6 +749,13 @@ async fn handle_node_shards(req: Request) -> Result, ApiErr async fn handle_get_leader(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let leader = state.service.get_leader().await.map_err(|err| { ApiError::InternalServerError(anyhow::anyhow!( @@ -583,6 +769,13 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -594,6 +787,13 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -605,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request) -> Result, async fn handle_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -616,6 +823,13 @@ async fn handle_node_fill(req: Request) -> Result, ApiError async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -624,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { +async fn handle_metadata_health_update(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Scrubber)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -640,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; @@ -652,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy( } async fn handle_metadata_health_list_outdated( - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let list_outdated_req = json_request::(&mut req).await?; let state = get_state(&req); let health_records = state @@ -671,10 +906,17 @@ async fn handle_metadata_health_list_outdated( async fn handle_tenant_shard_split( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let split_req = json_request::(&mut req).await?; @@ -686,10 +928,17 @@ async fn handle_tenant_shard_split( async fn handle_tenant_shard_migrate( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; json_response( @@ -700,9 +949,16 @@ async fn handle_tenant_shard_migrate( ) } -async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -716,9 +972,16 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { +async fn handle_update_preferred_azs(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let azs_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -731,23 +994,46 @@ async fn handle_update_preferred_azs(mut req: Request) -> Result) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.step_down().await) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response( @@ -759,6 +1045,13 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.tenants_dump() } @@ -766,6 +1059,13 @@ async fn handle_tenants_dump(req: Request) -> Result, ApiEr async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.scheduler_dump() } @@ -773,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request) -> Result, Api async fn handle_consistency_check(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.consistency_check().await?) @@ -781,19 +1088,40 @@ async fn handle_consistency_check(req: Request) -> Result, async fn handle_reconcile_all(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.reconcile_all_now().await?) } /// Status endpoint is just used for checking that our HTTP listener is up -async fn handle_status(_req: Request) -> Result, ApiError> { +async fn handle_status(req: Request) -> Result, ApiError> { + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, ()) } /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling /// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. async fn handle_ready(req: Request) -> Result, ApiError> { + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); if state.service.startup_complete.is_ready() { json_response(StatusCode::OK, ()) @@ -816,6 +1144,13 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api let id = parse_request_param::(&req, "id")?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let res = state.service.get_safekeeper(id).await; @@ -847,6 +1182,13 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.upsert_safekeeper(body).await?; @@ -925,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware< let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, - LeadershipStatus::SteppedDown => { - // TODO: does it make sense to allow /status here? - AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::SteppedDown => AllowedRoutes::All, LeadershipStatus::Candidate => { AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) } @@ -1005,6 +1344,13 @@ fn epilogue_metrics_middleware pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() @@ -1032,6 +1378,220 @@ where request_span(request, handler).await } +enum ForwardOutcome { + Forwarded(Result, ApiError>), + NotForwarded(Request), +} + +/// Potentially forward the request to the current storage controler leader. +/// More specifically we forward when: +/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state +/// 3. There is a leader in the database to forward to +/// 4. Leader from step (3) is not the current instance +/// +/// Why forward? +/// It turns out that we can't rely on external orchestration to promptly route trafic to the +/// new leader. This is downtime inducing. Forwarding provides a safe way out. +/// +/// Why is it safe? +/// If a storcon instance is persisted in the database, then we know that it is the current leader. +/// There's one exception: time between handling step-down request and the new leader updating the +/// database. +/// +/// Let's treat the happy case first. The stepped down node does not produce any side effects, +/// since all request handling happens on the leader. +/// +/// As for the edge case, we are guaranteed to always have a maximum of two running instances. +/// Hence, if we are in the edge case scenario the leader persisted in the database is the +/// stepped down instance that received the request. Condition (4) above covers this scenario. +async fn maybe_forward(req: Request) -> ForwardOutcome { + const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + + let uri = req.uri().to_string(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward { + return ForwardOutcome::NotForwarded(req); + } + + let leader = state.service.get_leader().await; + let leader = { + match leader { + Ok(Some(leader)) => leader, + Ok(None) => { + return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable( + "No leader to forward to while in stepped down state".into(), + ))); + } + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to get leader for forwarding while in stepped down state: {err}" + ), + ))); + } + } + }; + + let cfg = state.service.get_config(); + if let Some(ref self_addr) = cfg.address_for_peers { + let leader_addr = match Uri::from_str(leader.address.as_str()) { + Ok(uri) => uri, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), + ))); + } + }; + + if *self_addr == leader_addr { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Leader is stepped down instance" + )))); + } + } + + tracing::info!("Forwarding {} to leader at {}", uri, leader.address); + + // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and + // include some leeway to get the timeout for proxied requests. + const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10); + let client = reqwest::ClientBuilder::new() + .timeout(PROXIED_REQUEST_TIMEOUT) + .build(); + let client = match client { + Ok(client) => client, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to build leader client for forwarding while in stepped down state: {err}" + )))); + } + }; + + let request: reqwest::Request = match convert_request(req, &client, leader.address).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to convert request for forwarding while in stepped down state: {err}" + )))); + } + }; + + let response = match client.execute(request).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to forward while in stepped down state: {err}" + )))); + } + }; + + ForwardOutcome::Forwarded(convert_response(response).await) +} + +/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// Ideally, we would not have to do this since both types use the http crate +/// under the hood. However, they use different versions of the crate and keeping +/// second order dependencies in sync is difficult. +async fn convert_response(resp: reqwest::Response) -> Result, ApiError> { + use std::str::FromStr; + + let mut builder = hyper::Response::builder().status(resp.status().as_u16()); + for (key, value) in resp.headers().into_iter() { + let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + builder = builder.header(key, value); + } + + let body = http::Body::wrap_stream(resp.bytes_stream()); + + builder.body(body).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + }) +} + +/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// See [`convert_response`] for why we are doing it this way. +async fn convert_request( + req: hyper::Request, + client: &reqwest::Client, + to_address: String, +) -> Result { + use std::str::FromStr; + + let (parts, body) = req.into_parts(); + let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let path_and_query = parts.uri.path_and_query().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Request conversion failed: no path and query" + )) + })?; + + let uri = reqwest::Url::from_str( + format!( + "{}{}", + to_address.trim_end_matches("/"), + path_and_query.as_str() + ) + .as_str(), + ) + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let mut headers = reqwest::header::HeaderMap::new(); + for (key, value) in parts.headers.into_iter() { + let key = match key { + Some(k) => k, + None => { + continue; + } + }; + + let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + headers.insert(key, value); + } + + let body = hyper::body::to_bytes(body).await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + client + .request(method, uri) + .headers(headers) + .body(body) + .build() + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + }) +} + pub fn make_router( service: Arc, auth: Option>, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index be3efaf688..957f633feb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3,6 +3,7 @@ use std::{ borrow::Cow, cmp::Ordering, collections::{BTreeMap, HashMap, HashSet}, + error::Error, ops::Deref, path::PathBuf, str::FromStr, @@ -218,9 +219,16 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { format!("{node} error receiving error body: {str}").into(), ) } - mgmt_api::Error::ReceiveBody(str) => { - // Presume errors receiving body are connectivity/availability issues - ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + mgmt_api::Error::ReceiveBody(err) if err.is_decode() => { + // Return 500 for decoding errors. + ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body")) + } + mgmt_api::Error::ReceiveBody(err) => { + // Presume errors receiving body are connectivity/availability issues except for decoding errors + let src_str = err.source().map(|e| e.to_string()).unwrap_or_default(); + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {err} {}", src_str).into(), + ) } mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index d19119990b..f9987662b9 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -6,21 +6,13 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true -aws-smithy-async.workspace = true either.workspace = true -tokio-rustls.workspace = true anyhow.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true -thiserror.workspace = true -rand.workspace = true -bytes.workspace = true -bincode.workspace = true -crc32c.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 15dfb101b5..de6918b3da 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,7 +1,8 @@ -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use anyhow::Context; use itertools::Itertools; +use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; @@ -48,56 +49,6 @@ impl TimelineAnalysis { } } -/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, -/// -/// ```plain -/// | | | | -/// | 1 | | 2 | | 3 | -/// | | | | | | -/// ``` -/// -/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have -/// the same LSN range. -/// -/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, -/// -/// ```plain -/// | | | 2 | | | -/// | 1 | |-------| | 3 | -/// | | | 4 | | | -/// -/// If layer 2 and 4 contain the same single key, this is also a valid layer map. -fn check_valid_layermap(metadata: &HashMap) -> Option { - let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) - let mut all_delta_layers = Vec::new(); - for (name, _) in metadata.iter() { - if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } - } - } - for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); - } - for layer in &all_delta_layers { - let lsn_range = layer.lsn_range.clone(); - let intersects = lsn_split_point.range(lsn_range).collect_vec(); - if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); - } - } - None -} - pub(crate) async fn branch_cleanup_and_check_errors( remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, @@ -177,7 +128,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } - if let Some(err) = check_valid_layermap(&index_part.layer_metadata) { + let layer_names = index_part.layer_metadata.keys().cloned().collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { result.errors.push(format!( "index_part.json contains invalid layer map structure: {err}" )); diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index c5961753c5..ee133e2e58 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -121,8 +121,6 @@ enum Command { async fn main() -> anyhow::Result<()> { let cli = Cli::parse(); - tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); - let bucket_config = BucketConfig::from_env()?; let command_log_name = match &cli.command { @@ -142,6 +140,8 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); + tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG); + let controller_client = cli.controller_api.map(|controller_api| { ControllerClientConfig { controller_api, diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py new file mode 100644 index 0000000000..8ebaf61e5e --- /dev/null +++ b/test_runner/fixtures/auth_tokens.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Any + +import jwt + +from fixtures.common_types import TenantId + + +@dataclass +class AuthKeys: + priv: str + + def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: + token_data = {key: str(val) for key, val in token_data.items()} + token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") + # cast(Any, self.priv) + + # jwt.encode can return 'bytes' or 'str', depending on Python version or type + # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' + # explicitly. + if isinstance(token, bytes): + token = token.decode() + + return token + + def generate_pageserver_token(self) -> str: + return self.generate_token(scope=TokenScope.PAGE_SERVER_API) + + def generate_safekeeper_token(self) -> str: + return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) + + # generate token giving access to only one tenant + def generate_tenant_token(self, tenant_id: TenantId) -> str: + return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TokenScope(str, Enum): + ADMIN = "admin" + PAGE_SERVER_API = "pageserverapi" + GENERATIONS_API = "generations_api" + SAFEKEEPER_DATA = "safekeeperdata" + TENANT = "tenant" + SCRUBBER = "scrubber" diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py deleted file mode 100644 index 8aca90a097..0000000000 --- a/test_runner/fixtures/broker.py +++ /dev/null @@ -1,63 +0,0 @@ -import subprocess -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Optional - -from fixtures.log_helper import log - - -@dataclass -class NeonBroker: - """An object managing storage_broker instance""" - - logfile: Path - port: int - neon_binpath: Path - handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon - - def listen_addr(self): - return f"127.0.0.1:{self.port}" - - def client_url(self): - return f"http://{self.listen_addr()}" - - def check_status(self): - return True # TODO - - def try_start(self): - if self.handle is not None: - log.debug(f"storage_broker is already running on port {self.port}") - return - - listen_addr = self.listen_addr() - log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') - with open(self.logfile, "wb") as logfile: - args = [ - str(self.neon_binpath / "storage_broker"), - f"--listen-addr={listen_addr}", - ] - self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) - - # wait for start - started_at = time.time() - while True: - try: - self.check_status() - except Exception as e: - elapsed = time.time() - started_at - if elapsed > 5: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" - ) from e - time.sleep(0.5) - else: - break # success - - def stop(self, immediate: bool = False): - if self.handle is not None: - if immediate: - self.handle.kill() - else: - self.handle.terminate() - self.handle.wait() diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 7c4a8db36f..770b32b11e 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff +from pathlib import Path from typing import Dict, Iterator, List import pytest @@ -229,11 +230,11 @@ class VanillaCompare(PgCompare): pass # TODO find something def report_size(self): - data_size = self.pg.get_subdir_size("base") + data_size = self.pg.get_subdir_size(Path("base")) self.zenbenchmark.record( "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - wal_size = self.pg.get_subdir_size("pg_wal") + wal_size = self.pg.get_subdir_size(Path("pg_wal")) self.zenbenchmark.record( "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index cda70be8da..005dc6cb0d 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -102,6 +102,11 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]] +def counter(name: str) -> str: + # the prometheus_client package appends _total to all counters client-side + return f"{name}_total" + + PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_timeline_client_calls_started_total", "pageserver_remote_timeline_client_calls_finished_total", @@ -132,9 +137,14 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), + "pageserver_smgr_query_started_global_count_total", "pageserver_tenant_states_count", "pageserver_circuit_breaker_broken_total", "pageserver_circuit_breaker_unbroken_total", + counter("pageserver_tenant_throttling_count_accounted_start_global"), + counter("pageserver_tenant_throttling_count_accounted_finish_global"), + counter("pageserver_tenant_throttling_wait_usecs_sum_global"), + counter("pageserver_tenant_throttling_count_global"), ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( @@ -146,6 +156,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_smgr_query_started_count_total", "pageserver_archive_size", "pageserver_pitr_history_size", "pageserver_layer_bytes", @@ -157,6 +168,10 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + counter("pageserver_tenant_throttling_count_accounted_start"), + counter("pageserver_tenant_throttling_count_accounted_finish"), + counter("pageserver_tenant_throttling_wait_usecs_sum"), + counter("pageserver_tenant_throttling_count"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 50284a3f5a..fc83cf3f7c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -43,7 +43,6 @@ from urllib.parse import quote, urlparse import asyncpg import backoff import httpx -import jwt import psycopg2 import psycopg2.sql import pytest @@ -60,7 +59,7 @@ from psycopg2.extensions import make_dsn, parse_dsn from urllib3.util.retry import Retry from fixtures import overlayfs -from fixtures.broker import NeonBroker +from fixtures.auth_tokens import AuthKeys, TokenScope from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log @@ -93,6 +92,7 @@ from fixtures.utils import ( allure_add_grafana_links, allure_attach_from_dir, assert_no_errors, + get_dir_size, get_self_dir, print_gc_result, subprocess_capture, @@ -158,7 +158,7 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: yield binpath -@pytest.fixture(scope="function") +@pytest.fixture(scope="session") def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): distrib_dir = Path(env_postgres_bin).resolve() @@ -182,25 +182,6 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: yield output_dir -@pytest.fixture(scope="function") -def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: - versioned_dir = pg_distrib_dir / pg_version.v_prefixed - - psql_bin_path = versioned_dir / "bin/psql" - postgres_bin_path = versioned_dir / "bin/postgres" - - if os.getenv("REMOTE_ENV"): - # When testing against a remote server, we only need the client binary. - if not psql_bin_path.exists(): - raise Exception(f"psql not found at '{psql_bin_path}'") - else: - if not postgres_bin_path.exists(): - raise Exception(f"postgres not found at '{postgres_bin_path}'") - - log.info(f"versioned_pg_distrib_dir is {versioned_dir}") - yield versioned_dir - - @pytest.fixture(scope="session") def neon_api_key() -> str: api_key = os.getenv("NEON_API_KEY") @@ -243,36 +224,11 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int: return BASE_PORT + worker_seq_no * worker_port_num -def get_dir_size(path: str) -> int: - """Return size in bytes.""" - totalbytes = 0 - for root, _dirs, files in os.walk(path): - for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) - - return totalbytes - - @pytest.fixture(scope="session") def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) -@pytest.fixture(scope="function") -def default_broker( - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, -) -> Iterator[NeonBroker]: - # multiple pytest sessions could get launched in parallel, get them different ports/datadirs - client_port = port_distributor.get_port() - broker_logfile = test_output_dir / "repo" / "storage_broker.log" - - broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) - yield broker - broker.stop() - - @pytest.fixture(scope="session") def run_id() -> Iterator[uuid.UUID]: yield uuid.uuid4() @@ -401,44 +357,6 @@ class PgProtocol: return self.safe_psql(query, log_query=log_query)[0][0] -@dataclass -class AuthKeys: - priv: str - - def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: - token_data = {key: str(val) for key, val in token_data.items()} - token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") - # cast(Any, self.priv) - - # jwt.encode can return 'bytes' or 'str', depending on Python version or type - # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' - # explicitly. - if isinstance(token, bytes): - token = token.decode() - - return token - - def generate_pageserver_token(self) -> str: - return self.generate_token(scope=TokenScope.PAGE_SERVER_API) - - def generate_safekeeper_token(self) -> str: - return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) - - # generate token giving access to only one tenant - def generate_tenant_token(self, tenant_id: TenantId) -> str: - return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) - - -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TokenScope(str, Enum): - ADMIN = "admin" - PAGE_SERVER_API = "pageserverapi" - GENERATIONS_API = "generations_api" - SAFEKEEPER_DATA = "safekeeperdata" - TENANT = "tenant" - SCRUBBER = "scrubber" - - class NeonEnvBuilder: """ Builder object to create a Neon runtime environment @@ -453,7 +371,6 @@ class NeonEnvBuilder: self, repo_dir: Path, port_distributor: PortDistributor, - broker: NeonBroker, run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -494,7 +411,6 @@ class NeonEnvBuilder: # Safekeepers remote storage self.safekeepers_remote_storage: Optional[RemoteStorage] = None - self.broker = broker self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server self.pageserver_config_override = pageserver_config_override @@ -1006,6 +922,8 @@ class NeonEnvBuilder: self.env.storage_controller.assert_no_errors() + self.env.broker.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1059,7 +977,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: List[Safekeeper] = [] self.pageservers: List[NeonPageserver] = [] - self.broker = config.broker + self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage self.pg_version = config.pg_version @@ -1234,7 +1152,7 @@ class NeonEnv: max_workers=2 + len(self.pageservers) + len(self.safekeepers) ) as executor: futs.append( - executor.submit(lambda: self.broker.try_start() or None) + executor.submit(lambda: self.broker.start() or None) ) # The `or None` is for the linter for pageserver in self.pageservers: @@ -1291,7 +1209,7 @@ class NeonEnv: pageserver.stop(immediate=immediate) except RuntimeError: stop_later.append(pageserver) - self.broker.stop(immediate=immediate) + self.broker.stop() # TODO: for nice logging we need python 3.11 ExceptionGroup for ps in stop_later: @@ -1405,7 +1323,6 @@ def neon_simple_env( pytestconfig: Config, port_distributor: PortDistributor, mock_s3_server: MockS3Server, - default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, test_output_dir: Path, @@ -1430,7 +1347,6 @@ def neon_simple_env( top_output_dir=top_output_dir, repo_dir=repo_dir, port_distributor=port_distributor, - broker=default_broker, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, pg_distrib_dir=pg_distrib_dir, @@ -1458,7 +1374,6 @@ def neon_env_builder( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, - default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, test_overlay_dir: Path, @@ -1494,7 +1409,6 @@ def neon_env_builder( neon_binpath=neon_binpath, pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, - broker=default_broker, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1916,6 +1830,18 @@ class NeonCli(AbstractNeonCli): args.extend(["-m", "immediate"]) return self.raw_cli(args) + def broker_start( + self, timeout_in_seconds: Optional[int] = None + ) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(cmd) + + def broker_stop(self) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "stop"] + return self.raw_cli(cmd) + def endpoint_create( self, branch_name: str, @@ -3338,12 +3264,12 @@ class PgBin: ) return base_path - def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + def get_pg_controldata_checkpoint_lsn(self, pgdata: Path) -> Lsn: """ Run pg_controldata on given datadir and extract checkpoint lsn. """ - pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + pg_controldata_path = self.pg_bin_path / "pg_controldata" cmd = f"{pg_controldata_path} -D {pgdata}" result = subprocess.run(cmd, capture_output=True, text=True, shell=True) checkpoint_lsn = re.findall( @@ -3452,9 +3378,9 @@ class VanillaPostgres(PgProtocol): self.running = False self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - def get_subdir_size(self, subdir) -> int: + def get_subdir_size(self, subdir: Path) -> int: """Return size of pgdatadir subdirectory in bytes.""" - return get_dir_size(os.path.join(self.pgdatadir, subdir)) + return get_dir_size(self.pgdatadir / subdir) def __enter__(self) -> "VanillaPostgres": return self @@ -3937,9 +3863,6 @@ def static_proxy( dbname = vanilla_pg.default_options["dbname"] auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}" - # require password for 'http_auth' user - vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"]) - # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` vanilla_pg.start() vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") @@ -3981,7 +3904,7 @@ class Endpoint(PgProtocol, LogUtils): self.env = env self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA + self.pgdata_dir: Optional[Path] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port self.http_port = http_port @@ -4038,7 +3961,7 @@ class Endpoint(PgProtocol, LogUtils): allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" - self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.pgdata_dir = self.env.repo_dir / path self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] @@ -4091,21 +4014,21 @@ class Endpoint(PgProtocol, LogUtils): path = Path("endpoints") / self.endpoint_id return self.env.repo_dir / path - def pg_data_dir_path(self) -> str: + def pg_data_dir_path(self) -> Path: """Path to Postgres data directory""" - return os.path.join(self.endpoint_path(), "pgdata") + return self.endpoint_path() / "pgdata" - def pg_xact_dir_path(self) -> str: + def pg_xact_dir_path(self) -> Path: """Path to pg_xact dir""" - return os.path.join(self.pg_data_dir_path(), "pg_xact") + return self.pg_data_dir_path() / "pg_xact" - def pg_twophase_dir_path(self) -> str: + def pg_twophase_dir_path(self) -> Path: """Path to pg_twophase dir""" - return os.path.join(self.pg_data_dir_path(), "pg_twophase") + return self.pg_data_dir_path() / "pg_twophase" - def config_file_path(self) -> str: + def config_file_path(self) -> Path: """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" - return os.path.join(self.endpoint_path(), "postgresql.conf") + return self.endpoint_path() / "postgresql.conf" def config(self, lines: List[str]) -> "Endpoint": """ @@ -4160,7 +4083,7 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self, num_migrations: int = 10): + def wait_for_migrations(self, num_migrations: int = 11): with self.cursor() as cur: def check_migrations_done(): @@ -4270,7 +4193,7 @@ class Endpoint(PgProtocol, LogUtils): log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy - return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 def clear_shared_buffers(self, cursor: Optional[Any] = None): """ @@ -4639,6 +4562,40 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) +class NeonBroker(LogUtils): + """An object managing storage_broker instance""" + + def __init__(self, env: NeonEnv): + super().__init__(logfile=env.repo_dir / "storage_broker.log") + self.env = env + self.port: int = self.env.port_distributor.get_port() + self.running = False + + def start( + self, + timeout_in_seconds: Optional[int] = None, + ): + assert not self.running + self.env.neon_cli.broker_start(timeout_in_seconds) + self.running = True + return self + + def stop(self): + if self.running: + self.env.neon_cli.broker_stop() + self.running = False + return self + + def listen_addr(self): + return f"127.0.0.1:{self.port}" + + def client_url(self): + return f"http://{self.listen_addr()}" + + def assert_no_errors(self): + assert_no_errors(self.logfile, "storage_controller", []) + + # TODO: Replace with `StrEnum` when we upgrade to python 3.11 class NodeKind(str, Enum): PAGESERVER = "pageserver" diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 3e0ffabf74..97e63ed4ba 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -39,7 +39,7 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - log.info(f"duplicating template tenant {ncopies} times in S3") + log.info(f"duplicating template tenant {ncopies} times in remote storage") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index e2dd51802c..2c8e71526c 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -24,7 +24,7 @@ def build_type() -> Optional[str]: return None -@pytest.fixture(scope="function", autouse=True) +@pytest.fixture(scope="session", autouse=True) def platform() -> Optional[str]: return None diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index b3866f1813..f1ab7876f9 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -107,7 +107,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") - neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) branch_creation_durations = [] diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index 667d1a4c4a..f8d39487f2 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -43,7 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): env.neon_cli.create_branch("root") endpoint_root = env.endpoints.create_start("root") - pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"]) fork_at_current_lsn(env, endpoint_root, "child", "root") diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 29a0380524..dbf94a2cf5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -24,13 +24,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg endpoint = env.endpoints.create_start("main") - pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history") # now start subscriber vanilla_pg.start() - pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", vanilla_pg.connstr()]) vanilla_pg.safe_psql("truncate table pgbench_accounts") vanilla_pg.safe_psql("truncate table pgbench_history") @@ -99,9 +99,9 @@ def test_subscriber_lag( sub_connstr = benchmark_project_sub.connstr if benchmark_project_pub.is_new: - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env) if benchmark_project_sub.is_new: - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) @@ -193,8 +193,8 @@ def test_publisher_restart( pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) @@ -288,7 +288,7 @@ def test_snap_files( is_super = cur.fetchall()[0][0] assert is_super, "This benchmark won't work if we don't have superuser" - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env) conn = psycopg2.connect(connstr) conn.autocommit = True diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 7e16197211..49b1176d34 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -85,7 +85,7 @@ def test_ro_replica_lag( endpoint_id=replica["endpoint"]["id"], )["uri"] - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=master_env) master_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], @@ -212,7 +212,7 @@ def test_replication_start_stop( for i in range(num_replicas): replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] - pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env) # Sync replicas with psycopg2.connect(master_connstr) as conn_master: diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index f2e3855c12..d7c4cf059a 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -142,6 +142,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", + "lsn_lease_length": "0s", } ) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 0a5336f5a2..2bf7041cf1 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -11,7 +11,9 @@ from fixtures.utils import print_gc_result, query_scalar # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "0 sec", "lsn_lease_length": "0s"} + ) error_regexes = [ ".*invalid branch start lsn.*", diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index fc74707639..3d5c34a595 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -52,7 +52,7 @@ def test_branching_with_pgbench( def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-T15", connstr]) env.neon_cli.create_branch("b0", tenant_id=tenant) @@ -419,7 +419,7 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) client = env.pageserver.http_client() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index be787e0642..cb34551b53 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -240,6 +240,7 @@ def test_uploads_and_deletions( "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), + "lsn_lease_length": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 85616c3fe2..1fec8b3f18 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -291,7 +291,7 @@ def pgbench_init_tenant( ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", endpoint.connstr()]) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) return (tenant_id, timeline_id) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ae63136abb..35e0c0decb 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -199,7 +199,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): def run_pgbench(connstr: str, pg_bin: PgBin): log.info(f"Start a pgbench workload on pg {connstr}") # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr]) log.info("pgbench init done") pg_bin.run_capture(["pgbench", "-T60", connstr]) @@ -222,7 +222,7 @@ def pgbench_accounts_initialized(ep): # Without hs feedback enabled we'd see 'User query might have needed to see row # versions that must be removed.' errors. def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) agressive_vacuum_conf = [ "log_autovacuum_min_duration = 0", "autovacuum_naptime = 10s", diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 193149ea03..97093ea535 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -173,6 +173,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers "image_layer_creation_check_threshold": "0", # always check if a new image layer can be created + "lsn_lease_length": "0s", } def tenant_update_config(changes): diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index e88e56d030..7211619a99 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -14,7 +14,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - num_migrations = 10 + num_migrations = 11 endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index ba170cfb4c..b65430ff49 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -134,6 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() env.neon_cli.storage_controller_stop(False) + env.neon_cli.broker_stop() # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -176,6 +177,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): # Stop this to get out of the way of the following `start` env.neon_cli.storage_controller_stop(False) + env.neon_cli.broker_stop() # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index ebf58d2bd1..519994f774 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -53,6 +53,7 @@ TENANT_CONF = { # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", + "lsn_lease_length": "0s", } @@ -134,7 +135,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_configs() - env.broker.try_start() + env.broker.start() for sk in env.safekeepers: sk.start() env.storage_controller.start() diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 24a37b04ec..37ab51f9fb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -74,7 +74,7 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", + ".*metrics_collection: failed to upload to remote storage: Failed to upload data of length .* to storage path.*", ] ) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index 37ff923632..ada6da98ff 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -22,7 +22,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index 65569f3bac..9bb9b373ad 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -19,7 +19,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py index 4145a303c6..d8626c15a5 100644 --- a/test_runner/regress/test_postgres_version.py +++ b/test_runner/regress/test_postgres_version.py @@ -30,9 +30,8 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): version = match.group("version") commit = match.group("commit") - if "." in version: - assert ( - pg_version.v_prefixed in expected_revisions - ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" - msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" - assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 2e5260ca78..0a57fc9605 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -244,6 +244,7 @@ def test_remote_storage_upload_queue_retries( # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", + "lsn_lease_length": "0s", } ) @@ -391,6 +392,7 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", + "lsn_lease_length": "0s", } ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4a84dca399..1eb33b2d39 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -200,6 +200,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # Disable automatic creation of image layers, as we will create them explicitly when we want them "image_creation_threshold": 9999, "image_layer_creation_check_threshold": 0, + "lsn_lease_length": "0s", } neon_env_builder.storage_controller_config = { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index eea05d7548..4106efd4f9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest +from fixtures.auth_tokens import TokenScope from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log @@ -18,7 +19,6 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, StorageControllerLeadershipStatus, - TokenScope, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -69,7 +69,7 @@ def test_storage_controller_smoke( env = neon_env_builder.init_configs() # Start services by hand so that we can skip a pageserver (this will start + register later) - env.broker.try_start() + env.broker.start() env.storage_controller.start() env.pageservers[0].start() env.pageservers[1].start() @@ -292,7 +292,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() - env.broker.try_start() + env.broker.start() env.storage_controller.start() # This is the pageserver where we'll initially create the tenant. Run it in emergency @@ -485,7 +485,7 @@ def test_storage_controller_compute_hook( httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) # Start running - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) # Initial notification from tenant creation assert len(notifications) == 1 @@ -2048,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) - env.storage_controller.allowed_errors.append( - ".*Accepted configuration update but reconciliation failed.*" + env.storage_controller.allowed_errors.extend( + [ + ".*Accepted configuration update but reconciliation failed.*", + ".*Leader is stepped down instance", + ] ) observed_state = env.storage_controller.step_down() @@ -2072,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): assert "compaction_threshold" in ps_tenant_conf.effective_config assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 - # Validate that the storcon is not replying to the usual requests - # once it has stepped down. - with pytest.raises(StorageControllerApiException, match="stepped_down"): + # Validate that the storcon attempts to forward the request, but stops. + # when it realises it is still the current leader. + with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): env.storage_controller.tenant_list() # Validate that we can step down multiple times and the observed state @@ -2123,7 +2126,7 @@ def start_env(env: NeonEnv, storage_controller_port: int): max_workers=2 + len(env.pageservers) + len(env.safekeepers) ) as executor: futs.append( - executor.submit(lambda: env.broker.try_start() or None) + executor.submit(lambda: env.broker.start() or None) ) # The `or None` is for the linter for pageserver in env.pageservers: @@ -2221,6 +2224,15 @@ def test_storage_controller_leadership_transfer( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_list() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" + if step_down_times_out: env.storage_controller.allowed_errors.extend( [ diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 848e214c5e..b6c19f03f6 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -204,6 +204,7 @@ def test_scrubber_physical_gc_ancestors( # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas # and makes them GC'able "pitr_interval": "0s", + "lsn_lease_length": "0s", }, ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index b165588636..e7c6d5a4c3 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -266,13 +266,13 @@ def test_tenant_reattach_while_busy( def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) pageserver_http = env.pageserver.http_client() env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) # create new nenant - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline # assert tenant exists on disk assert env.pageserver.tenant_dir(tenant_id).exists() diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 840c7159ad..094dd20529 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -106,7 +106,7 @@ def test_threshold_based_eviction( # create a bunch of layers with env.endpoints.create_start("main", tenant_id=tenant_id) as pg: - pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()]) + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s", "3", pg.connstr()]) last_flush_lsn_upload(env, pg, tenant_id, timeline_id) # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint for sk in env.safekeepers: diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index ddfe9b911f..765c72cf2a 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -45,7 +45,10 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool tenant_after = http.tenant_status(env.initial_tenant) assert tenant_before != tenant_after gc_blocking = tenant_after["gc_blocking"] - assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + assert ( + gc_blocking + == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }" + ) wait_for_another_gc_round() pss.assert_log_contains(gc_skipped_line) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 50fac441c0..8ee548bdb0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -19,7 +19,6 @@ import psycopg2.errors import psycopg2.extras import pytest import requests -from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics @@ -893,6 +892,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): log.info(f"debug_dump before reboot {debug_dump_0}") assert debug_dump_0["timelines_count"] == 1 assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id) + assert debug_dump_0["timelines"][0]["wal_last_modified"] != "" endpoint.safe_psql("create table t(i int)") @@ -1439,11 +1439,7 @@ class SafekeeperEnv: ): self.repo_dir = repo_dir self.port_distributor = port_distributor - self.broker = NeonBroker( - logfile=Path(self.repo_dir) / "storage_broker.log", - port=self.port_distributor.get_port(), - neon_binpath=neon_binpath, - ) + self.fake_broker_endpoint = f"http://127.0.0.1:{port_distributor.get_port()}" self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = str(neon_binpath / "safekeeper") @@ -1492,7 +1488,7 @@ class SafekeeperEnv: "--id", str(i), "--broker-endpoint", - self.broker.client_url(), + self.fake_broker_endpoint, ] log.info(f'Running command "{" ".join(cmd)}"') diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a317b9b5b9..87cb68f899 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7 +Subproject commit 87cb68f899db434cd6f1908cf0ac8fdeafdd88c1 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6f6d77fb59..72b904c0b3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9 +Subproject commit 72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 0baa7346df..3ec6e2496f 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e +Subproject commit 3ec6e2496f64c6fec35c67cb82efd6490a6a4738 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 9156d63ce2..5bbb9bd93d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa +Subproject commit 5bbb9bd93dd805e90bd8af15d00080363d18ec68 diff --git a/vendor/revisions.json b/vendor/revisions.json index 3a65a507f3..6289a53670 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,18 @@ { + "v17": [ + "17rc1", + "5bbb9bd93dd805e90bd8af15d00080363d18ec68" + ], "v16": [ "16.4", - "0baa7346dfd42d61912eeca554c9bb0a190f0a1e" + "3ec6e2496f64c6fec35c67cb82efd6490a6a4738" ], "v15": [ "15.8", - "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9" + "72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1" ], "v14": [ "14.13", - "a317b9b5b96978b49e78986697f3dd80d06f99a7" + "87cb68f899db434cd6f1908cf0ac8fdeafdd88c1" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 140c43639e..662916d42c 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,6 +38,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -88,6 +89,8 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] }