diff --git a/CODEOWNERS b/CODEOWNERS index b8ca54bc7e..e384dc39f1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -5,7 +5,7 @@ /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute -/pageserver/ @neondatabase/compute @neondatabase/storage +/pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/safekeepers diff --git a/Cargo.lock b/Cargo.lock index 2055f001af..36e7069eb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1780,18 +1780,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] - -[[package]] -name = "hermit-abi" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] name = "hex" @@ -2053,7 +2044,7 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ - "hermit-abi 0.3.1", + "hermit-abi", "libc", "windows-sys 0.48.0", ] @@ -2070,7 +2061,7 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ - "hermit-abi 0.3.1", + "hermit-abi", "io-lifetimes", "rustix 0.37.19", "windows-sys 0.48.0", @@ -2444,11 +2435,11 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi", "libc", ] @@ -3246,6 +3237,7 @@ dependencies = [ "reqwest-tracing", "routerify", "rstest", + "rustc-hash", "rustls", "rustls-pemfile", "scopeguard", @@ -3417,6 +3409,7 @@ dependencies = [ "metrics", "once_cell", "pin-project-lite", + "rand", "scopeguard", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 4fe3069822..b0bcf69039 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ reqwest-middleware = "0.2.0" reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" +rustc-hash = "1.1.0" rustls = "0.21" rustls-pemfile = "1" rustls-split = "0.3" diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 55eb9b7411..7e34b66d68 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -615,11 +615,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre ######################################################################################### # # Layer "rust extensions" -# This layer is used to build `pgx` deps -# -# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from -# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx -# dependency on all the rust extension that depend on it, too. +# This layer is used to build `pgrx` deps # ######################################################################################### FROM build-deps AS rust-extensions-build @@ -635,22 +631,12 @@ USER nonroot WORKDIR /home/nonroot ARG PG_VERSION -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16") \ - echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \ - ;; \ - esac && \ - curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.7.3 cargo-pgx && \ - /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + cargo install --locked --version 0.10.2 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -664,23 +650,11 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023 -# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5 -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16") \ - echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ - echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ + echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgx install --release && \ + sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control ######################################################################################### @@ -693,26 +667,11 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch) -# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in -# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the -# same 1.1 version we've used before. -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16") \ - echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ - echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ + echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \ - cargo pgx install --release && \ + sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control @@ -727,21 +686,11 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build AS pg-tiktoken-pg-build ARG PG_VERSION -# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023 -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16") \ - echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ - echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \ +# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 +RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ + echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ - cargo pgx install --release && \ + cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### @@ -754,21 +703,15 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16") \ - echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ - echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ + echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgx install --release && \ + echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ + wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ + patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ + echo "********************************************************************************************************" && \ + sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control ######################################################################################### diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index e879646b63..d4bca59c7b 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap if attach_req.pageserver_id.is_some() { tenant_state.generation += 1; } + tenant_state.pageserver = attach_req.pageserver_id; let generation = tenant_state.generation; locked.save().await.map_err(ApiError::InternalServerError)?; diff --git a/deny.toml b/deny.toml index 55c581ce3a..f4ea0d4dac 100644 --- a/deny.toml +++ b/deny.toml @@ -23,7 +23,7 @@ vulnerability = "deny" unmaintained = "warn" yanked = "warn" notice = "warn" -ignore = ["RUSTSEC-2023-0052"] +ignore = [] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md new file mode 100644 index 0000000000..f708f641aa --- /dev/null +++ b/docs/rfcs/028-pageserver-migration.md @@ -0,0 +1,599 @@ +# Seamless tenant migration + +- Author: john@neon.tech +- Created on 2023-08-11 +- Implemented on .. + +## Summary + +The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant +migration safe". Following that, +this RFC is about how those migrations are to be done: + +1. Seamlessly (without interruption to client availability) +2. Quickly (enabling faster operations) +3. Efficiently (minimizing I/O and $ cost) + +These points are in priority order: if we have to sacrifice +efficiency to make a migration seamless for clients, we will +do so, etc. + +This is accomplished by introducing two high level changes: + +- A dual-attached state for tenants, used in a control-plane-orchestrated + migration procedure that preserves availability during a migration. +- Warm secondary locations for tenants, where on-disk content is primed + for a fast migration of the tenant from its current attachment to this + secondary location. + +## Motivation + +Migrating tenants between pageservers is essential to operating a service +at scale, in several contexts: + +1. Responding to a pageserver node failure by migrating tenants to other pageservers +2. Balancing load and capacity across pageservers, for example when a user expands their + database and they need to migrate to a pageserver with more capacity. +3. Restarting pageservers for upgrades and maintenance + +The current situation steps for migration are: + +- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)). +- attach to new node +- re-configure endpoints to use the new node + +Once [generation numbers](025-generation-numbers.md) are implemented, +the detach step is no longer critical for correctness. So, we can + +- attach to a new node, +- re-configure endpoints to use the new node, and then +- detach from the old node. + +However, this still does not meet our seamless/fast/efficient goals: + +- Not fast: The new node will have to download potentially large amounts + of data from S3, which may take many minutes. +- Not seamless: If we attach to a new pageserver before detaching an old one, + the new one might delete some objects that interrupt availability of reads on the old one. +- Not efficient: the old pageserver will continue uploading + S3 content during the migration that will never be read. + +The user expectations for availability are: + +- For planned maintenance, there should be zero availability + gap. This expectation is fulfilled by this RFC. +- For unplanned changes (e.g. node failures), there should be + minimal availability gap. This RFC provides the _mechanism_ + to fail over quickly, but does not provide the failure _detection_ + nor failover _policy_. + +## Non Goals + +- Defining service tiers with different storage strategies: the same + level of HA & overhead will apply to all tenants. This doesn't rule out + adding such tiers in future. +- Enabling pageserver failover in the absence of a control plane: the control + plane will remain the source of truth for what should be attached where. +- Totally avoiding availability gaps on unplanned migrations during + a failure (we expect a small, bounded window of + read unavailability of very recent LSNs) +- Workload balancing: this RFC defines the mechanism for moving tenants + around, not the higher level logic for deciding who goes where. +- Defining all possible configuration flows for tenants: the migration process + defined in this RFC demonstrates the sufficiency of the pageserver API, but + is not the only kind of configuration change the control plane will ever do. + The APIs defined here should let the control plane move tenants around in + whatever way is needed while preserving data safety and read availability. + +## Impacted components + +Pageserver, control plane + +## Terminology + +- **Attachment**: a tenant is _attached_ to a pageserver if it has + been issued a generation number, and is running an instance of + the `Tenant` type, ingesting the WAL, and available to serve + page reads. +- **Location**: locations are a superset of attachments. A location + is a combination of a tenant and a pageserver. We may _attach_ at a _location_. + +- **Secondary location**: a location which is not currently attached. +- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features. + +## Implementation (high level) + +### Warm secondary locations + +To enable faster migrations, we will identify at least one _secondary location_ +for each tenant. This secondary location will keep a warm cache of layers +for the tenant, so that if it is later attached, it can catch up with the +latest LSN quickly: rather than downloading everything, it only has to replay +the recent part of the WAL to advance from the remote_consistent_offset to the +most recent LSN in the WAL. + +The control plane is responsible for selecting secondary locations, and +calling into pageservers to configure tenants into a secondary mode at this +new location, as well as attaching the tenant in its existing primary location. + +The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap) +to advise secondaries of which layers should be downloaded. + +### Location modes + +Currently, we consider a tenant to be in one of two states on a pageserver: + +- Attached: active `Tenant` object, and layers on local disk +- Detached: no layers on local disk, no runtime state. + +We will extend this with finer-grained modes, whose purpose will become +clear in later sections: + +- **AttachedSingle**: equivalent the existing attached state. +- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but + does not do deletions. +- **AttachedStale**: like AttachedSingle, holds a stale generation, + do not do any remote storage operations. +- **Secondary**: keep local state on disk, periodically update from S3. +- **Detached**: equivalent to existing detached state. + +To control these finer grained states, a new pageserver API endpoint will be added. + +### Cutover procedure + +Define old location and new location as "Node A" and "Node B". Consider +the case where both nodes are available, and Node B was previously configured +as a secondary location for the tenant we are migrating. + +The cutover procedure is orchestrated by the control plane, calling into +the pageservers' APIs: + +1. Call to Node A requesting it to flush to S3 and enter AttachedStale state +2. Increment generation, and call to Node B requesting it to enter AttachedMulti + state with the new generation. +3. Call to Node B, requesting it to download the latest hot layers from remote storage, + according to the latest heatmap flushed by Node A. +4. Wait for Node B's WAL ingestion to catch up with node A's +5. Update endpoints to use node B instead of node A +6. Call to node B requesting it to enter state AttachedSingle. +7. Call to node A requesting it to enter state Secondary + +The following table summarizes how the state of the system advances: + +| Step | Node A | Node B | Node used by endpoints | +| :-----------: | :------------: | :------------: | :--------------------: | +| 1 (_initial_) | AttachedSingle | Secondary | A | +| 2 | AttachedStale | AttachedMulti | A | +| 3 | AttachedStale | AttachedMulti | A | +| 4 | AttachedStale | AttachedMulti | A | +| 5 (_cutover_) | AttachedStale | AttachedMulti | B | +| 6 | AttachedStale | AttachedSingle | B | +| 7 (_final_) | Secondary | AttachedSingle | B | + +The procedure described for a clean handover from a live node to a secondary +is also used for failure cases and for migrations to a location that is not +configured as a secondary, by simply skipping irrelevant steps, as described in +the following sections. + +#### Migration from an unresponsive node + +If node A is unavailable, then all calls into +node A are skipped and we don't wait for B to catch up before +switching updating the endpoints to use B. + +#### Migration to a location that is not a secondary + +If node B is initially in Detached state, the procedure is identical. Since Node B +is coming from a Detached state rather than Secondary, the download of layers and +catch up with WAL will take much longer. + +We might do this if: + +- Attached and secondary locations are both critically low on disk, and we need + to migrate to a third node with more resources available. +- We are migrating a tenant which does not use secondary locations to save on cost. + +#### Permanent migration away from a node + +In the final step of the migration, we generally request the original node to enter a Secondary +state. This is typical if we are doing a planned migration during maintenance, or to +balance CPU/network load away from a node. + +One might also want to permanently migrate away: this can be done by simply removing the secondary +location after the migration is complete, or as an optimization by substituting the Detached state +for the Secondary state in the final step. + +#### Cutover diagram + +```mermaid +sequenceDiagram +participant CP as Control plane +participant A as Node A +participant B as Node B +participant E as Endpoint + +CP->>A: PUT Flush & go to AttachedStale +note right of A: A continues to ingest WAL +CP->>B: PUT AttachedMulti +CP->>B: PUT Download layers from latest heatmap +note right of B: B downloads from S3 +loop Poll until download complete +CP->>B: GET download status +end +activate B +note right of B: B ingests WAL +loop Poll until catch up +CP->>B: GET visible WAL +CP->>A: GET visible WAL +end +deactivate B +CP->>E: Configure to use Node B +E->>B: Connect for reads +CP->>B: PUT AttachedSingle +CP->>A: PUT Secondary +``` + +#### Cutover from an unavailable pageserver + +This case is far simpler: we may skip straight to our intended +end state. + +```mermaid +sequenceDiagram +participant A as Node A +participant CP as Control plane +participant B as Node B +participant E as Endpoint + +note right of A: Node A offline +activate A +CP->>B: PUT AttachedSingle +CP->>E: Configure to use Node B +E->>B: Connect for reads +deactivate A +``` + +## Implementation (detail) + +### Purpose of AttachedMulti, AttachedStale + +#### AttachedMulti + +Ordinarily, an attached pageserver whose generation is the latest may delete +layers at will (e.g. during compaction). If a previous generation pageserver +is also still attached, and in use by endpoints, then this layer deletion could +lead to a loss of availability for the endpoint when reading from the previous +generation pageserver. + +The _AttachedMulti_ state simply disables deletions. These will be enqueued +in `RemoteTimelineClient` until the control plane transitions the +node into AttachedSingle, which unblocks deletions. Other remote storage operations +such as uploads are not blocked. + +AttachedMulti is not required for data safety, only to preserve availability +on pageservers running with stale generations. + +A node enters AttachedMulti only when explicitly asked to by the control plane. It should +only remain in this state for the duration of a migration. + +If a control plane bug leaves +the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued +deletions. This may be accomplished simply, by dropping enqueued deletions when some modest +threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions, +it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or +by timeline deletion. + +During AttachedMulti, the Tenant is free to drop layers from local disk in response to +disk pressure: only the deletion of remote layers is blocked. + +#### AttachedStale + +Currently, a pageserver with a stale generation number will continue to +upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation +will not be read back by future generations of pageservers. + +The _AttachedStale_ state disables S3 uploads. The stale pageserver +will continue to ingest the WAL and write layers to local disk, but not to +do any uploads to S3. + +A node may enter AttachedStale in two ways: + +- Explicitly, when control plane calls into the node at the start of a migration. +- Implicitly, when the node tries to validate some deletions and discovers + that its generation is stale. + +The AttachedStale state also disables sending consumption metrics from +that location: it is interpreted as an indication that some other pageserver +is already attached or is about to be attached, and that new pageserver will +be responsible for sending consumption metrics. + +#### Disk Pressure & AttachedStale + +Over long periods of time, a tenant location in AttachedStale will accumulate data +on local disk, as it cannot evict any layers written since it entered the +AttachStale state. We rely on the control plane to revert the location to +Secondary or Detached at the end of a migration. + +This scenario is particularly noteworthy when evacuating all tenants on a pageserver: +since _all_ the attached tenants will go into AttachedStale, we will be doing no +uploads at all, therefore ingested data will cause disk usage to increase continuously. +Under nominal conditions, the available disk space on pageservers should be sufficient +to complete the evacuation before this becomes a problem, but we must also handle +the case where we hit a low disk situation while in this state. + +The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task` +touches each Tenant when it determines that a low-disk condition requires +some layer eviction. Having selected layers for eviction, the eviction +task calls `Timeline::evict_layers`. + +**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted +layers are not yet uploaded to S3, then the block on uploads will be lifted. This +will result in leaking some objects once a migration is complete, but will enable +the node to manage its disk space properly: if a node is left with some tenants +in AttachedStale indefinitely due to a network partition or control plane bug, +these tenants will not cause a full disk condition. + +### Warm secondary updates + +#### Layer heatmap + +The secondary location's job is to serve reads **with the same quality of service as the original location +was serving them around the time of a migration**. This does not mean the secondary +location needs the whole set of layers: inactive layers that might soon +be evicted on the attached pageserver need not be downloaded by the +secondary. A totally idle tenant only needs to maintain enough on-disk +state to enable a fast cold start (i.e. the most recent image layers are +typically sufficient). + +To enable this, we introduce the concept of a _layer heatmap_, which +acts as an advisory input to secondary locations to decide which +layers to download from S3. + +#### Attached pageserver + +The attached pageserver, if in state AttachedSingle, periodically +uploads a serialized heat map to S3. It may skip this if there +is no change since the last time it uploaded (e.g. if the tenant +is totally idle). + +Additionally, when the tenant is flushed to remote storage prior to a migration +(the first step in [cutover procedure](#cutover-procedure)), +the heatmap is written out. This enables a future attached pageserver +to get an up to date view when deciding which layers to download. + +#### Secondary location behavior + +Secondary warm locations run a simple loop, implemented separately from +the main `Tenant` type, which represents attached tenants: + +- Download the layer heatmap +- Select any "hot enough" layers to download, if there is sufficient + free disk space. +- Download layers, if they were not previously evicted (see below) +- Download the latest index_part.json +- Check if any layers currently on disk are no longer referenced by + IndexPart & delete them + +Note that the heatmap is only advisory: if a secondary location has plenty +of disk space, it may choose to retain layers that aren't referenced +by the heatmap, as long as they are still referenced by the IndexPart. Conversely, +if a node is very low on disk space, it might opt to raise the heat threshold required +to both downloading a layer, until more disk space is available. + +#### Secondary locations & disk pressure + +Secondary locations are subject to eviction on disk pressure, just as +attached locations are. For eviction purposes, the access time of a +layer in a secondary location will be the access time given in the heatmap, +rather than the literal time at which the local layer file was accessed. + +The heatmap will indicate which layers are in local storage on the attached +location. The secondary will always attempt to get back to having that +set of layers on disk, but to avoid flapping, it will remember the access +time of the layer it was most recently asked to evict, and layers whose +access time is below that will not be re-downloaded. + +The resulting behavior is that after a layer is evicted from a secondary +location, it is only re-downloaded once the attached pageserver accesses +the layer and uploads a heatmap reflecting that access time. On a pageserver +restart, the secondary location will attempt to download all layers in +the heatmap again, if they are not on local disk. + +This behavior will be slightly different when secondary locations are +used for "low energy tenants", but that is beyond the scope of this RFC. + +### Location configuration API + +Currently, the `/tenant//config` API defines various +tunables like compaction settings, which apply to the tenant irrespective +of which pageserver it is running on. + +A new "location config" structure will be introduced, which defines +configuration which is per-tenant, but local to a particular pageserver, +such as the attachment mode and whether it is a secondary. + +The pageserver will expose a new per-tenant API for setting +the state: `/tenant//location/config`. + +Body content: + +``` +{ + state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}', + generation: Option, + configuration: `Option` + flush: bool +} +``` + +Existing `/attach` and `/detach` endpoint will have the same +behavior as calling `/location/config` with `AttachedSingle` and `Detached` +states respectively. These endpoints will be deprecated and later +removed. + +The generation attribute is mandatory for entering `AttachedSingle` or +`AttachedMulti`. + +The configuration attribute is mandatory when entering any state other +than `Detached`. This configuration is the same as the body for +the existing `/tenant//config` endpoint. + +The `flush` argument indicates whether the pageservers should flush +to S3 before proceeding: this only has any effect if the node is +currently in AttachedSingle or AttachedMulti. This is used +during the first phase of migration, when transitioning the +old pageserver to AttachedSingle. + +The `/re-attach` API response will be extended to include a `state` as +well as a `generation`, enabling the pageserver to enter the +correct state for each tenant on startup. + +### Database schema for locations + +A new table `ProjectLocation`: + +- pageserver_id: int +- tenant_id: TenantId +- generation: Option +- state: `enum(Secondary, AttachedSingle, AttachedMulti)` + +Notes: + +- It is legacy for a Project to have zero `ProjectLocation`s +- The `pageserver` column in `Project` now means "to which pageserver should + endpoints connect", rather than simply which pageserver is attached. +- The `generation` column in `Project` remains, and is incremented and used + to set the generation of `ProjectLocation` rows when they are set into + an attached state. +- The `Detached` state is implicitly represented as the absence of + a `ProjectLocation`. + +### Executing migrations + +Migrations will be implemented as Go functions, within the +existing `Operation` framework in the control plane. These +operations are persistent, such that they will always keep +trying until completion: this property is important to avoid +leaving garbage behind on pageservers, such as AttachedStale +locations. + +### Recovery from failures during migration + +During migration, the control plane may encounter failures of either +the original or new pageserver, or both: + +- If the original fails, skip past waiting for the new pageserver + to catch up, and put it into AttachedSingle immediately. +- If the new node fails, put the old pageserver into Secondary + and then back into AttachedSingle (this has the effect of + retaining on-disk state and granting it a fresh generation number). +- If both nodes fail, keep trying until one of them is available + again. + +### Control plane -> Pageserver reconciliation + +A migration may be done while the old node is unavailable, +in which case the old node may still be running in an AttachedStale +state. + +In this case, it is undesirable to have the migration `Operation` +stay alive until the old node eventually comes back online +and can be cleaned up. To handle this, the control plane +should run a background reconciliation process to compare +a pageserver's attachments with the database, and clean up +any that shouldn't be there any more. + +Note that there will be no work to do if the old node was really +offline, as during startup it will call into `/re-attach` and +be updated that way. The reconciliation will only be needed +if the node was unavailable but still running. + +## Alternatives considered + +### Only enabling secondary locations for tenants on a higher service tier + +This will make sense in future, especially for tiny databases that may be +downloaded from S3 in milliseconds when needed. + +However, it is not wise to do it immediately, because pageservers contain +a mixture of higher and lower tier workloads. If we had 1 tenant with +a secondary location and 9 without, then those other 9 tenants will do +a lot of I/O as they try to recover from S3, which may degrade the +service of the tenant which had a secondary location. + +Until we segregate tenant on different service tiers on different pageserver +nodes, or implement & test QoS to ensure that tenants with secondaries are +not harmed by tenants without, we should use the same failover approach +for all the tenants. + +### Hot secondary locations (continuous WAL replay) + +Instead of secondary locations populating their caches from S3, we could +have them consume the WAL from safekeepers. The downsides of this would be: + +- Double load on safekeepers, which are a less scalable service than S3 +- Secondary locations' on-disk state would end up subtly different to + the remote state, which would make synchronizing with S3 more complex/expensive + when going into attached state. + +The downside of only updating secondary locations from S3 is that we will +have a delay during migration from replaying the LSN range between what's +in S3 and what's in the pageserver. This range will be very small on +planned migrations, as we have the old pageserver flush to S3 immediately +before attaching the new pageserver. On unplanned migrations (old pageserver +is unavailable), the range of LSNs to replay is bounded by the flush frequency +on the old pageserver. However, the migration doesn't have to wait for the +replay: it's just that not-yet-replayed LSNs will be unavailable for read +until the new pageserver catches up. + +We expect that pageserver reads of the most recent LSNs will be relatively +rare, as for an active endpoint those pages will usually still be in the postgres +page cache: this leads us to prefer synchronizing from S3 on secondary +locations, rather than consuming the WAL from safekeepers. + +### Cold secondary locations + +It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then +we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes). + +Warm caches on secondary locations are necessary to meet +our availability goals. + +### Pageserver-granularity failover + +Instead of migrating tenants individually, we could have entire spare nodes, +and on a node death, move all its work to one of these spares. + +This approach is avoided for several reasons: + +- we would still need fine-grained tenant migration for other + purposes such as balancing load +- by sharing the spare capacity over many peers rather than one spare node, + these peers may use the capacity for other purposes, until it is needed + to handle migrated tenants. e.g. for keeping a deeper cache of their + attached tenants. + +### Readonly during migration + +We could simplify migrations by making both previous and new nodes go into a +readonly state, then flush remote content from the previous node, then activate +attachment on the secondary node. + +The downside to this approach is a potentially large gap in readability of +recent LSNs while loading data onto the new node. To avoid this, it is worthwhile +to incur the extra cost of double-replaying the WAL onto old and new nodes' local +storage during a migration. + +### Peer-to-peer pageserver communication + +Rather than uploading the heatmap to S3, attached pageservers could make it +available to peers. + +Currently, pageservers have no peer to peer communication, so adding this +for heatmaps would incur significant overhead in deployment and configuration +of the service, and ensuring that when a new pageserver is deployed, other +pageservers are updated to be aware of it. + +As well as simplifying implementation, putting heatmaps in S3 will be useful +for future analytics purposes -- gathering aggregated statistics on activity +pattersn across many tenants may be done directly from data in S3. diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 7b133c61af..9e89327e84 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000; // Just a wrapper around a slice of events // to serialize it as `{"events" : [ ] } -#[derive(serde::Serialize)] +#[derive(serde::Serialize, serde::Deserialize)] pub struct EventChunk<'a, T: Clone> { pub events: std::borrow::Cow<'a, [T]>, } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f354296be2..68620787bb 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -363,8 +363,15 @@ pub struct TimelineInfo { pub latest_gc_cutoff_lsn: Lsn, #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, + + /// The LSN that we have succesfully uploaded to remote storage #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, + + /// The LSN that we are advertizing to safekeepers + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn_visible: Lsn, + pub current_logical_size: Option, // is None when timeline is Unloaded /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index a4adae6146..2b808779f4 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -29,3 +29,4 @@ workspace_hack.workspace = true [dev-dependencies] tempfile.workspace = true test-context.workspace = true +rand.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 1ddd156a08..a92b87632b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -20,6 +20,7 @@ use std::{ use anyhow::{bail, Context}; +use serde::{Deserialize, Serialize}; use tokio::io; use toml_edit::Item; use tracing::info; @@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; +/// As defined in S3 docs +pub const MAX_KEYS_PER_DELETE: usize = 1000; + const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; /// Path on the remote storage, relative to some inner prefix. @@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct RemotePath(PathBuf); +impl Serialize for RemotePath { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(self) + } +} + +impl<'de> Deserialize<'de> for RemotePath { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let str = String::deserialize(deserializer)?; + Ok(Self(PathBuf::from(&str))) + } +} + impl std::fmt::Display for RemotePath { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0.display()) @@ -88,6 +111,10 @@ impl RemotePath { pub fn extension(&self) -> Option<&str> { self.0.extension()?.to_str() } + + pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> { + self.0.strip_prefix(&p.0) + } } /// Storage (potentially remote) API to manage its state. diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 9262f1e88f..fc6d7fa61b 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -33,11 +33,10 @@ use tracing::debug; use super::StorageMetadata; use crate::{ - Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000; - pub(super) mod metrics; use self::metrics::{AttemptOutcome, RequestKind}; @@ -48,10 +47,47 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + concurrency_limiter: ConcurrencyLimiter, +} + +struct ConcurrencyLimiter { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Arc, + write: Arc, + read: Arc, +} + +impl ConcurrencyLimiter { + fn for_kind(&self, kind: RequestKind) -> &Arc { + match kind { + RequestKind::Get => &self.read, + RequestKind::Put => &self.write, + RequestKind::List => &self.read, + RequestKind::Delete => &self.write, + } + } + + async fn acquire( + &self, + kind: RequestKind, + ) -> Result, tokio::sync::AcquireError> { + self.for_kind(kind).acquire().await + } + + async fn acquire_owned( + &self, + kind: RequestKind, + ) -> Result { + Arc::clone(self.for_kind(kind)).acquire_owned().await + } + + fn new(limit: usize) -> ConcurrencyLimiter { + Self { + read: Arc::new(Semaphore::new(limit)), + write: Arc::new(Semaphore::new(limit)), + } + } } #[derive(Default)] @@ -118,7 +154,7 @@ impl S3Bucket { bucket_name: aws_config.bucket_name.clone(), max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), + concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), }) } @@ -157,7 +193,7 @@ impl S3Bucket { let started_at = start_counting_cancelled_wait(kind); let permit = self .concurrency_limiter - .acquire() + .acquire(kind) .await .expect("semaphore is never closed"); @@ -173,8 +209,7 @@ impl S3Bucket { let started_at = start_counting_cancelled_wait(kind); let permit = self .concurrency_limiter - .clone() - .acquire_owned() + .acquire_owned(kind) .await .expect("semaphore is never closed"); @@ -500,7 +535,7 @@ impl RemoteStorage for S3Bucket { delete_objects.push(obj_id); } - for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) { + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { let started_at = start_measuring_requests(kind); let resp = self diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 982c01a9be..b220349829 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -378,21 +378,30 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { fn create_s3_client( max_keys_per_list_response: Option, ) -> anyhow::Result> { + use rand::Rng; + let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?; - let random_prefix_part = std::time::SystemTime::now() + + // due to how time works, we've had test runners use the same nanos as bucket prefixes. + // millis is just a debugging aid for easier finding the prefix later. + let millis = std::time::SystemTime::now() .duration_since(UNIX_EPOCH) .context("random s3 test prefix part calculation")? - .as_nanos(); + .as_millis(); + + // because nanos can be the same for two threads so can millis, add randomness + let random = rand::thread_rng().gen::(); + let remote_storage_config = RemoteStorageConfig { max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), max_sync_errors: NonZeroU32::new(5).unwrap(), storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: remote_storage_s3_bucket, bucket_region: remote_storage_s3_region, - prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")), + prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")), endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 163c8c0467..88d50905c6 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -89,6 +89,22 @@ impl Generation { Self::Broken => panic!("Attempted to use a broken generation"), } } + + pub fn next(&self) -> Generation { + match self { + Self::Valid(n) => Self::Valid(*n + 1), + Self::None => Self::Valid(1), + Self::Broken => panic!("Attempted to use a broken generation"), + } + } + + pub fn into(self) -> Option { + if let Self::Valid(v) = self { + Some(v) + } else { + None + } + } } impl Serialize for Generation { diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 527e486fd0..dd54cd6ecd 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -24,6 +24,9 @@ pub enum ApiError { #[error("Precondition failed: {0}")] PreconditionFailed(Box), + #[error("Shutting down")] + ShuttingDown, + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -52,6 +55,10 @@ impl ApiError { self.to_string(), StatusCode::PRECONDITION_FAILED, ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f69c0603d5..7f17970c4c 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -216,6 +216,24 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> { } } +/// When you will store a secret but want to make sure it won't +/// be accidentally logged, wrap it in a SecretString, whose Debug +/// implementation does not expose the contents. +#[derive(Clone, Eq, PartialEq)] +pub struct SecretString(String); + +impl SecretString { + pub fn get_contents(&self) -> &str { + self.0.as_str() + } +} + +impl std::fmt::Debug for SecretString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[SECRET]") + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 3254fa4501..15e972505e 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -431,14 +431,14 @@ impl CgroupWatcher { .context("failed to request upscale")?; let memory_high = - self.get_high_bytes().context("failed to get memory.high")?; + self.get_memory_high_bytes().context("failed to get memory.high")?; let new_high = memory_high + self.config.memory_high_increase_by_bytes; info!( current_high_bytes = memory_high, new_high_bytes = new_high, "updating memory.high" ); - self.set_high_bytes(new_high) + self.set_memory_high_bytes(new_high) .context("failed to set memory.high")?; last_memory_high_increase_at = Some(Instant::now()); continue; @@ -556,14 +556,6 @@ impl CgroupWatcher { } } -/// Represents a set of limits we apply to a cgroup to control memory usage. -/// -/// Setting these values also affects the thresholds for receiving usage alerts. -#[derive(Debug)] -pub struct MemoryLimits { - pub high: u64, -} - // Methods for manipulating the actual cgroup impl CgroupWatcher { /// Get a handle on the freezer subsystem. @@ -624,50 +616,29 @@ impl CgroupWatcher { } /// Set cgroup memory.high threshold. - pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> { + pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> { + self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)) + } + + /// Set the cgroup's memory.high to 'max', disabling it. + pub fn unset_memory_high(&self) -> anyhow::Result<()> { + self.set_memory_high_internal(MaxValue::Max) + } + + fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> { self.memory() .context("failed to get memory subsystem")? .set_mem(cgroups_rs::memory::SetMemory { low: None, - high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)), + high: Some(value), min: None, max: None, }) - .context("failed to set memory.high") - } - - /// Set cgroup memory.high and memory.max. - pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> { - info!(limits.high, path = self.path(), "writing new memory limits",); - self.memory() - .context("failed to get memory subsystem while setting memory limits")? - .set_mem(cgroups_rs::memory::SetMemory { - min: None, - low: None, - high: Some(MaxValue::Value( - u64::min(limits.high, i64::MAX as u64) as i64 - )), - max: None, - }) - .context("failed to set memory limits") - } - - /// Given some amount of available memory, set the desired cgroup memory limits - pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> { - let new_high = self.config.calculate_memory_high_value(available_memory); - let limits = MemoryLimits { high: new_high }; - info!( - path = self.path(), - memory = ?limits, - "setting cgroup memory", - ); - self.set_limits(&limits) - .context("failed to set cgroup memory limits")?; - Ok(()) + .map_err(anyhow::Error::from) } /// Get memory.high threshold. - pub fn get_high_bytes(&self) -> anyhow::Result { + pub fn get_memory_high_bytes(&self) -> anyhow::Result { let high = self .memory() .context("failed to get memory subsystem while getting memory statistics")? diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 376017d784..09863c8936 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -16,7 +16,7 @@ use tokio::sync::mpsc; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced}; +use crate::cgroup::{CgroupWatcher, Sequenced}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; @@ -106,6 +106,51 @@ impl Runner { kill, }; + // If we have both the cgroup and file cache integrations enabled, it's possible for + // temporary failures to result in cgroup throttling (from memory.high), that in turn makes + // it near-impossible to connect to the file cache (because it times out). Unfortunately, + // we *do* still want to determine the file cache size before setting the cgroup's + // memory.high, so it's not as simple as just swapping the order. + // + // Instead, the resolution here is that on vm-monitor startup (note: happens on each + // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we + // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit + // of a hacky solution, but helps with reliability. + if let Some(name) = &args.cgroup { + // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state + // now, and then set limits later. + info!("initializing cgroup"); + + let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send) + .context("failed to create cgroup manager")?; + + info!("temporarily unsetting memory.high"); + + // Temporarily un-set cgroup memory.high; see above. + cgroup + .unset_memory_high() + .context("failed to unset memory.high")?; + + let cgroup = Arc::new(cgroup); + + let cgroup_clone = Arc::clone(&cgroup); + spawn_with_cancel( + token.clone(), + |_| error!("cgroup watcher terminated"), + async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await }, + ); + + state.cgroup = Some(cgroup); + } else { + // *NOTE*: We need to forget the sender so that its drop impl does not get ran. + // This allows us to poll it in `Monitor::run` regardless of whether we + // are managing a cgroup or not. If we don't forget it, all receives will + // immediately return an error because the sender is droped and it will + // claim all select! statements, effectively turning `Monitor::run` into + // `loop { fail to receive }`. + mem::forget(requesting_send); + } + let mut file_cache_reserved_bytes = 0; let mem = get_total_system_memory(); @@ -119,7 +164,7 @@ impl Runner { false => FileCacheConfig::default_in_memory(), }; - let mut file_cache = FileCacheState::new(connstr, config, token.clone()) + let mut file_cache = FileCacheState::new(connstr, config, token) .await .context("failed to create file cache")?; @@ -152,35 +197,15 @@ impl Runner { state.filecache = Some(file_cache); } - if let Some(name) = &args.cgroup { - let (mut cgroup, cgroup_event_stream) = - CgroupWatcher::new(name.clone(), requesting_send) - .context("failed to create cgroup manager")?; - + if let Some(cgroup) = &state.cgroup { let available = mem - file_cache_reserved_bytes; + let value = cgroup.config.calculate_memory_high_value(available); + + info!(value, "setting memory.high"); cgroup - .set_memory_limits(available) - .context("failed to set cgroup memory limits")?; - - let cgroup = Arc::new(cgroup); - - // Some might call this . . . cgroup v2 - let cgroup_clone = Arc::clone(&cgroup); - - spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move { - cgroup_clone.watch(notified_recv, cgroup_event_stream).await - }); - - state.cgroup = Some(cgroup); - } else { - // *NOTE*: We need to forget the sender so that its drop impl does not get ran. - // This allows us to poll it in `Monitor::run` regardless of whether we - // are managing a cgroup or not. If we don't forget it, all receives will - // immediately return an error because the sender is droped and it will - // claim all select! statements, effectively turning `Monitor::run` into - // `loop { fail to receive }`. - mem::forget(requesting_send); + .set_memory_high_bytes(value) + .context("failed to set cgroup memory.high")?; } Ok(state) @@ -257,14 +282,11 @@ impl Runner { new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory); } - let limits = MemoryLimits { - // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here - // since it is properly initialized in the previous cgroup if let block - high: new_cgroup_mem_high, - }; + // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here + // since it is properly initialized in the previous cgroup if let block cgroup - .set_limits(&limits) - .context("failed to set cgroup memory limits")?; + .set_memory_high_bytes(new_cgroup_mem_high) + .context("failed to set cgroup memory.high")?; let message = format!( "set cgroup memory.high to {} MiB, of new max {} MiB", @@ -327,12 +349,9 @@ impl Runner { name = cgroup.path(), "updating cgroup memory.high", ); - let limits = MemoryLimits { - high: new_cgroup_mem_high, - }; cgroup - .set_limits(&limits) - .context("failed to set file cache size")?; + .set_memory_high_bytes(new_cgroup_mem_high) + .context("failed to set cgroup memory.high")?; } Ok(()) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b6a2117f9c..d8a00b677b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; @@ -20,6 +21,7 @@ use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, context::{DownloadBehavior, RequestContext}, + deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, @@ -346,9 +348,22 @@ fn start_pageserver( } }; + // Top-level cancellation token for the process + let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); + // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // Set up deletion queue + let (deletion_queue, deletion_workers) = DeletionQueue::new( + remote_storage.clone(), + ControlPlaneClient::new(conf, &shutdown_pageserver), + conf, + ); + if let Some(deletion_workers) = deletion_workers { + deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); + } + // Up to this point no significant I/O has been done: this should have been fast. Record // duration prior to starting I/O intensive phase of startup. startup_checkpoint("initial", "Starting loading tenants"); @@ -379,13 +394,13 @@ fn start_pageserver( }; // Scan the local 'tenants/' directory and start loading the tenants - let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); - + let deletion_queue_client = deletion_queue.new_client(); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), + deletion_queue_client, }, order, shutdown_pageserver.clone(), @@ -481,9 +496,10 @@ fn start_pageserver( http::routes::State::new( conf, http_auth.clone(), - remote_storage, + remote_storage.clone(), broker_client.clone(), disk_usage_eviction_state, + deletion_queue.new_client(), ) .context("Failed to initialize router state")?, ); @@ -611,7 +627,12 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + )); unreachable!() } }) @@ -623,7 +644,7 @@ fn create_remote_storage_client( let config = if let Some(config) = &conf.remote_storage_config { config } else { - // No remote storage configured. + tracing::warn!("no remote storage configured, this is a deprecated configuration"); return Ok(None); }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8ee7f28c11..c3f2f14a74 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -11,6 +11,7 @@ use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; +use utils::logging::SecretString; use once_cell::sync::OnceCell; use reqwest::Url; @@ -207,6 +208,9 @@ pub struct PageServerConf { pub background_task_maximum_delay: Duration, pub control_plane_api: Option, + + /// JWT token for use with the control plane API. + pub control_plane_api_token: Option, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -283,6 +287,7 @@ struct PageServerConfigBuilder { background_task_maximum_delay: BuilderValue, control_plane_api: BuilderValue>, + control_plane_api_token: BuilderValue>, } impl Default for PageServerConfigBuilder { @@ -347,6 +352,7 @@ impl Default for PageServerConfigBuilder { .unwrap()), control_plane_api: Set(None), + control_plane_api_token: Set(None), } } } @@ -475,8 +481,8 @@ impl PageServerConfigBuilder { self.background_task_maximum_delay = BuilderValue::Set(delay); } - pub fn control_plane_api(&mut self, api: Url) { - self.control_plane_api = BuilderValue::Set(Some(api)) + pub fn control_plane_api(&mut self, api: Option) { + self.control_plane_api = BuilderValue::Set(api) } pub fn build(self) -> anyhow::Result { @@ -567,6 +573,9 @@ impl PageServerConfigBuilder { control_plane_api: self .control_plane_api .ok_or(anyhow!("missing control_plane_api"))?, + control_plane_api_token: self + .control_plane_api_token + .ok_or(anyhow!("missing control_plane_api_token"))?, }) } } @@ -580,6 +589,27 @@ impl PageServerConf { self.workdir.join(TENANTS_SEGMENT_NAME) } + pub fn deletion_prefix(&self) -> PathBuf { + self.workdir.join("deletion") + } + + pub fn deletion_list_path(&self, sequence: u64) -> PathBuf { + // Encode a version in the filename, so that if we ever switch away from JSON we can + // increment this. + const VERSION: u8 = 1; + + self.deletion_prefix() + .join(format!("{sequence:016x}-{VERSION:02x}.list")) + } + + pub fn deletion_header_path(&self) -> PathBuf { + // Encode a version in the filename, so that if we ever switch away from JSON we can + // increment this. + const VERSION: u8 = 1; + + self.deletion_prefix().join(format!("header-{VERSION:02x}")) + } + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenants_path().join(tenant_id.to_string()) } @@ -747,7 +777,14 @@ impl PageServerConf { }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?), + "control_plane_api" => { + let parsed = parse_toml_string(key, item)?; + if parsed.is_empty() { + builder.control_plane_api(None) + } else { + builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + } + }, _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -917,6 +954,7 @@ impl PageServerConf { ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, control_plane_api: None, + control_plane_api_token: None, } } } @@ -1140,7 +1178,8 @@ background_task_maximum_delay = '334 s' background_task_maximum_delay: humantime::parse_duration( defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY )?, - control_plane_api: None + control_plane_api: None, + control_plane_api_token: None }, "Correct defaults should be used when no config values are provided" ); @@ -1196,7 +1235,8 @@ background_task_maximum_delay = '334 s' test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::from_secs(334), - control_plane_api: None + control_plane_api: None, + control_plane_api_token: None }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 192eb16789..3375392373 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,7 +1,9 @@ use std::collections::HashMap; -use hyper::StatusCode; -use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; +use pageserver_api::control_api::{ + ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, +}; +use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; use utils::{ @@ -12,25 +14,34 @@ use utils::{ use crate::config::PageServerConf; -// Backoffs when control plane requests do not succeed: compromise between reducing load -// on control plane, and retrying frequently when we are blocked on a control plane -// response to make progress. -const BACKOFF_INCREMENT: f64 = 0.1; -const BACKOFF_MAX: f64 = 10.0; - /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) -pub(crate) struct ControlPlaneClient { +pub struct ControlPlaneClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, cancel: CancellationToken, } +/// Represent operations which internally retry on all errors other than +/// cancellation token firing: the only way they can fail is ShuttingDown. +pub enum RetryForeverError { + ShuttingDown, +} + +#[async_trait::async_trait] +pub trait ControlPlaneGenerationsApi { + async fn re_attach(&self) -> Result, RetryForeverError>; + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError>; +} + impl ControlPlaneClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. - pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { let mut url = match conf.control_plane_api.as_ref() { Some(u) => u.clone(), None => return None, @@ -42,39 +53,78 @@ impl ControlPlaneClient { segs.pop_if_empty().push(""); } - let client = reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"); + let mut client = reqwest::ClientBuilder::new(); + + if let Some(jwt) = &conf.control_plane_api_token { + let mut headers = hyper::HeaderMap::new(); + headers.insert("Authorization", jwt.get_contents().parse().unwrap()); + client = client.default_headers(headers); + } Some(Self { - http_client: client, + http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), }) } - async fn try_re_attach( + async fn retry_http_forever( &self, - url: Url, - request: &ReAttachRequest, - ) -> anyhow::Result { - match self.http_client.post(url).json(request).send().await { - Err(e) => Err(anyhow::Error::from(e)), - Ok(r) => { - if r.status() == StatusCode::OK { - r.json::() - .await - .map_err(anyhow::Error::from) - } else { - Err(anyhow::anyhow!("Unexpected status {}", r.status())) - } + url: &url::Url, + request: R, + ) -> Result + where + R: Serialize, + T: DeserializeOwned, + { + #[derive(thiserror::Error, Debug)] + enum RemoteAttemptError { + #[error("shutdown")] + Shutdown, + #[error("remote: {0}")] + Remote(reqwest::Error), + } + + match backoff::retry( + || async { + let response = self + .http_client + .post(url.clone()) + .json(&request) + .send() + .await + .map_err(RemoteAttemptError::Remote)?; + + response + .error_for_status_ref() + .map_err(RemoteAttemptError::Remote)?; + response + .json::() + .await + .map_err(RemoteAttemptError::Remote) + }, + |_| false, + 3, + u32::MAX, + "calling control plane generation validation API", + backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown), + ) + .await + { + Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown), + Err(RemoteAttemptError::Remote(_)) => { + panic!("We retry forever, this should never be reached"); } + Ok(r) => Ok(r), } } +} - /// Block until we get a successful response - pub(crate) async fn re_attach(&self) -> anyhow::Result> { +#[async_trait::async_trait] +impl ControlPlaneGenerationsApi for ControlPlaneClient { + /// Block until we get a successful response, or error out if we are shut down + async fn re_attach(&self) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -83,37 +133,47 @@ impl ControlPlaneClient { node_id: self.node_id, }; - let mut attempt = 0; - loop { - let result = self.try_re_attach(re_attach_path.clone(), &request).await; - match result { - Ok(res) => { - tracing::info!( - "Received re-attach response with {} tenants", - res.tenants.len() - ); + let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; + tracing::info!( + "Received re-attach response with {} tenants", + response.tenants.len() + ); - return Ok(res - .tenants - .into_iter() - .map(|t| (t.id, Generation::new(t.generation))) - .collect::>()); - } - Err(e) => { - tracing::error!("Error re-attaching tenants, retrying: {e:#}"); - backoff::exponential_backoff( - attempt, - BACKOFF_INCREMENT, - BACKOFF_MAX, - &self.cancel, - ) - .await; - if self.cancel.is_cancelled() { - return Err(anyhow::anyhow!("Shutting down")); - } - attempt += 1; - } - } - } + Ok(response + .tenants + .into_iter() + .map(|t| (t.id, Generation::new(t.generation))) + .collect::>()) + } + + /// Block until we get a successful response, or error out if we are shut down + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError> { + let re_attach_path = self + .base_url + .join("validate") + .expect("Failed to build validate path"); + + let request = ValidateRequest { + tenants: tenants + .into_iter() + .map(|(id, gen)| ValidateRequestTenant { + id, + gen: gen + .into() + .expect("Generation should always be valid for a Tenant doing deletions"), + }) + .collect(), + }; + + let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; + + Ok(response + .tenants + .into_iter() + .map(|rt| (rt.id, rt.valid)) + .collect()) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs new file mode 100644 index 0000000000..4c0d399789 --- /dev/null +++ b/pageserver/src/deletion_queue.rs @@ -0,0 +1,1312 @@ +mod deleter; +mod list_writer; +mod validator; + +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::remote_timeline_client::remote_timeline_path; +use crate::virtual_file::VirtualFile; +use anyhow::Context; +use hex::FromHex; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use serde::Deserialize; +use serde::Serialize; +use serde_with::serde_as; +use thiserror::Error; +use tokio; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use tracing::{self, debug, error}; +use utils::crashsafe::path_with_suffix_extension; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::AtomicLsn; +use utils::lsn::Lsn; + +use self::deleter::Deleter; +use self::list_writer::DeletionOp; +use self::list_writer::ListWriter; +use self::list_writer::RecoverOp; +use self::validator::Validator; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; +use validator::ValidatorQueueMessage; + +use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; + +// TODO: adminstrative "panic button" config property to disable all deletions +// TODO: configurable for how long to wait before executing deletions + +/// We aggregate object deletions from many tenants in one place, for several reasons: +/// - Coalesce deletions into fewer DeleteObjects calls +/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes +/// to flush any outstanding deletions. +/// - Globally control throughput of deletions, as these are a low priority task: do +/// not compete with the same S3 clients/connections used for higher priority uploads. +/// - Enable gating deletions on validation of a tenant's generation number, to make +/// it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md) +/// +/// There are two kinds of deletion: deferred and immediate. A deferred deletion +/// may be intentionally delayed to protect passive readers of S3 data, and is +/// subject to a generation number validation step. An immediate deletion is +/// ready to execute immediately, and is only queued up so that it can be coalesced +/// with other deletions in flight. +/// +/// Deferred deletions pass through three steps: +/// - ListWriter: accumulate deletion requests from Timelines, and batch them up into +/// DeletionLists, which are persisted to disk. +/// - Validator: accumulate deletion lists, and validate them en-masse prior to passing +/// the keys in the list onward for actual deletion. Also validate remote_consistent_lsn +/// updates for running timelines. +/// - Deleter: accumulate object keys that the validator has validated, and execute them in +/// batches of 1000 keys via DeleteObjects. +/// +/// Non-deferred deletions, such as during timeline deletion, bypass the first +/// two stages and are passed straight into the Deleter. +/// +/// Internally, each stage is joined by a channel to the next. On disk, there is only +/// one queue (of DeletionLists), which is written by the frontend and consumed +/// by the backend. +#[derive(Clone)] +pub struct DeletionQueue { + client: DeletionQueueClient, + + // Parent cancellation token for the tokens passed into background workers + cancel: CancellationToken, +} + +/// Opaque wrapper around individual worker tasks, to avoid making the +/// worker objects themselves public +pub struct DeletionQueueWorkers +where + C: ControlPlaneGenerationsApi + Send + Sync, +{ + frontend: ListWriter, + backend: Validator, + executor: Deleter, +} + +impl DeletionQueueWorkers +where + C: ControlPlaneGenerationsApi + Send + Sync + 'static, +{ + pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> { + let jh_frontend = runtime.spawn(async move { + self.frontend + .background() + .instrument(tracing::info_span!(parent:None, "deletion frontend")) + .await + }); + let jh_backend = runtime.spawn(async move { + self.backend + .background() + .instrument(tracing::info_span!(parent:None, "deletion backend")) + .await + }); + let jh_executor = runtime.spawn(async move { + self.executor + .background() + .instrument(tracing::info_span!(parent:None, "deletion executor")) + .await + }); + + runtime.spawn({ + async move { + jh_frontend.await.expect("error joining frontend worker"); + jh_backend.await.expect("error joining backend worker"); + drop(jh_executor.await.expect("error joining executor worker")); + } + }) + } +} + +/// A FlushOp is just a oneshot channel, where we send the transmit side down +/// another channel, and the receive side will receive a message when the channel +/// we're flushing has reached the FlushOp we sent into it. +/// +/// The only extra behavior beyond the channel is that the notify() method does not +/// return an error when the receive side has been dropped, because in this use case +/// it is harmless (the code that initiated the flush no longer cares about the result). +#[derive(Debug)] +struct FlushOp { + tx: tokio::sync::oneshot::Sender<()>, +} + +impl FlushOp { + fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) { + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + (Self { tx }, rx) + } + + fn notify(self) { + if self.tx.send(()).is_err() { + // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush. + debug!("deletion queue flush from dropped client"); + }; + } +} + +#[derive(Clone, Debug)] +pub struct DeletionQueueClient { + tx: tokio::sync::mpsc::Sender, + executor_tx: tokio::sync::mpsc::Sender, + + lsn_table: Arc>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct TenantDeletionList { + /// For each Timeline, a list of key fragments to append to the timeline remote path + /// when reconstructing a full key + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + timelines: HashMap>, + + /// The generation in which this deletion was emitted: note that this may not be the + /// same as the generation of any layers being deleted. The generation of the layer + /// has already been absorbed into the keys in `objects` + generation: Generation, +} + +impl TenantDeletionList { + pub(crate) fn len(&self) -> usize { + self.timelines.values().map(|v| v.len()).sum() + } +} + +/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string +fn to_hex_map(input: &HashMap, serializer: S) -> Result +where + S: serde::Serializer, + V: Serialize, + I: AsRef<[u8]>, +{ + let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone())); + + transformed + .collect::>() + .serialize(serializer) +} + +/// For HashMaps using a FromHex key, where we would like to decode the key +fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result, D::Error> +where + D: serde::de::Deserializer<'de>, + V: Deserialize<'de>, + I: FromHex + std::hash::Hash + Eq, +{ + let hex_map = HashMap::::deserialize(deserializer)?; + hex_map + .into_iter() + .map(|(k, v)| { + I::from_hex(k) + .map(|k| (k, v)) + .map_err(|_| serde::de::Error::custom("Invalid hex ID")) + }) + .collect() +} + +/// Files ending with this suffix will be ignored and erased +/// during recovery as startup. +const TEMP_SUFFIX: &str = ".tmp"; + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +struct DeletionList { + /// Serialization version, for future use + version: u8, + + /// Used for constructing a unique key for each deletion list we write out. + sequence: u64, + + /// To avoid repeating tenant/timeline IDs in every key, we store keys in + /// nested HashMaps by TenantTimelineID. Each Tenant only appears once + /// with one unique generation ID: if someone tries to push a second generation + /// ID for the same tenant, we will start a new DeletionList. + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + tenants: HashMap, + + /// Avoid having to walk `tenants` to calculate the number of keys in + /// the nested deletion lists + size: usize, + + /// Set to true when the list has undergone validation with the control + /// plane and the remaining contents of `tenants` are valid. A list may + /// also be implicitly marked valid by DeletionHeader.validated_sequence + /// advancing to >= DeletionList.sequence + #[serde(default)] + #[serde(skip_serializing_if = "std::ops::Not::not")] + validated: bool, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +struct DeletionHeader { + /// Serialization version, for future use + version: u8, + + /// The highest sequence number (inclusive) that has been validated. All deletion + /// lists on disk with a sequence <= this value are safe to execute. + validated_sequence: u64, +} + +impl DeletionHeader { + const VERSION_LATEST: u8 = 1; + + fn new(validated_sequence: u64) -> Self { + Self { + version: Self::VERSION_LATEST, + validated_sequence, + } + } + + async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { + debug!("Saving deletion list header {:?}", self); + let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; + let header_path = conf.deletion_header_path(); + let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); + VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + .await + .map_err(Into::into) + } +} + +impl DeletionList { + const VERSION_LATEST: u8 = 1; + fn new(sequence: u64) -> Self { + Self { + version: Self::VERSION_LATEST, + sequence, + tenants: HashMap::new(), + size: 0, + validated: false, + } + } + + fn is_empty(&self) -> bool { + self.tenants.is_empty() + } + + fn len(&self) -> usize { + self.size + } + + /// Returns true if the push was accepted, false if the caller must start a new + /// deletion list. + fn push( + &mut self, + tenant: &TenantId, + timeline: &TimelineId, + generation: Generation, + objects: &mut Vec, + ) -> bool { + if objects.is_empty() { + // Avoid inserting an empty TimelineDeletionList: this preserves the property + // that if we have no keys, then self.objects is empty (used in Self::is_empty) + return true; + } + + let tenant_entry = self + .tenants + .entry(*tenant) + .or_insert_with(|| TenantDeletionList { + timelines: HashMap::new(), + generation, + }); + + if tenant_entry.generation != generation { + // Only one generation per tenant per list: signal to + // caller to start a new list. + return false; + } + + let timeline_entry = tenant_entry + .timelines + .entry(*timeline) + .or_insert_with(Vec::new); + + let timeline_remote_path = remote_timeline_path(tenant, timeline); + + self.size += objects.len(); + timeline_entry.extend(objects.drain(..).map(|p| { + p.strip_prefix(&timeline_remote_path) + .expect("Timeline paths always start with the timeline prefix") + .to_string_lossy() + .to_string() + })); + true + } + + fn into_remote_paths(self) -> Vec { + let mut result = Vec::new(); + for (tenant, tenant_deletions) in self.tenants.into_iter() { + for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() { + let timeline_remote_path = remote_timeline_path(&tenant, &timeline); + result.extend( + timeline_layers + .into_iter() + .map(|l| timeline_remote_path.join(&PathBuf::from(l))), + ); + } + } + + result + } + + async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { + let path = conf.deletion_list_path(self.sequence); + let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); + + let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); + VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + .await + .map_err(Into::into) + } +} + +impl std::fmt::Display for DeletionList { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletionList", + self.sequence, + self.tenants.len(), + self.size + ) + } +} + +struct PendingLsn { + projected: Lsn, + result_slot: Arc, +} + +struct TenantLsnState { + timelines: HashMap, + + // In what generation was the most recent update proposed? + generation: Generation, +} + +#[derive(Default)] +struct VisibleLsnUpdates { + tenants: HashMap, +} + +impl VisibleLsnUpdates { + fn new() -> Self { + Self { + tenants: HashMap::new(), + } + } +} + +impl std::fmt::Debug for VisibleLsnUpdates { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len()) + } +} + +#[derive(Error, Debug)] +pub enum DeletionQueueError { + #[error("Deletion queue unavailable during shutdown")] + ShuttingDown, +} + +impl DeletionQueueClient { + pub(crate) fn broken() -> Self { + // Channels whose receivers are immediately dropped. + let (tx, _rx) = tokio::sync::mpsc::channel(1); + let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1); + Self { + tx, + executor_tx, + lsn_table: Arc::default(), + } + } + + /// This is cancel-safe. If you drop the future before it completes, the message + /// is not pushed, although in the context of the deletion queue it doesn't matter: once + /// we decide to do a deletion the decision is always final. + async fn do_push( + &self, + queue: &tokio::sync::mpsc::Sender, + msg: T, + ) -> Result<(), DeletionQueueError> { + match queue.send(msg).await { + Ok(_) => Ok(()), + Err(e) => { + // This shouldn't happen, we should shut down all tenants before + // we shut down the global delete queue. If we encounter a bug like this, + // we may leak objects as deletions won't be processed. + error!("Deletion queue closed while pushing, shutting down? ({e})"); + Err(DeletionQueueError::ShuttingDown) + } + } + } + + pub(crate) async fn recover( + &self, + attached_tenants: HashMap, + ) -> Result<(), DeletionQueueError> { + self.do_push( + &self.tx, + ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }), + ) + .await + } + + /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside + /// world, it must validate its generation number before doing so. Rather than do this synchronously, + /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most + /// recently validated separately. + /// + /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates. The + /// backend will later wake up and notice that the tenant's generation requires validation. + pub(crate) async fn update_remote_consistent_lsn( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + current_generation: Generation, + lsn: Lsn, + result_slot: Arc, + ) { + let mut locked = self + .lsn_table + .write() + .expect("Lock should never be poisoned"); + + let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState { + timelines: HashMap::new(), + generation: current_generation, + }); + + if tenant_entry.generation != current_generation { + // Generation might have changed if we were detached and then re-attached: in this case, + // state from the previous generation cannot be trusted. + tenant_entry.timelines.clear(); + tenant_entry.generation = current_generation; + } + + tenant_entry.timelines.insert( + timeline_id, + PendingLsn { + projected: lsn, + result_slot, + }, + ); + } + + /// Submit a list of layers for deletion: this function will return before the deletion is + /// persistent, but it may be executed at any time after this function enters: do not push + /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer + /// references them). + /// + /// The `current_generation` is the generation of this pageserver's current attachment. The + /// generations in `layers` are the generations in which those layers were written. + pub(crate) async fn push_layers( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + current_generation: Generation, + layers: Vec<(LayerFileName, Generation)>, + ) -> Result<(), DeletionQueueError> { + if current_generation.is_none() { + debug!("Enqueuing deletions in legacy mode, skipping queue"); + let mut layer_paths = Vec::new(); + for (layer, generation) in layers { + layer_paths.push(remote_layer_path( + &tenant_id, + &timeline_id, + &layer, + generation, + )); + } + self.push_immediate(layer_paths).await?; + return self.flush_immediate().await; + } + + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(layers.len() as u64); + self.do_push( + &self.tx, + ListWriterQueueMessage::Delete(DeletionOp { + tenant_id, + timeline_id, + layers, + generation: current_generation, + objects: Vec::new(), + }), + ) + .await + } + + /// This is cancel-safe. If you drop the future the flush may still happen in the background. + async fn do_flush( + &self, + queue: &tokio::sync::mpsc::Sender, + msg: T, + rx: tokio::sync::oneshot::Receiver<()>, + ) -> Result<(), DeletionQueueError> { + self.do_push(queue, msg).await?; + if rx.await.is_err() { + // This shouldn't happen if tenants are shut down before deletion queue. If we + // encounter a bug like this, then a flusher will incorrectly believe it has flushed + // when it hasn't, possibly leading to leaking objects. + error!("Deletion queue dropped flush op while client was still waiting"); + Err(DeletionQueueError::ShuttingDown) + } else { + Ok(()) + } + } + + /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList) + /// + /// This is cancel-safe. If you drop the future the flush may still happen in the background. + pub async fn flush(&self) -> Result<(), DeletionQueueError> { + let (flush_op, rx) = FlushOp::new(); + self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx) + .await + } + + // Wait until all previous deletions are executed + pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> { + debug!("flush_execute: flushing to deletion lists..."); + // Flush any buffered work to deletion lists + self.flush().await?; + + // Flush the backend into the executor of deletion lists + let (flush_op, rx) = FlushOp::new(); + debug!("flush_execute: flushing backend..."); + self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx) + .await?; + debug!("flush_execute: finished flushing backend..."); + + // Flush any immediate-mode deletions (the above backend flush will only flush + // the executor if deletions had flowed through the backend) + debug!("flush_execute: flushing execution..."); + let (flush_op, rx) = FlushOp::new(); + self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx) + .await?; + debug!("flush_execute: finished flushing execution..."); + Ok(()) + } + + /// This interface bypasses the persistent deletion queue, and any validation + /// that this pageserver is still elegible to execute the deletions. It is for + /// use in timeline deletions, where the control plane is telling us we may + /// delete everything in the timeline. + /// + /// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`. + pub(crate) async fn push_immediate( + &self, + objects: Vec, + ) -> Result<(), DeletionQueueError> { + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(objects.len() as u64); + self.executor_tx + .send(DeleterMessage::Delete(objects)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown) + } + + /// Companion to push_immediate. When this returns Ok, all prior objects sent + /// into push_immediate have been deleted from remote storage. + pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> { + let (flush_op, rx) = FlushOp::new(); + self.executor_tx + .send(DeleterMessage::Flush(flush_op)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + + rx.await.map_err(|_| DeletionQueueError::ShuttingDown) + } +} + +impl DeletionQueue { + pub fn new_client(&self) -> DeletionQueueClient { + self.client.clone() + } + + /// Caller may use the returned object to construct clients with new_client. + /// Caller should tokio::spawn the background() members of the two worker objects returned: + /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. + /// + /// If remote_storage is None, then the returned workers will also be None. + pub fn new( + remote_storage: Option, + control_plane_client: Option, + conf: &'static PageServerConf, + ) -> (Self, Option>) + where + C: ControlPlaneGenerationsApi + Send + Sync, + { + // Deep channel: it consumes deletions from all timelines and we do not want to block them + let (tx, rx) = tokio::sync::mpsc::channel(16384); + + // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions + let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16); + + // Shallow channel: it carries lists of paths, and we expect the main queueing to + // happen in the backend (persistent), not in this queue. + let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16); + + let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())); + + // The deletion queue has an independent cancellation token to + // the general pageserver shutdown token, because it stays alive a bit + // longer to flush after Tenants have all been torn down. + let cancel = CancellationToken::new(); + + let remote_storage = match remote_storage { + None => { + return ( + Self { + client: DeletionQueueClient { + tx, + executor_tx, + lsn_table: lsn_table.clone(), + }, + cancel, + }, + None, + ) + } + Some(r) => r, + }; + + ( + Self { + client: DeletionQueueClient { + tx, + executor_tx: executor_tx.clone(), + lsn_table: lsn_table.clone(), + }, + cancel: cancel.clone(), + }, + Some(DeletionQueueWorkers { + frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()), + backend: Validator::new( + conf, + backend_rx, + executor_tx, + control_plane_client, + lsn_table.clone(), + cancel.clone(), + ), + executor: Deleter::new(remote_storage, executor_rx, cancel.clone()), + }), + ) + } + + pub async fn shutdown(&mut self, timeout: Duration) { + self.cancel.cancel(); + + match tokio::time::timeout(timeout, self.client.flush()).await { + Ok(Ok(())) => { + tracing::info!("Deletion queue flushed successfully on shutdown") + } + Ok(Err(DeletionQueueError::ShuttingDown)) => { + // This is not harmful for correctness, but is unexpected: the deletion + // queue's workers should stay alive as long as there are any client handles instantiated. + tracing::warn!("Deletion queue stopped prematurely"); + } + Err(_timeout) => { + tracing::warn!("Timed out flushing deletion queue on shutdown") + } + } + } +} + +#[cfg(test)] +mod test { + use hex_literal::hex; + use std::{ + io::ErrorKind, + path::{Path, PathBuf}, + time::Duration, + }; + use tracing::info; + + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; + use tokio::task::JoinHandle; + + use crate::{ + control_plane_client::RetryForeverError, + repository::Key, + tenant::{ + harness::TenantHarness, remote_timeline_client::remote_timeline_path, + storage_layer::DeltaFileName, + }, + }; + + use super::*; + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + + pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), + lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), + }); + + // When you need a second layer in a test. + pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), + lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), + }); + + struct TestSetup { + harness: TenantHarness, + remote_fs_dir: PathBuf, + storage: GenericRemoteStorage, + mock_control_plane: MockControlPlane, + deletion_queue: DeletionQueue, + worker_join: JoinHandle<()>, + } + + impl TestSetup { + /// Simulate a pageserver restart by destroying and recreating the deletion queue + async fn restart(&mut self) { + let (deletion_queue, workers) = DeletionQueue::new( + Some(self.storage.clone()), + Some(self.mock_control_plane.clone()), + self.harness.conf, + ); + + tracing::debug!("Spawning worker for new queue queue"); + let worker_join = workers + .unwrap() + .spawn_with(&tokio::runtime::Handle::current()); + + let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join); + let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue); + + tracing::debug!("Joining worker from previous queue"); + old_deletion_queue.cancel.cancel(); + old_worker_join + .await + .expect("Failed to join workers for previous deletion queue"); + } + + fn set_latest_generation(&self, gen: Generation) { + let tenant_id = self.harness.tenant_id; + self.mock_control_plane + .latest_generation + .lock() + .unwrap() + .insert(tenant_id, gen); + } + + /// Returns remote layer file name, suitable for use in assert_remote_files + fn write_remote_layer( + &self, + file_name: LayerFileName, + gen: Generation, + ) -> anyhow::Result { + let tenant_id = self.harness.tenant_id; + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); + std::fs::create_dir_all(&remote_timeline_path)?; + let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + + let content: Vec = format!("placeholder contents of {file_name}").into(); + + std::fs::write( + remote_timeline_path.join(remote_layer_file_name.clone()), + content, + )?; + + Ok(remote_layer_file_name) + } + } + + #[derive(Debug, Clone)] + struct MockControlPlane { + pub latest_generation: std::sync::Arc>>, + } + + impl MockControlPlane { + fn new() -> Self { + Self { + latest_generation: Arc::default(), + } + } + } + + #[async_trait::async_trait] + impl ControlPlaneGenerationsApi for MockControlPlane { + #[allow(clippy::diverging_sub_expression)] // False positive via async_trait + async fn re_attach(&self) -> Result, RetryForeverError> { + unimplemented!() + } + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError> { + let mut result = HashMap::new(); + + let latest_generation = self.latest_generation.lock().unwrap(); + + for (tenant_id, generation) in tenants { + if let Some(latest) = latest_generation.get(&tenant_id) { + result.insert(tenant_id, *latest == generation); + } + } + + Ok(result) + } + } + + fn setup(test_name: &str) -> anyhow::Result { + let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); + let harness = TenantHarness::create(test_name)?; + + // We do not load() the harness: we only need its config and remote_storage + + // Set up a GenericRemoteStorage targetting a directory + let remote_fs_dir = harness.conf.workdir.join("remote_fs"); + std::fs::create_dir_all(remote_fs_dir)?; + let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; + let storage_config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + ) + .unwrap(), + max_sync_errors: std::num::NonZeroU32::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + ) + .unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + + let mock_control_plane = MockControlPlane::new(); + + let (deletion_queue, worker) = DeletionQueue::new( + Some(storage.clone()), + Some(mock_control_plane.clone()), + harness.conf, + ); + + let worker = worker.unwrap(); + let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); + + Ok(TestSetup { + harness, + remote_fs_dir, + storage, + mock_control_plane, + deletion_queue, + worker_join, + }) + } + + // TODO: put this in a common location so that we can share with remote_timeline_client's tests + fn assert_remote_files(expected: &[&str], remote_path: &Path) { + let mut expected: Vec = expected.iter().map(|x| String::from(*x)).collect(); + expected.sort(); + + let mut found: Vec = Vec::new(); + let dir = match std::fs::read_dir(remote_path) { + Ok(d) => d, + Err(e) => { + if e.kind() == ErrorKind::NotFound { + if expected.is_empty() { + // We are asserting prefix is empty: it is expected that the dir is missing + return; + } else { + assert_eq!(expected, Vec::::new()); + unreachable!(); + } + } else { + panic!( + "Unexpected error listing {}: {e}", + remote_path.to_string_lossy() + ); + } + } + }; + + for entry in dir.flatten() { + let entry_name = entry.file_name(); + let fname = entry_name.to_str().unwrap(); + found.push(String::from(fname)); + } + found.sort(); + + assert_eq!(expected, found); + } + + fn assert_local_files(expected: &[&str], directory: &Path) { + let dir = match std::fs::read_dir(directory) { + Ok(d) => d, + Err(_) => { + assert_eq!(expected, &Vec::::new()); + return; + } + }; + let mut found = Vec::new(); + for dentry in dir { + let dentry = dentry.unwrap(); + let file_name = dentry.file_name(); + let file_name_str = file_name.to_string_lossy(); + found.push(file_name_str.to_string()); + } + found.sort(); + assert_eq!(expected, found); + } + + #[tokio::test] + async fn deletion_queue_smoke() -> anyhow::Result<()> { + // Basic test that the deletion queue processes the deletions we pass into it + let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let tenant_id = ctx.harness.tenant_id; + + let content: Vec = "victim1 contents".into(); + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + let deletion_prefix = ctx.harness.conf.deletion_prefix(); + + // Exercise the distinction between the generation of the layers + // we delete, and the generation of the running Tenant. + let layer_generation = Generation::new(0xdeadbeef); + let now_generation = Generation::new(0xfeedbeef); + + let remote_layer_file_name_1 = + format!("{}{}", layer_file_name_1, layer_generation.get_suffix()); + + // Set mock control plane state to valid for our generation + ctx.set_latest_generation(now_generation); + + // Inject a victim file to remote storage + info!("Writing"); + std::fs::create_dir_all(&remote_timeline_path)?; + std::fs::write( + remote_timeline_path.join(remote_layer_file_name_1.clone()), + content, + )?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + + // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) + info!("Pushing"); + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_generation)].to_vec(), + ) + .await?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + + assert_local_files(&[], &deletion_prefix); + + // File should still be there after we write a deletion list (we haven't pushed enough to execute anything) + info!("Flushing"); + client.flush().await?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + assert_local_files(&["0000000000000001-01.list"], &deletion_prefix); + + // File should go away when we execute + info!("Flush-executing"); + client.flush_execute().await?; + assert_remote_files(&[], &remote_timeline_path); + assert_local_files(&["header-01"], &deletion_prefix); + + // Flushing on an empty queue should succeed immediately, and not write any lists + info!("Flush-executing on empty"); + client.flush_execute().await?; + assert_local_files(&["header-01"], &deletion_prefix); + + Ok(()) + } + + #[tokio::test] + async fn deletion_queue_validation() -> anyhow::Result<()> { + let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + // Generation that the control plane thinks is current + let latest_generation = Generation::new(0xdeadbeef); + // Generation that our DeletionQueue thinks the tenant is running with + let stale_generation = latest_generation.previous(); + // Generation that our example layer file was written with + let layer_generation = stale_generation.previous(); + + ctx.set_latest_generation(latest_generation); + + let tenant_id = ctx.harness.tenant_id; + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + + // Initial state: a remote layer exists + let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; + assert_remote_files(&[&remote_layer_name], &remote_timeline_path); + + tracing::debug!("Pushing..."); + client + .push_layers( + tenant_id, + TIMELINE_ID, + stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // We enqueued the operation in a stale generation: it should have failed validation + tracing::debug!("Flushing..."); + tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??; + assert_remote_files(&[&remote_layer_name], &remote_timeline_path); + + tracing::debug!("Pushing..."); + client + .push_layers( + tenant_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // We enqueued the operation in a fresh generation: it should have passed validation + tracing::debug!("Flushing..."); + tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??; + assert_remote_files(&[], &remote_timeline_path); + + Ok(()) + } + + #[tokio::test] + async fn deletion_queue_recovery() -> anyhow::Result<()> { + // Basic test that the deletion queue processes the deletions we pass into it + let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + let tenant_id = ctx.harness.tenant_id; + + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + let deletion_prefix = ctx.harness.conf.deletion_prefix(); + + let layer_generation = Generation::new(0xdeadbeef); + let now_generation = Generation::new(0xfeedbeef); + + // Inject a deletion in the generation before generation_now: after restart, + // this deletion should _not_ get executed (only the immediately previous + // generation gets that treatment) + let remote_layer_file_name_historical = + ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // Inject a deletion in the generation before generation_now: after restart, + // this deletion should get executed, because we execute deletions in the + // immediately previous generation on the same node. + let remote_layer_file_name_previous = + ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(), + ) + .await?; + + client.flush().await?; + assert_remote_files( + &[ + &remote_layer_file_name_historical, + &remote_layer_file_name_previous, + ], + &remote_timeline_path, + ); + + // Different generatinos for the same tenant will cause two separate + // deletion lists to be emitted. + assert_local_files( + &["0000000000000001-01.list", "0000000000000002-01.list"], + &deletion_prefix, + ); + + // Simulate a node restart: the latest generation advances + let now_generation = now_generation.next(); + ctx.set_latest_generation(now_generation); + + // Restart the deletion queue + drop(client); + ctx.restart().await; + let client = ctx.deletion_queue.new_client(); + client + .recover(HashMap::from([(tenant_id, now_generation)])) + .await?; + + info!("Flush-executing"); + client.flush_execute().await?; + // The deletion from immediately prior generation was executed, the one from + // an older generation was not. + assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path); + Ok(()) + } +} + +/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence +/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. +#[cfg(test)] +pub(crate) mod mock { + use tracing::info; + + use crate::tenant::remote_timeline_client::remote_layer_path; + + use super::*; + use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }; + + pub struct ConsumerState { + rx: tokio::sync::mpsc::Receiver, + executor_rx: tokio::sync::mpsc::Receiver, + } + + impl ConsumerState { + async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize { + let mut executed = 0; + + info!("Executing all pending deletions"); + + // Transform all executor messages to generic frontend messages + while let Ok(msg) = self.executor_rx.try_recv() { + match msg { + DeleterMessage::Delete(objects) => { + for path in objects { + match remote_storage.delete(&path).await { + Ok(_) => { + debug!("Deleted {path}"); + } + Err(e) => { + error!("Failed to delete {path}, leaking object! ({e})"); + } + } + executed += 1; + } + } + DeleterMessage::Flush(flush_op) => { + flush_op.notify(); + } + } + } + + while let Ok(msg) = self.rx.try_recv() { + match msg { + ListWriterQueueMessage::Delete(op) => { + let mut objects = op.objects; + for (layer, generation) in op.layers { + objects.push(remote_layer_path( + &op.tenant_id, + &op.timeline_id, + &layer, + generation, + )); + } + + for path in objects { + info!("Executing deletion {path}"); + match remote_storage.delete(&path).await { + Ok(_) => { + debug!("Deleted {path}"); + } + Err(e) => { + error!("Failed to delete {path}, leaking object! ({e})"); + } + } + executed += 1; + } + } + ListWriterQueueMessage::Flush(op) => { + op.notify(); + } + ListWriterQueueMessage::FlushExecute(op) => { + // We have already executed all prior deletions because mock does them inline + op.notify(); + } + ListWriterQueueMessage::Recover(_) => { + // no-op in mock + } + } + info!("All pending deletions have been executed"); + } + + executed + } + } + + pub struct MockDeletionQueue { + tx: tokio::sync::mpsc::Sender, + executor_tx: tokio::sync::mpsc::Sender, + executed: Arc, + remote_storage: Option, + consumer: std::sync::Mutex, + lsn_table: Arc>, + } + + impl MockDeletionQueue { + pub fn new(remote_storage: Option) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel(16384); + let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384); + + let executed = Arc::new(AtomicUsize::new(0)); + + Self { + tx, + executor_tx, + executed, + remote_storage, + consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), + } + } + + pub fn get_executed(&self) -> usize { + self.executed.load(Ordering::Relaxed) + } + + #[allow(clippy::await_holding_lock)] + pub async fn pump(&self) { + if let Some(remote_storage) = &self.remote_storage { + // Permit holding mutex across await, because this is only ever + // called once at a time in tests. + let mut locked = self.consumer.lock().unwrap(); + let count = locked.consume(remote_storage).await; + self.executed.fetch_add(count, Ordering::Relaxed); + } + } + + pub(crate) fn new_client(&self) -> DeletionQueueClient { + DeletionQueueClient { + tx: self.tx.clone(), + executor_tx: self.executor_tx.clone(), + lsn_table: self.lsn_table.clone(), + } + } + } +} diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs new file mode 100644 index 0000000000..5c6e7dc9d7 --- /dev/null +++ b/pageserver/src/deletion_queue/deleter.rs @@ -0,0 +1,156 @@ +//! The deleter is the final stage in the deletion queue. It accumulates remote +//! paths to delete, and periodically executes them in batches of up to 1000 +//! using the DeleteObjects request. +//! +//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller +//! number of full-sized DeleteObjects requests, rather than a larger number of +//! smaller requests. + +use remote_storage::GenericRemoteStorage; +use remote_storage::RemotePath; +use remote_storage::MAX_KEYS_PER_DELETE; +use std::time::Duration; +use tokio_util::sync::CancellationToken; +use tracing::info; +use tracing::warn; + +use crate::metrics; + +use super::DeletionQueueError; +use super::FlushOp; + +const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); + +pub(super) enum DeleterMessage { + Delete(Vec), + Flush(FlushOp), +} + +/// Non-persistent deletion queue, for coalescing multiple object deletes into +/// larger DeleteObjects requests. +pub(super) struct Deleter { + // Accumulate up to 1000 keys for the next deletion operation + accumulator: Vec, + + rx: tokio::sync::mpsc::Receiver, + + cancel: CancellationToken, + remote_storage: GenericRemoteStorage, +} + +impl Deleter { + pub(super) fn new( + remote_storage: GenericRemoteStorage, + rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, + ) -> Self { + Self { + remote_storage, + rx, + cancel, + accumulator: Vec::new(), + } + } + + /// Wrap the remote `delete_objects` with a failpoint + async fn remote_delete(&self) -> Result<(), anyhow::Error> { + fail::fail_point!("deletion-queue-before-execute", |_| { + info!("Skipping execution, failpoint set"); + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["failpoint"]) + .inc(); + Err(anyhow::anyhow!("failpoint hit")) + }); + + self.remote_storage.delete_objects(&self.accumulator).await + } + + /// Block until everything in accumulator has been executed + async fn flush(&mut self) -> Result<(), DeletionQueueError> { + while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { + match self.remote_delete().await { + Ok(()) => { + // Note: we assume that the remote storage layer returns Ok(()) if some + // or all of the deleted objects were already gone. + metrics::DELETION_QUEUE + .keys_executed + .inc_by(self.accumulator.len() as u64); + info!( + "Executed deletion batch {}..{}", + self.accumulator + .first() + .expect("accumulator should be non-empty"), + self.accumulator + .last() + .expect("accumulator should be non-empty"), + ); + self.accumulator.clear(); + } + Err(e) => { + warn!("DeleteObjects request failed: {e:#}, will retry"); + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["execute"]) + .inc(); + } + }; + } + if self.cancel.is_cancelled() { + // Expose an error because we may not have actually flushed everything + Err(DeletionQueueError::ShuttingDown) + } else { + Ok(()) + } + } + + pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> { + self.accumulator.reserve(MAX_KEYS_PER_DELETE); + + loop { + if self.cancel.is_cancelled() { + return Err(DeletionQueueError::ShuttingDown); + } + + let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { + Ok(Some(m)) => m, + Ok(None) => { + // All queue senders closed + info!("Shutting down"); + return Err(DeletionQueueError::ShuttingDown); + } + Err(_) => { + // Timeout, we hit deadline to execute whatever we have in hand. These functions will + // return immediately if no work is pending + self.flush().await?; + + continue; + } + }; + + match msg { + DeleterMessage::Delete(mut list) => { + while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE { + if self.accumulator.len() == MAX_KEYS_PER_DELETE { + self.flush().await?; + // If we have received this number of keys, proceed with attempting to execute + assert_eq!(self.accumulator.len(), 0); + } + + let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len(); + let take_count = std::cmp::min(available_slots, list.len()); + for path in list.drain(list.len() - take_count..) { + self.accumulator.push(path); + } + } + } + DeleterMessage::Flush(flush_op) => { + // If flush() errors, we drop the flush_op and the caller will get + // an error recv()'ing their oneshot channel. + self.flush().await?; + flush_op.notify(); + } + } + } + } +} diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs new file mode 100644 index 0000000000..618a59f8fe --- /dev/null +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -0,0 +1,487 @@ +//! The list writer is the first stage in the deletion queue. It accumulates +//! layers to delete, and periodically writes out these layers into a persistent +//! DeletionList. +//! +//! The purpose of writing DeletionLists is to decouple the decision to +//! delete an object from the validation required to execute it: even if +//! validation is not possible, e.g. due to a control plane outage, we can +//! still persist our intent to delete an object, in a way that would +//! survive a restart. +//! +//! DeletionLists are passed onwards to the Validator. + +use super::DeletionHeader; +use super::DeletionList; +use super::FlushOp; +use super::ValidatorQueueMessage; + +use std::collections::HashMap; +use std::fs::create_dir_all; +use std::time::Duration; + +use regex::Regex; +use remote_storage::RemotePath; +use tokio_util::sync::CancellationToken; +use tracing::debug; +use tracing::info; +use tracing::warn; +use utils::generation::Generation; +use utils::id::TenantId; +use utils::id::TimelineId; + +use crate::config::PageServerConf; +use crate::deletion_queue::TEMP_SUFFIX; +use crate::metrics; +use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::storage_layer::LayerFileName; + +// The number of keys in a DeletionList before we will proactively persist it +// (without reaching a flush deadline). This aims to deliver objects of the order +// of magnitude 1MB when we are under heavy delete load. +const DELETION_LIST_TARGET_SIZE: usize = 16384; + +// Ordinarily, we only flush to DeletionList periodically, to bound the window during +// which we might leak objects from not flushing a DeletionList after +// the objects are already unlinked from timeline metadata. +const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000); + +// If someone is waiting for a flush to DeletionList, only delay a little to accumulate +// more objects before doing the flush. +const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100); + +#[derive(Debug)] +pub(super) struct DeletionOp { + pub(super) tenant_id: TenantId, + pub(super) timeline_id: TimelineId, + // `layers` and `objects` are both just lists of objects. `layers` is used if you do not + // have a config object handy to project it to a remote key, and need the consuming worker + // to do it for you. + pub(super) layers: Vec<(LayerFileName, Generation)>, + pub(super) objects: Vec, + + /// The _current_ generation of the Tenant attachment in which we are enqueuing + /// this deletion. + pub(super) generation: Generation, +} + +#[derive(Debug)] +pub(super) struct RecoverOp { + pub(super) attached_tenants: HashMap, +} + +#[derive(Debug)] +pub(super) enum ListWriterQueueMessage { + Delete(DeletionOp), + // Wait until all prior deletions make it into a persistent DeletionList + Flush(FlushOp), + // Wait until all prior deletions have been executed (i.e. objects are actually deleted) + FlushExecute(FlushOp), + // Call once after re-attaching to control plane, to notify the deletion queue about + // latest attached generations & load any saved deletion lists from disk. + Recover(RecoverOp), +} + +pub(super) struct ListWriter { + conf: &'static PageServerConf, + + // Incoming frontend requests to delete some keys + rx: tokio::sync::mpsc::Receiver, + + // Outbound requests to the backend to execute deletion lists we have composed. + tx: tokio::sync::mpsc::Sender, + + // The list we are currently building, contains a buffer of keys to delete + // and our next sequence number + pending: DeletionList, + + // These FlushOps should notify the next time we flush + pending_flushes: Vec, + + // Worker loop is torn down when this fires. + cancel: CancellationToken, + + // Safety guard to do recovery exactly once + recovered: bool, +} + +impl ListWriter { + // Initially DeletionHeader.validated_sequence is zero. The place we start our + // sequence numbers must be higher than that. + const BASE_SEQUENCE: u64 = 1; + + pub(super) fn new( + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + cancel: CancellationToken, + ) -> Self { + Self { + pending: DeletionList::new(Self::BASE_SEQUENCE), + conf, + rx, + tx, + pending_flushes: Vec::new(), + cancel, + recovered: false, + } + } + + /// Try to flush `list` to persistent storage + /// + /// This does not return errors, because on failure to flush we do not lose + /// any state: flushing will be retried implicitly on the next deadline + async fn flush(&mut self) { + if self.pending.is_empty() { + for f in self.pending_flushes.drain(..) { + f.notify(); + } + return; + } + + match self.pending.save(self.conf).await { + Ok(_) => { + info!(sequence = self.pending.sequence, "Stored deletion list"); + + for f in self.pending_flushes.drain(..) { + f.notify(); + } + + // Take the list we've accumulated, replace it with a fresh list for the next sequence + let next_list = DeletionList::new(self.pending.sequence + 1); + let list = std::mem::replace(&mut self.pending, next_list); + + if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await { + // This is allowed to fail: it will only happen if the backend worker is shut down, + // so we can just drop this on the floor. + info!("Deletion list dropped, this is normal during shutdown ({e:#})"); + } + } + Err(e) => { + metrics::DELETION_QUEUE.unexpected_errors.inc(); + warn!( + sequence = self.pending.sequence, + "Failed to write deletion list, will retry later ({e:#})" + ); + } + } + } + + /// Load the header, to learn the sequence number up to which deletions + /// have been validated. We will apply validated=true to DeletionLists + /// <= this sequence when loading them. + /// + /// It is not an error for the header to not exist: we return None, and + /// the caller should act as if validated_sequence is 0 + async fn load_validated_sequence(&self) -> Result, anyhow::Error> { + let header_path = self.conf.deletion_header_path(); + match tokio::fs::read(&header_path).await { + Ok(header_bytes) => { + match serde_json::from_slice::(&header_bytes) { + Ok(h) => Ok(Some(h.validated_sequence)), + Err(e) => { + warn!( + "Failed to deserialize deletion header, ignoring {}: {e:#}", + header_path.display() + ); + // This should never happen unless we make a mistake with our serialization. + // Ignoring a deletion header is not consequential for correctnes because all deletions + // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up. + metrics::DELETION_QUEUE.unexpected_errors.inc(); + Ok(None) + } + } + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + debug!( + "Deletion header {} not found, first start?", + header_path.display() + ); + Ok(None) + } else { + Err(anyhow::anyhow!(e)) + } + } + } + } + + async fn recover( + &mut self, + attached_tenants: HashMap, + ) -> Result<(), anyhow::Error> { + debug!( + "recovering with {} attached tenants", + attached_tenants.len() + ); + + // Load the header + let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0); + + self.pending.sequence = validated_sequence + 1; + + let deletion_directory = self.conf.deletion_prefix(); + let mut dir = match tokio::fs::read_dir(&deletion_directory).await { + Ok(d) => d, + Err(e) => { + warn!( + "Failed to open deletion list directory {}: {e:#}", + deletion_directory.display(), + ); + + // Give up: if we can't read the deletion list directory, we probably can't + // write lists into it later, so the queue won't work. + return Err(e.into()); + } + }; + + let list_name_pattern = + Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); + + let header_path = self.conf.deletion_header_path(); + let mut seqs: Vec = Vec::new(); + while let Some(dentry) = dir.next_entry().await? { + let file_name = dentry.file_name(); + let dentry_str = file_name.to_string_lossy(); + + if Some(file_name.as_os_str()) == header_path.file_name() { + // Don't try and parse the header's name like a list + continue; + } + + if dentry_str.ends_with(TEMP_SUFFIX) { + info!("Cleaning up temporary file {dentry_str}"); + let absolute_path = deletion_directory.join(dentry.file_name()); + if let Err(e) = tokio::fs::remove_file(&absolute_path).await { + // Non-fatal error: we will just leave the file behind but not + // try and load it. + warn!( + "Failed to clean up temporary file {}: {e:#}", + absolute_path.display() + ); + } + + continue; + } + + let file_name = dentry.file_name().to_owned(); + let basename = file_name.to_string_lossy(); + let seq_part = if let Some(m) = list_name_pattern.captures(&basename) { + m.name("sequence") + .expect("Non optional group should be present") + .as_str() + } else { + warn!("Unexpected key in deletion queue: {basename}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + }; + + let seq: u64 = match u64::from_str_radix(seq_part, 16) { + Ok(s) => s, + Err(e) => { + warn!("Malformed key '{basename}': {e}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + } + }; + seqs.push(seq); + } + seqs.sort(); + + // Start our next deletion list from after the last location validated by + // previous process lifetime, or after the last location found (it is updated + // below after enumerating the deletion lists) + self.pending.sequence = validated_sequence + 1; + if let Some(max_list_seq) = seqs.last() { + self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1); + } + + for s in seqs { + let list_path = self.conf.deletion_list_path(s); + + let list_bytes = tokio::fs::read(&list_path).await?; + + let mut deletion_list = match serde_json::from_slice::(&list_bytes) { + Ok(l) => l, + Err(e) => { + // Drop the list on the floor: any objects it referenced will be left behind + // for scrubbing to clean up. This should never happen unless we have a serialization bug. + warn!(sequence = s, "Failed to deserialize deletion list: {e}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + } + }; + + if deletion_list.sequence <= validated_sequence { + // If the deletion list falls below valid_seq, we may assume that it was + // already validated the last time this pageserver ran. Otherwise, we still + // load it, as it may still contain content valid in this generation. + deletion_list.validated = true; + } else { + // Special case optimization: if a tenant is still attached, and no other + // generation was issued to another node in the interval while we restarted, + // then we may treat deletion lists from the previous generation as if they + // belong to our currently attached generation, and proceed to validate & execute. + for (tenant_id, tenant_list) in &mut deletion_list.tenants { + if let Some(attached_gen) = attached_tenants.get(tenant_id) { + if attached_gen.previous() == tenant_list.generation { + tenant_list.generation = *attached_gen; + } + } + } + } + + info!( + validated = deletion_list.validated, + sequence = deletion_list.sequence, + "Recovered deletion list" + ); + + // We will drop out of recovery if this fails: it indicates that we are shutting down + // or the backend has panicked + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(deletion_list.len() as u64); + self.tx + .send(ValidatorQueueMessage::Delete(deletion_list)) + .await?; + } + + info!(next_sequence = self.pending.sequence, "Replay complete"); + + Ok(()) + } + + /// This is the front-end ingest, where we bundle up deletion requests into DeletionList + /// and write them out, for later validation by the backend and execution by the executor. + pub(super) async fn background(&mut self) { + info!("Started deletion frontend worker"); + + // Synchronous, but we only do it once per process lifetime so it's tolerable + if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) { + tracing::error!( + "Failed to create deletion list directory {}, deletions will not be executed ({e})", + self.conf.deletion_prefix().display() + ); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + return; + } + + while !self.cancel.is_cancelled() { + let timeout = if self.pending_flushes.is_empty() { + FRONTEND_DEFAULT_TIMEOUT + } else { + FRONTEND_FLUSHING_TIMEOUT + }; + + let msg = match tokio::time::timeout(timeout, self.rx.recv()).await { + Ok(Some(msg)) => msg, + Ok(None) => { + // Queue sender destroyed, shutting down + break; + } + Err(_) => { + // Hit deadline, flush. + self.flush().await; + continue; + } + }; + + match msg { + ListWriterQueueMessage::Delete(op) => { + assert!( + self.recovered, + "Cannot process deletions before recovery. This is a bug." + ); + + debug!( + "Delete: ingesting {} layers, {} other objects", + op.layers.len(), + op.objects.len() + ); + + let mut layer_paths = Vec::new(); + for (layer, generation) in op.layers { + layer_paths.push(remote_layer_path( + &op.tenant_id, + &op.timeline_id, + &layer, + generation, + )); + } + layer_paths.extend(op.objects); + + if !self.pending.push( + &op.tenant_id, + &op.timeline_id, + op.generation, + &mut layer_paths, + ) { + self.flush().await; + let retry_succeeded = self.pending.push( + &op.tenant_id, + &op.timeline_id, + op.generation, + &mut layer_paths, + ); + if !retry_succeeded { + // Unexpected: after we flush, we should have + // drained self.pending, so a conflict on + // generation numbers should be impossible. + tracing::error!( + "Failed to enqueue deletions, leaking objects. This is a bug." + ); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + } + } + } + ListWriterQueueMessage::Flush(op) => { + if self.pending.is_empty() { + // Execute immediately + debug!("Flush: No pending objects, flushing immediately"); + op.notify() + } else { + // Execute next time we flush + debug!("Flush: adding to pending flush list for next deadline flush"); + self.pending_flushes.push(op); + } + } + ListWriterQueueMessage::FlushExecute(op) => { + debug!("FlushExecute: passing through to backend"); + // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute + if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await { + info!("Can't flush, shutting down ({e})"); + // Caller will get error when their oneshot sender was dropped. + } + } + ListWriterQueueMessage::Recover(op) => { + if self.recovered { + tracing::error!( + "Deletion queue recovery called more than once. This is a bug." + ); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + // Non-fatal: although this is a bug, since we did recovery at least once we may proceed. + continue; + } + + if let Err(e) = self.recover(op.attached_tenants).await { + // This should only happen in truly unrecoverable cases, like the recovery finding that the backend + // queue receiver has been dropped, or something is critically broken with + // the local filesystem holding deletion lists. + info!( + "Deletion queue recover aborted, deletion queue will not proceed ({e})" + ); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + return; + } else { + self.recovered = true; + } + } + } + + if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() { + self.flush().await; + } + } + info!("Deletion queue shut down."); + } +} diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs new file mode 100644 index 0000000000..64603045d2 --- /dev/null +++ b/pageserver/src/deletion_queue/validator.rs @@ -0,0 +1,414 @@ +//! The validator is responsible for validating DeletionLists for execution, +//! based on whethe the generation in the DeletionList is still the latest +//! generation for a tenant. +//! +//! The purpose of validation is to ensure split-brain safety in the cluster +//! of pageservers: a deletion may only be executed if the tenant generation +//! that originated it is still current. See docs/rfcs/025-generation-numbers.md +//! The purpose of accumulating lists before validating them is to reduce load +//! on the control plane API by issuing fewer, larger requests. +//! +//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn +//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn +//! to decide when old +//! +//! Deletions are passed onward to the Deleter. + +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use tokio_util::sync::CancellationToken; +use tracing::debug; +use tracing::info; +use tracing::warn; + +use crate::config::PageServerConf; +use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::control_plane_client::RetryForeverError; +use crate::metrics; + +use super::deleter::DeleterMessage; +use super::DeletionHeader; +use super::DeletionList; +use super::DeletionQueueError; +use super::FlushOp; +use super::VisibleLsnUpdates; + +// After this length of time, do any validation work that is pending, +// even if we haven't accumulated many keys to delete. +// +// This also causes updates to remote_consistent_lsn to be validated, even +// if there were no deletions enqueued. +const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); + +// If we have received this number of keys, proceed with attempting to execute +const AUTOFLUSH_KEY_COUNT: usize = 16384; + +#[derive(Debug)] +pub(super) enum ValidatorQueueMessage { + Delete(DeletionList), + Flush(FlushOp), +} +pub(super) struct Validator +where + C: ControlPlaneGenerationsApi, +{ + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + + // Client for calling into control plane API for validation of deletes + control_plane_client: Option, + + // DeletionLists which are waiting generation validation. Not safe to + // execute until [`validate`] has processed them. + pending_lists: Vec, + + // DeletionLists which have passed validation and are ready to execute. + validated_lists: Vec, + + // Sum of all the lengths of lists in pending_lists + pending_key_count: usize, + + // Lsn validation state: we read projected LSNs and write back visible LSNs + // after validation. This is the LSN equivalent of `pending_validation_lists`: + // it is drained in [`validate`] + lsn_table: Arc>, + + // If we failed to rewrite a deletion list due to local filesystem I/O failure, + // we must remember that and refuse to advance our persistent validated sequence + // number past the failure. + list_write_failed: Option, + + cancel: CancellationToken, +} + +impl Validator +where + C: ControlPlaneGenerationsApi, +{ + pub(super) fn new( + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + control_plane_client: Option, + lsn_table: Arc>, + cancel: CancellationToken, + ) -> Self { + Self { + conf, + rx, + tx, + control_plane_client, + lsn_table, + pending_lists: Vec::new(), + validated_lists: Vec::new(), + pending_key_count: 0, + list_write_failed: None, + cancel, + } + } + /// Process any outstanding validations of generations of pending LSN updates or pending + /// DeletionLists. + /// + /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists + /// go into the queue of ready-to-execute lists. + async fn validate(&mut self) -> Result<(), DeletionQueueError> { + let mut tenant_generations = HashMap::new(); + for list in &self.pending_lists { + for (tenant_id, tenant_list) in &list.tenants { + // Note: DeletionLists are in logical time order, so generation always + // goes up. By doing a simple insert() we will always end up with + // the latest generation seen for a tenant. + tenant_generations.insert(*tenant_id, tenant_list.generation); + } + } + + let pending_lsn_updates = { + let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned"); + std::mem::take(&mut *lsn_table) + }; + for (tenant_id, update) in &pending_lsn_updates.tenants { + let entry = tenant_generations + .entry(*tenant_id) + .or_insert(update.generation); + if update.generation > *entry { + *entry = update.generation; + } + } + + if tenant_generations.is_empty() { + // No work to do + return Ok(()); + } + + let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client { + match control_plane_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); + } + } + } else { + // Control plane API disabled. In legacy mode we consider everything valid. + tenant_generations.keys().map(|k| (*k, true)).collect() + }; + + let mut validated_sequence: Option = None; + + // Apply the validation results to the pending LSN updates + for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants { + let validated_generation = tenant_generations + .get(&tenant_id) + .expect("Map was built from the same keys we're reading"); + + let valid = tenants_valid + .get(&tenant_id) + .copied() + // If the tenant was missing from the validation response, it has been deleted. + // The Timeline that requested the LSN update is probably already torn down, + // or will be torn down soon. In this case, drop the update by setting valid=false. + .unwrap_or(false); + + if valid && *validated_generation == tenant_lsn_state.generation { + for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines { + pending_lsn.result_slot.store(pending_lsn.projected); + } + } else { + // If we failed validation, then do not apply any of the projected updates + warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); + } + } + + // Apply the validation results to the pending deletion lists + for list in &mut self.pending_lists { + // Filter the list based on whether the server responded valid: true. + // If a tenant is omitted in the response, it has been deleted, and we should + // proceed with deletion. + let mut mutated = false; + list.tenants.retain(|tenant_id, tenant| { + let validated_generation = tenant_generations + .get(tenant_id) + .expect("Map was built from the same keys we're reading"); + + // If the tenant was missing from the validation response, it has been deleted. + // This means that a deletion is valid, but also redundant since the tenant's + // objects should have already been deleted. Treat it as invalid to drop the + // redundant deletion. + let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false); + + // A list is valid if it comes from the current _or previous_ generation. + // - The previous generation case is permitted due to how we store deletion lists locally: + // if we see the immediately previous generation in a locally stored deletion list, + // it proves that this node's disk was used for both current & previous generations, + // and therefore no other node was involved in between: the two generations may be + // logically treated as the same. + // - In that previous generation case, we rewrote it to the current generation + // in recover(), so the comparison here is simply an equality. + + let this_list_valid = valid + && (tenant.generation == *validated_generation); + + if !this_list_valid { + warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); + mutated = true; + } + this_list_valid + }); + list.validated = true; + + if mutated { + // Save the deletion list if we had to make changes due to stale generations. The + // saved list is valid for execution. + if let Err(e) = list.save(self.conf).await { + // Highly unexpected. Could happen if e.g. disk full. + // If we didn't save the trimmed list, it is _not_ valid to execute. + warn!("Failed to save modified deletion list {list}: {e:#}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + + // Rather than have a complex retry process, just drop it and leak the objects, + // scrubber will clean up eventually. + list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution. + + // We must remember this failure, to prevent later writing out a header that + // would imply the unwritable list was valid on disk. + if self.list_write_failed.is_none() { + self.list_write_failed = Some(list.sequence); + } + } + } + + validated_sequence = Some(list.sequence); + } + + if let Some(validated_sequence) = validated_sequence { + if let Some(list_write_failed) = self.list_write_failed { + // Rare error case: we failed to write out a deletion list to excise invalid + // entries, so we cannot advance the header's valid sequence number past that point. + // + // In this state we will continue to validate, execute and delete deletion lists, + // we just cannot update the header. It should be noticed and fixed by a human due to + // the nonzero value of our unexpected_errors metric. + warn!( + sequence_number = list_write_failed, + "Cannot write header because writing a deletion list failed earlier", + ); + } else { + // Write the queue header to record how far validation progressed. This avoids having + // to rewrite each DeletionList to set validated=true in it. + let header = DeletionHeader::new(validated_sequence); + + // Drop result because the validated_sequence is an optimization. If we fail to save it, + // then restart, we will drop some deletion lists, creating work for scrubber. + // The save() function logs a warning on error. + if let Err(e) = header.save(self.conf).await { + warn!("Failed to write deletion queue header: {e:#}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + } + } + } + + // Transfer the validated lists to the validated queue, for eventual execution + self.validated_lists.append(&mut self.pending_lists); + + Ok(()) + } + + async fn cleanup_lists(&mut self, list_paths: Vec) { + for list_path in list_paths { + debug!("Removing deletion list {}", list_path.display()); + + if let Err(e) = tokio::fs::remove_file(&list_path).await { + // Unexpected: we should have permissions and nothing else should + // be touching these files. We will leave the file behind. Subsequent + // pageservers will try and load it again: hopefully whatever storage + // issue (probably permissions) has been fixed by then. + tracing::error!("Failed to delete {}: {e:#}", list_path.display()); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + break; + } + } + } + + async fn flush(&mut self) -> Result<(), DeletionQueueError> { + tracing::debug!("Flushing with {} pending lists", self.pending_lists.len()); + + // Issue any required generation validation calls to the control plane + self.validate().await?; + + // After successful validation, nothing is pending: any lists that + // made it through validation will be in validated_lists. + assert!(self.pending_lists.is_empty()); + self.pending_key_count = 0; + + tracing::debug!( + "Validation complete, have {} validated lists", + self.validated_lists.len() + ); + + // Return quickly if we have no validated lists to execute. This avoids flushing the + // executor when an idle backend hits its autoflush interval + if self.validated_lists.is_empty() { + return Ok(()); + } + + // Drain `validated_lists` into the executor + let mut executing_lists = Vec::new(); + for list in self.validated_lists.drain(..) { + let list_path = self.conf.deletion_list_path(list.sequence); + let objects = list.into_remote_paths(); + self.tx + .send(DeleterMessage::Delete(objects)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + executing_lists.push(list_path); + } + + self.flush_executor().await?; + + // Erase the deletion lists whose keys have all be deleted from remote storage + self.cleanup_lists(executing_lists).await; + + Ok(()) + } + + async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> { + // Flush the executor, so that all the keys referenced by these deletion lists + // are actually removed from remote storage. This is a precondition to deleting + // the deletion lists themselves. + let (flush_op, rx) = FlushOp::new(); + self.tx + .send(DeleterMessage::Flush(flush_op)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + + rx.await.map_err(|_| DeletionQueueError::ShuttingDown) + } + + pub(super) async fn background(&mut self) { + tracing::info!("Started deletion backend worker"); + + while !self.cancel.is_cancelled() { + let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { + Ok(Some(m)) => m, + Ok(None) => { + // All queue senders closed + info!("Shutting down"); + break; + } + Err(_) => { + // Timeout, we hit deadline to execute whatever we have in hand. These functions will + // return immediately if no work is pending. + match self.flush().await { + Ok(()) => {} + Err(DeletionQueueError::ShuttingDown) => { + // If we are shutting down, then auto-flush can safely be skipped + } + } + + continue; + } + }; + + match msg { + ValidatorQueueMessage::Delete(list) => { + if list.validated { + // A pre-validated list may only be seen during recovery, if we are recovering + // a DeletionList whose on-disk state has validated=true + self.validated_lists.push(list) + } else { + self.pending_key_count += list.len(); + self.pending_lists.push(list); + } + + if self.pending_key_count > AUTOFLUSH_KEY_COUNT { + match self.flush().await { + Ok(()) => {} + Err(DeletionQueueError::ShuttingDown) => { + // If we are shutting down, then auto-flush can safely be skipped + } + } + } + } + ValidatorQueueMessage::Flush(op) => { + match self.flush().await { + Ok(()) => { + op.notify(); + } + Err(DeletionQueueError::ShuttingDown) => { + // If we fail due to shutting down, we will just drop `op` to propagate that status. + } + } + } + } + } + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4988641d6a..f5c1224f01 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1093,6 +1093,9 @@ components: remote_consistent_lsn: type: string format: hex + remote_consistent_lsn_visible: + type: string + format: hex ancestor_timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a8e914ba08..e61a9dcf3f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use anyhow::{anyhow, Context, Result}; +use futures::TryFutureExt; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; @@ -24,6 +25,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; @@ -34,7 +36,7 @@ use crate::tenant::mgr::{ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::timeline::Timeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ @@ -61,6 +63,7 @@ pub struct State { remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, + deletion_queue_client: DeletionQueueClient, } impl State { @@ -70,6 +73,7 @@ impl State { remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, + deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -82,8 +86,17 @@ impl State { remote_storage, broker_client, disk_usage_eviction_state, + deletion_queue_client, }) } + + fn tenant_resources(&self) -> TenantSharedResources { + TenantSharedResources { + broker_client: self.broker_client.clone(), + remote_storage: self.remote_storage.clone(), + deletion_queue_client: self.deletion_queue_client.clone(), + } + } } #[inline(always)] @@ -283,7 +296,12 @@ async fn build_timeline_info_common( }; let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); - let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let remote_consistent_lsn_projected = timeline + .get_remote_consistent_lsn_projected() + .unwrap_or(Lsn(0)); + let remote_consistent_lsn_visible = timeline + .get_remote_consistent_lsn_visible() + .unwrap_or(Lsn(0)); let walreceiver_status = timeline.walreceiver_status(); @@ -293,7 +311,8 @@ async fn build_timeline_info_common( ancestor_timeline_id, ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - remote_consistent_lsn, + remote_consistent_lsn: remote_consistent_lsn_projected, + remote_consistent_lsn_visible, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), @@ -492,24 +511,23 @@ async fn tenant_attach_handler( let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if let Some(remote_storage) = &state.remote_storage { - mgr::attach_tenant( - state.conf, - tenant_id, - generation, - tenant_conf, - state.broker_client.clone(), - remote_storage.clone(), - &ctx, - ) - .instrument(info_span!("tenant_attach", %tenant_id)) - .await?; - } else { + if state.remote_storage.is_none() { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" ))); } + mgr::attach_tenant( + state.conf, + tenant_id, + generation, + tenant_conf, + state.tenant_resources(), + &ctx, + ) + .instrument(info_span!("tenant_attach", %tenant_id)) + .await?; + json_response(StatusCode::ACCEPTED, ()) } @@ -570,6 +588,7 @@ async fn tenant_load_handler( generation, state.broker_client.clone(), state.remote_storage.clone(), + state.deletion_queue_client.clone(), &ctx, ) .instrument(info_span!("load", %tenant_id)) @@ -911,8 +930,7 @@ async fn tenant_create_handler( tenant_conf, target_tenant_id, generation, - state.broker_client.clone(), - state.remote_storage.clone(), + state.tenant_resources(), &ctx, ) .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id)) @@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn deletion_queue_flush( + r: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&r); + + if state.remote_storage.is_none() { + // Nothing to do if remote storage is disabled. + return json_response(StatusCode::OK, ()); + } + + let execute = parse_query_param(&r, "execute")?.unwrap_or(false); + + let flush = async { + if execute { + state.deletion_queue_client.flush_execute().await + } else { + state.deletion_queue_client.flush().await + } + } + // DeletionQueueError's only case is shutting down. + .map_err(|_| ApiError::ShuttingDown); + + tokio::select! { + res = flush => { + res.map(|()| json_response(StatusCode::OK, ()))? + } + _ = cancel.cancelled() => { + Err(ApiError::ShuttingDown) + } + } +} + async fn active_timeline_of_active_tenant( tenant_id: TenantId, timeline_id: TimelineId, @@ -1463,6 +1514,9 @@ pub fn make_router( .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) + .put("/v1/deletion_queue/flush", |r| { + api_handler(r, deletion_queue_flush) + }) .put("/v1/tenant/:tenant_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3049ad6a4e..e370e063ba 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,7 +3,8 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; -mod control_plane_client; +pub mod control_plane_client; +pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; @@ -27,6 +28,7 @@ pub mod failpoint_support; use std::path::Path; use crate::task_mgr::TaskKind; +use deletion_queue::DeletionQueue; use tracing::info; /// Current storage format version @@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; -#[tracing::instrument] -pub async fn shutdown_pageserver(exit_code: i32) { +#[tracing::instrument(skip_all, fields(%exit_code))] +pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) { ) .await; + // Best effort to persist any outstanding deletions, to avoid leaking objects + if let Some(mut deletion_queue) = deletion_queue { + deletion_queue.shutdown(Duration::from_secs(5)).await; + } + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 98dee095a3..f85f525630 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -264,6 +264,46 @@ pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheS }, }); +pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_cache_acquire_pinned_slot_seconds", + "Time spent acquiring a pinned slot in the page cache", + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + +pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_page_cache_find_victim_iters_total", + "Counter for the number of iterations in the find_victim loop", + ) + .expect("failed to define a metric") +}); + +static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "page_cache_errors_total", + "Number of timeouts while acquiring a pinned slot in the page cache", + &["error_kind"] + ) + .expect("failed to define a metric") +}); + +#[derive(IntoStaticStr)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum PageCacheErrorKind { + AcquirePinnedSlotTimeout, + EvictIterLimit, +} + +pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) { + PAGE_CACHE_ERRORS + .get_metric_with_label_values(&[error_kind.into()]) + .unwrap() + .inc(); +} + pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wait_lsn_seconds", @@ -291,6 +331,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_resident_physical_size_global", + "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions." + ) + .expect("failed to define a metric") +}); + static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", @@ -301,6 +349,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_remote_physical_size_global", + "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions." + ) + .expect("failed to define a metric") +}); + pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_layers_total", @@ -887,6 +943,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy .expect("failed to define a metric") }); +pub(crate) struct DeletionQueueMetrics { + pub(crate) keys_submitted: IntCounter, + pub(crate) keys_dropped: IntCounter, + pub(crate) keys_executed: IntCounter, + pub(crate) dropped_lsn_updates: IntCounter, + pub(crate) unexpected_errors: IntCounter, + pub(crate) remote_errors: IntCounterVec, +} +pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { + DeletionQueueMetrics{ + + keys_submitted: register_int_counter!( + "pageserver_deletion_queue_submitted_total", + "Number of objects submitted for deletion" + ) + .expect("failed to define a metric"), + + keys_dropped: register_int_counter!( + "pageserver_deletion_queue_dropped_total", + "Number of object deletions dropped due to stale generation." + ) + .expect("failed to define a metric"), + + keys_executed: register_int_counter!( + "pageserver_deletion_queue_executed_total", + "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed." + ) + .expect("failed to define a metric"), + + dropped_lsn_updates: register_int_counter!( + "pageserver_deletion_queue_dropped_lsn_updates_total", + "Updates to remote_consistent_lsn dropped due to stale generation number." + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_deletion_queue_unexpected_errors_total", + "Number of unexpected condiions that may stall the queue: any value above zero is unexpected." + ) + .expect("failed to define a metric"), + remote_errors: register_int_counter_vec!( + "pageserver_deletion_queue_remote_errors_total", + "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects", + &["op_kind"], + ) + .expect("failed to define a metric") +} +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -1161,7 +1265,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - pub resident_physical_size_gauge: UIntGauge, + resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -1239,10 +1343,29 @@ impl TimelineMetrics { } pub fn record_new_file_metrics(&self, sz: u64) { - self.resident_physical_size_gauge.add(sz); + self.resident_physical_size_add(sz); self.num_persistent_files_created.inc_by(1); self.persistent_bytes_written.inc_by(sz); } + + pub fn resident_physical_size_sub(&self, sz: u64) { + self.resident_physical_size_gauge.sub(sz); + crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz); + } + + pub fn resident_physical_size_add(&self, sz: u64) { + self.resident_physical_size_gauge.add(sz); + crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz); + } + + pub fn resident_physical_size_set(&self, sz: u64) { + self.resident_physical_size_gauge.set(sz); + crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz); + } + + pub fn resident_physical_size_get(&self) -> u64 { + self.resident_physical_size_gauge.get() + } } impl Drop for TimelineMetrics { @@ -1250,7 +1373,10 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); - let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + { + RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + } let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); @@ -1304,10 +1430,43 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +/// Maintain a per timeline gauge in addition to the global gauge. +struct PerTimelineRemotePhysicalSizeGauge { + last_set: u64, + gauge: UIntGauge, +} + +impl PerTimelineRemotePhysicalSizeGauge { + fn new(per_timeline_gauge: UIntGauge) -> Self { + Self { + last_set: per_timeline_gauge.get(), + gauge: per_timeline_gauge, + } + } + fn set(&mut self, sz: u64) { + self.gauge.set(sz); + if sz < self.last_set { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + } else { + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + }; + self.last_set = sz; + } + fn get(&self) -> u64 { + self.gauge.get() + } +} + +impl Drop for PerTimelineRemotePhysicalSizeGauge { + fn drop(&mut self) { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + } +} + pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, + remote_physical_size_gauge: Mutex>, calls_unfinished_gauge: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, @@ -1325,18 +1484,24 @@ impl RemoteTimelineClientMetrics { } } - pub fn remote_physical_size_gauge(&self) -> UIntGauge { + pub(crate) fn remote_physical_size_set(&self, sz: u64) { let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - guard - .get_or_insert_with(|| { + let gauge = guard.get_or_insert_with(|| { + PerTimelineRemotePhysicalSizeGauge::new( REMOTE_PHYSICAL_SIZE .get_metric_with_label_values(&[ &self.tenant_id.to_string(), &self.timeline_id.to_string(), ]) - .unwrap() - }) - .clone() + .unwrap(), + ) + }); + gauge.set(sz); + } + + pub(crate) fn remote_physical_size_get(&self) -> u64 { + let guard = self.remote_physical_size_gauge.lock().unwrap(); + guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) } pub fn remote_operation_time( @@ -1675,6 +1840,9 @@ pub fn preinitialize_metrics() { Lazy::force(c); }); + // Deletion queue stats + Lazy::force(&DELETION_QUEUE); + // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] .into_iter() diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 38b169ea85..97ca2bfea7 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -75,7 +75,11 @@ use std::{ collections::{hash_map::Entry, HashMap}, convert::TryInto, - sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, + sync::{ + atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, + Arc, Weak, + }, + time::Duration, }; use anyhow::Context; @@ -165,6 +169,8 @@ struct Slot { struct SlotInner { key: Option, + // for `coalesce_readers_permit` + permit: std::sync::Mutex>, buf: &'static mut [u8; PAGE_SZ], } @@ -207,6 +213,22 @@ impl Slot { } } +impl SlotInner { + /// If there is aready a reader, drop our permit and share its permit, just like we share read access. + fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc { + let mut guard = self.permit.lock().unwrap(); + if let Some(existing_permit) = guard.upgrade() { + drop(guard); + drop(permit); + existing_permit + } else { + let permit = Arc::new(permit); + *guard = Arc::downgrade(&permit); + permit + } + } +} + pub struct PageCache { /// This contains the mapping from the cache key to buffer slot that currently /// contains the page, if any. @@ -224,6 +246,8 @@ pub struct PageCache { /// The actual buffers with their metadata. slots: Box<[Slot]>, + pinned_slots: Arc, + /// Index of the next candidate to evict, for the Clock replacement algorithm. /// This is interpreted modulo the page cache size. next_evict_slot: AtomicUsize, @@ -231,23 +255,28 @@ pub struct PageCache { size_metrics: &'static PageCacheSizeMetrics, } +struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit); + /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked /// until the guard is dropped. /// -pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>); +pub struct PageReadGuard<'i> { + _permit: Arc, + slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>, +} impl std::ops::Deref for PageReadGuard<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { - self.0.buf + self.slot_guard.buf } } impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { fn as_ref(&self) -> &[u8; PAGE_SZ] { - self.0.buf + self.slot_guard.buf } } @@ -264,6 +293,8 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { pub struct PageWriteGuard<'i> { inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>, + _permit: PinnedSlotsPermit, + // Are the page contents currently valid? // Used to mark pages as invalid that are assigned but not yet filled with data. valid: bool, @@ -348,6 +379,10 @@ impl PageCache { lsn: Lsn, ctx: &RequestContext, ) -> Option<(Lsn, PageReadGuard)> { + let Ok(permit) = self.try_get_pinned_slot_permit().await else { + return None; + }; + crate::metrics::PAGE_CACHE .for_ctx(ctx) .read_accesses_materialized_page @@ -362,7 +397,10 @@ impl PageCache { lsn, }; - if let Some(guard) = self.try_lock_for_read(&mut cache_key).await { + if let Some(guard) = self + .try_lock_for_read(&mut cache_key, &mut Some(permit)) + .await + { if let CacheKey::MaterializedPage { hash_key: _, lsn: available_lsn, @@ -445,6 +483,29 @@ impl PageCache { // "mappings" after this section. But the routines in this section should // not require changes. + async fn try_get_pinned_slot_permit(&self) -> anyhow::Result { + let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer(); + match tokio::time::timeout( + // Choose small timeout, neon_smgr does its own retries. + // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869 + Duration::from_secs(10), + Arc::clone(&self.pinned_slots).acquire_owned(), + ) + .await + { + Ok(res) => Ok(PinnedSlotsPermit( + res.expect("this semaphore is never closed"), + )), + Err(_timeout) => { + timer.stop_and_discard(); + crate::metrics::page_cache_errors_inc( + crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, + ); + anyhow::bail!("timeout: there were page guards alive for all page cache slots") + } + } + } + /// Look up a page in the cache. /// /// If the search criteria is not exact, *cache_key is updated with the key @@ -454,7 +515,11 @@ impl PageCache { /// /// If no page is found, returns None and *cache_key is left unmodified. /// - async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option { + async fn try_lock_for_read( + &self, + cache_key: &mut CacheKey, + permit: &mut Option, + ) -> Option { let cache_key_orig = cache_key.clone(); if let Some(slot_idx) = self.search_mapping(cache_key) { // The page was found in the mapping. Lock the slot, and re-check @@ -464,7 +529,10 @@ impl PageCache { let inner = slot.inner.read().await; if inner.key.as_ref() == Some(cache_key) { slot.inc_usage_count(); - return Some(PageReadGuard(inner)); + return Some(PageReadGuard { + _permit: inner.coalesce_readers_permit(permit.take().unwrap()), + slot_guard: inner, + }); } else { // search_mapping might have modified the search key; restore it. *cache_key = cache_key_orig; @@ -507,6 +575,8 @@ impl PageCache { cache_key: &mut CacheKey, ctx: &RequestContext, ) -> anyhow::Result { + let mut permit = Some(self.try_get_pinned_slot_permit().await?); + let (read_access, hit) = match cache_key { CacheKey::MaterializedPage { .. } => { unreachable!("Materialized pages use lookup_materialized_page") @@ -523,17 +593,21 @@ impl PageCache { let mut is_first_iteration = true; loop { // First check if the key already exists in the cache. - if let Some(read_guard) = self.try_lock_for_read(cache_key).await { + if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await { + debug_assert!(permit.is_none()); if is_first_iteration { hit.inc(); } return Ok(ReadBufResult::Found(read_guard)); } + debug_assert!(permit.is_some()); is_first_iteration = false; // Not found. Find a victim buffer - let (slot_idx, mut inner) = - self.find_victim().context("Failed to find evict victim")?; + let (slot_idx, mut inner) = self + .find_victim(permit.as_ref().unwrap()) + .await + .context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. In that case, we evicted @@ -555,7 +629,16 @@ impl PageCache { inner.key = Some(cache_key.clone()); slot.set_usage_count(1); + debug_assert!( + { + let guard = inner.permit.lock().unwrap(); + guard.upgrade().is_none() + }, + "we hold a write lock, so, no one else should have a permit" + ); + return Ok(ReadBufResult::NotFound(PageWriteGuard { + _permit: permit.take().unwrap(), inner, valid: false, })); @@ -566,7 +649,11 @@ impl PageCache { /// found, returns None. /// /// When locking a page for writing, the search criteria is always "exact". - async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option { + async fn try_lock_for_write( + &self, + cache_key: &CacheKey, + permit: &mut Option, + ) -> Option { if let Some(slot_idx) = self.search_mapping_for_write(cache_key) { // The page was found in the mapping. Lock the slot, and re-check // that it's still what we expected (because we don't released the mapping @@ -575,7 +662,18 @@ impl PageCache { let inner = slot.inner.write().await; if inner.key.as_ref() == Some(cache_key) { slot.inc_usage_count(); - return Some(PageWriteGuard { inner, valid: true }); + debug_assert!( + { + let guard = inner.permit.lock().unwrap(); + guard.upgrade().is_none() + }, + "we hold a write lock, so, no one else should have a permit" + ); + return Some(PageWriteGuard { + _permit: permit.take().unwrap(), + inner, + valid: true, + }); } } None @@ -586,15 +684,20 @@ impl PageCache { /// Similar to lock_for_read(), but the returned buffer is write-locked and /// may be modified by the caller even if it's already found in the cache. async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result { + let mut permit = Some(self.try_get_pinned_slot_permit().await?); loop { // First check if the key already exists in the cache. - if let Some(write_guard) = self.try_lock_for_write(cache_key).await { + if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await { + debug_assert!(permit.is_none()); return Ok(WriteBufResult::Found(write_guard)); } + debug_assert!(permit.is_some()); // Not found. Find a victim buffer - let (slot_idx, mut inner) = - self.find_victim().context("Failed to find evict victim")?; + let (slot_idx, mut inner) = self + .find_victim(permit.as_ref().unwrap()) + .await + .context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. In that case, we evicted @@ -616,7 +719,16 @@ impl PageCache { inner.key = Some(cache_key.clone()); slot.set_usage_count(1); + debug_assert!( + { + let guard = inner.permit.lock().unwrap(); + guard.upgrade().is_none() + }, + "we hold a write lock, so, no one else should have a permit" + ); + return Ok(WriteBufResult::NotFound(PageWriteGuard { + _permit: permit.take().unwrap(), inner, valid: false, })); @@ -769,7 +881,10 @@ impl PageCache { /// Find a slot to evict. /// /// On return, the slot is empty and write-locked. - fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard)> { + async fn find_victim( + &self, + _permit_witness: &PinnedSlotsPermit, + ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard)> { let iter_limit = self.slots.len() * 10; let mut iters = 0; loop { @@ -782,13 +897,40 @@ impl PageCache { let mut inner = match slot.inner.try_write() { Ok(inner) => inner, Err(_err) => { - // If we have looped through the whole buffer pool 10 times - // and still haven't found a victim buffer, something's wrong. - // Maybe all the buffers were in locked. That could happen in - // theory, if you have more threads holding buffers locked than - // there are buffers in the pool. In practice, with a reasonably - // large buffer pool it really shouldn't happen. if iters > iter_limit { + // NB: Even with the permits, there's no hard guarantee that we will find a slot with + // any particular number of iterations: other threads might race ahead and acquire and + // release pins just as we're scanning the array. + // + // Imagine that nslots is 2, and as starting point, usage_count==1 on all + // slots. There are two threads running concurrently, A and B. A has just + // acquired the permit from the semaphore. + // + // A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search + // B: Acquire permit. + // B: Look at slot 2, decrement its usage_count to zero and continue the search + // B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1. + // B: Release pin and permit again + // B: Acquire permit. + // B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1. + // B: Release pin and permit again + // + // Now we're back in the starting situation that both slots have + // usage_count 1, but A has now been through one iteration of the + // find_victim() loop. This can repeat indefinitely and on each + // iteration, A's iteration count increases by one. + // + // So, even though the semaphore for the permits is fair, the victim search + // itself happens in parallel and is not fair. + // Hence even with a permit, a task can theoretically be starved. + // To avoid this, we'd need tokio to give priority to tasks that are holding + // permits for longer. + // Note that just yielding to tokio during iteration without such + // priority boosting is likely counter-productive. We'd just give more opportunities + // for B to bump usage count, further starving A. + crate::metrics::page_cache_errors_inc( + crate::metrics::PageCacheErrorKind::EvictIterLimit, + ); anyhow::bail!("exceeded evict iter limit"); } continue; @@ -799,6 +941,7 @@ impl PageCache { self.remove_mapping(old_key); inner.key = None; } + crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64); return Ok((slot_idx, inner)); } } @@ -826,7 +969,11 @@ impl PageCache { let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap(); Slot { - inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }), + inner: tokio::sync::RwLock::new(SlotInner { + key: None, + buf, + permit: std::sync::Mutex::new(Weak::new()), + }), usage_count: AtomicU8::new(0), } }) @@ -838,6 +985,7 @@ impl PageCache { slots, next_evict_slot: AtomicUsize::new(0), size_metrics, + pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)), } } } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 047fa761c3..7a94c3449d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,7 +37,7 @@ impl Key { | self.field6 as i128 } - pub fn from_i128(x: i128) -> Self { + pub const fn from_i128(x: i128) -> Self { Key { field1: ((x >> 120) & 0xf) as u8, field2: ((x >> 104) & 0xFFFF) as u32, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 650bc119b6..017322ffb2 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -456,7 +456,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(1).await; + shutdown_pageserver(None, 1).await; } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1c92c618fa..47bfd4a8ef 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::DeletionQueueClient; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT_ACTIVATION; @@ -117,7 +118,7 @@ mod span; pub mod metadata; mod par_fsync; -mod remote_timeline_client; +pub mod remote_timeline_client; pub mod storage_layer; pub mod config; @@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: Option, + pub deletion_queue_client: DeletionQueueClient, } /// @@ -197,6 +199,9 @@ pub struct Tenant { // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, + // Access to global deletion queue for when this tenant wants to schedule a deletion + deletion_queue_client: DeletionQueueClient, + /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, @@ -523,15 +528,20 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, generation: Generation, - broker_client: storage_broker::BrokerClientChannel, + resources: TenantSharedResources, tenants: &'static tokio::sync::RwLock, - remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> anyhow::Result> { // TODO dedup with spawn_load let tenant_conf = Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?; + let TenantSharedResources { + broker_client, + remote_storage, + deletion_queue_client, + } = resources; + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( TenantState::Attaching, @@ -540,7 +550,8 @@ impl Tenant { wal_redo_manager, tenant_id, generation, - Some(remote_storage.clone()), + remote_storage.clone(), + deletion_queue_client, )); // Do all the hard work in the background @@ -571,7 +582,7 @@ impl Tenant { let pending_deletion = { match DeleteTenantFlow::should_resume_deletion( conf, - Some(&remote_storage), + remote_storage.as_ref(), &tenant_clone, ) .await @@ -660,6 +671,7 @@ impl Tenant { for timeline_id in remote_timeline_ids { let client = RemoteTimelineClient::new( remote_storage.clone(), + self.deletion_queue_client.clone(), self.conf, self.tenant_id, timeline_id, @@ -726,6 +738,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client: Some(remote_client), + deletion_queue_client: self.deletion_queue_client.clone(), }, ctx, ) @@ -750,6 +763,7 @@ impl Tenant { timeline_id, &index_part.metadata, Some(remote_timeline_client), + self.deletion_queue_client.clone(), None, ) .await @@ -851,6 +865,7 @@ impl Tenant { tenant_id, Generation::broken(), None, + DeletionQueueClient::broken(), )) } @@ -895,6 +910,7 @@ impl Tenant { tenant_id, generation, remote_storage.clone(), + resources.deletion_queue_client.clone(), ); let tenant = Arc::new(tenant); @@ -1302,6 +1318,7 @@ impl Tenant { timeline_id, &local_metadata, Some(remote_client), + self.deletion_queue_client.clone(), init_order, ) .await @@ -1351,6 +1368,7 @@ impl Tenant { timeline_id, &local_metadata, None, + self.deletion_queue_client.clone(), init_order, ) .await @@ -2242,6 +2260,9 @@ impl Tenant { Ok(timeline) } + // Allow too_many_arguments because a constructor's argument list naturally grows with the + // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. + #[allow(clippy::too_many_arguments)] fn new( state: TenantState, conf: &'static PageServerConf, @@ -2250,6 +2271,7 @@ impl Tenant { tenant_id: TenantId, generation: Generation, remote_storage: Option, + deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -2317,6 +2339,7 @@ impl Tenant { gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, + deletion_queue_client, state, cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), @@ -2856,6 +2879,7 @@ impl Tenant { let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { let remote_client = RemoteTimelineClient::new( remote_storage.clone(), + self.deletion_queue_client.clone(), self.conf, self.tenant_id, timeline_id, @@ -2866,7 +2890,10 @@ impl Tenant { None }; - TimelineResources { remote_client } + TimelineResources { + remote_client, + deletion_queue_client: self.deletion_queue_client.clone(), + } } /// Creates intermediate timeline structure and its files. @@ -3322,6 +3349,7 @@ pub mod harness { use utils::logging; use utils::lsn::Lsn; + use crate::deletion_queue::mock::MockDeletionQueue; use crate::{ config::PageServerConf, repository::Key, @@ -3383,6 +3411,7 @@ pub mod harness { pub generation: Generation, pub remote_storage: GenericRemoteStorage, pub remote_fs_dir: PathBuf, + pub deletion_queue: MockDeletionQueue, } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); @@ -3431,6 +3460,7 @@ pub mod harness { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { conf, @@ -3439,6 +3469,7 @@ pub mod harness { generation: Generation::new(0xdeadbeef), remote_storage, remote_fs_dir, + deletion_queue, }) } @@ -3463,6 +3494,7 @@ pub mod harness { self.tenant_id, self.generation, Some(self.remote_storage.clone()), + self.deletion_queue.new_client(), )); tenant .load(None, ctx) @@ -4193,7 +4225,8 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let harness = TenantHarness::create("test_bulk_insert")?; + let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4240,7 +4273,8 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let harness = TenantHarness::create("test_random_updates")?; + let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 74faee1115..17bcc9eb5f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,7 +20,10 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::control_plane_client::ControlPlaneClient; +use crate::control_plane_client::{ + ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, +}; +use crate::deletion_queue::DeletionQueueClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::delete::DeleteTenantFlow; @@ -116,7 +119,28 @@ pub async fn init_tenant_mgr( // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) { - Some(client.re_attach().await?) + let result = match client.re_attach().await { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + anyhow::bail!("Shut down while waiting for control plane re-attach response") + } + }; + + // The deletion queue needs to know about the startup attachment state to decide which (if any) stored + // deletion list entries may still be valid. We provide that by pushing a recovery operation into + // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions + // are processed, even though we don't block on recovery completing here. + // + // Must only do this if remote storage is enabled, otherwise deletion queue + // is not running and channel push will fail. + if resources.remote_storage.is_some() { + resources + .deletion_queue_client + .recover(result.clone()) + .await?; + } + + Some(result) } else { info!("Control plane API not configured, tenant generations are disabled"); None @@ -285,29 +309,21 @@ pub(crate) fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); - if let Some(remote_storage) = resources.remote_storage { - match Tenant::spawn_attach( - conf, - tenant_id, - generation, - resources.broker_client, - tenants, - remote_storage, - ctx, - ) { - Ok(tenant) => tenant, - Err(e) => { - error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) - } - } - } else { + if resources.remote_storage.is_none() { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant( conf, tenant_id, "attaching mark file present but no remote storage configured".to_string(), ) + } else { + match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) { + Ok(tenant) => tenant, + Err(e) => { + error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) + } + } } } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); @@ -438,8 +454,7 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, generation: Generation, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + resources: TenantSharedResources, ctx: &RequestContext, ) -> Result, TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -450,13 +465,9 @@ pub async fn create_tenant( // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 - let tenant_resources = TenantSharedResources { - broker_client, - remote_storage, - }; let created_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, - generation, tenant_resources, None, &TENANTS, ctx)?; + generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -622,6 +633,7 @@ pub async fn load_tenant( generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -635,6 +647,7 @@ pub async fn load_tenant( let resources = TenantSharedResources { broker_client, remote_storage, + deletion_queue_client }; let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx) .with_context(|| { @@ -702,8 +715,7 @@ pub async fn attach_tenant( tenant_id: TenantId, generation: Generation, tenant_conf: TenantConfOpt, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: GenericRemoteStorage, + resources: TenantSharedResources, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -718,10 +730,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let resources = TenantSharedResources { - broker_client, - remote_storage: Some(remote_storage), - }; + let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 6f42b54ac2..ee99151ef2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -116,8 +116,12 @@ //! # Completion //! //! Once an operation has completed, we update -//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates -//! to safekeepers that they can delete the WAL up to that LSN. +//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately, +//! and submit a request through the DeletionQueue to update +//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has +//! validated that our generation is not stale. It is this visible value +//! that is advertized to safekeepers as a signal that that they can +//! delete the WAL up to that LSN. //! //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait //! for all pending operations to complete. It does not prevent more @@ -200,7 +204,6 @@ //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map -mod delete; mod download; pub mod index; mod upload; @@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -324,6 +328,8 @@ pub struct RemoteTimelineClient { metrics: Arc, storage_impl: GenericRemoteStorage, + + deletion_queue_client: DeletionQueueClient, } impl RemoteTimelineClient { @@ -335,6 +341,7 @@ impl RemoteTimelineClient { /// pub fn new( remote_storage: GenericRemoteStorage, + deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_id: TenantId, timeline_id: TimelineId, @@ -352,6 +359,7 @@ impl RemoteTimelineClient { timeline_id, generation, storage_impl: remote_storage, + deletion_queue_client, upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), } @@ -413,13 +421,24 @@ impl RemoteTimelineClient { Ok(()) } - pub fn last_uploaded_consistent_lsn(&self) -> Option { - match &*self.upload_queue.lock().unwrap() { + pub fn remote_consistent_lsn_projected(&self) -> Option { + match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, - UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn), - UploadQueue::Stopped(q) => { - Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn) - } + UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), + UploadQueue::Stopped(q) => q + .upload_queue_for_deletion + .get_last_remote_consistent_lsn_projected(), + } + } + + pub fn remote_consistent_lsn_visible(&self) -> Option { + match &mut *self.upload_queue.lock().unwrap() { + UploadQueue::Uninitialized => None, + UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), + UploadQueue::Stopped(q) => Some( + q.upload_queue_for_deletion + .get_last_remote_consistent_lsn_visible(), + ), } } @@ -434,11 +453,11 @@ impl RemoteTimelineClient { } else { 0 }; - self.metrics.remote_physical_size_gauge().set(size); + self.metrics.remote_physical_size_set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_gauge().get() + self.metrics.remote_physical_size_get() } // @@ -643,7 +662,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: Vec, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -663,10 +682,10 @@ impl RemoteTimelineClient { // Decorate our list of names with each name's generation, dropping // makes that are unexpectedly missing from our metadata. let with_generations: Vec<_> = names - .iter() + .into_iter() .filter_map(|name| { // Remove from latest_files, learning the file's remote generation in the process - let meta = upload_queue.latest_files.remove(name); + let meta = upload_queue.latest_files.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -688,19 +707,17 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue, metadata); } - // schedule the actual deletions - for (name, generation) in with_generations { - let op = UploadOp::Delete(Delete { - file_kind: RemoteOpFileKind::Layer, - layer_file_name: name.clone(), - scheduled_from_timeline_delete: false, - generation, - }); - self.calls_unfinished_metric_begin(&op); - upload_queue.queued_operations.push_back(op); - info!("scheduled layer file deletion {name}"); + for (name, gen) in &with_generations { + info!("scheduling deletion of layer {}{}", name, gen.get_suffix()); } + // schedule the actual deletions + let op = UploadOp::Delete(Delete { + layers: with_generations, + }); + self.calls_unfinished_metric_begin(&op); + upload_queue.queued_operations.push_back(op); + // Launch the tasks immediately, if possible self.launch_queued_tasks(upload_queue); }; @@ -833,9 +850,7 @@ impl RemoteTimelineClient { pub(crate) async fn delete_all(self: &Arc) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_and_timeline_id(); - let (mut receiver, deletions_queued) = { - let mut deletions_queued = 0; - + let layers: Vec = { let mut locked = self.upload_queue.lock().unwrap(); let stopped = locked.stopped_mut()?; @@ -847,42 +862,30 @@ impl RemoteTimelineClient { stopped .upload_queue_for_deletion - .queued_operations - .reserve(stopped.upload_queue_for_deletion.latest_files.len()); - - // schedule the actual deletions - for (name, meta) in &stopped.upload_queue_for_deletion.latest_files { - let op = UploadOp::Delete(Delete { - file_kind: RemoteOpFileKind::Layer, - layer_file_name: name.clone(), - scheduled_from_timeline_delete: true, - generation: meta.generation, - }); - - self.calls_unfinished_metric_begin(&op); - stopped - .upload_queue_for_deletion - .queued_operations - .push_back(op); - - info!("scheduled layer file deletion {name}"); - deletions_queued += 1; - } - - self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion); - - ( - self.schedule_barrier(&mut stopped.upload_queue_for_deletion), - deletions_queued, - ) + .latest_files + .drain() + .map(|(file_name, meta)| { + remote_layer_path( + &self.tenant_id, + &self.timeline_id, + &file_name, + meta.generation, + ) + }) + .collect() }; - receiver.changed().await.context("upload queue shut down")?; + let layer_deletion_count = layers.len(); + self.deletion_queue_client.push_immediate(layers).await?; // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id); + // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // taking the burden of listing all the layers that we already know we should delete. + self.deletion_queue_client.flush_immediate().await?; + let remaining = backoff::retry( || async { self.storage_impl @@ -910,17 +913,9 @@ impl RemoteTimelineClient { }) .collect(); + let not_referenced_count = remaining.len(); if !remaining.is_empty() { - backoff::retry( - || async { self.storage_impl.delete_objects(&remaining).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "delete_objects", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")), - ) - .await - .context("delete_objects")?; + self.deletion_queue_client.push_immediate(remaining).await?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { @@ -931,18 +926,14 @@ impl RemoteTimelineClient { let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); - debug!("deleting index part"); + debug!("enqueuing index part deletion"); + self.deletion_queue_client + .push_immediate([index_file_path].to_vec()) + .await?; - backoff::retry( - || async { self.storage_impl.delete(&index_file_path).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "delete_index", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")), - ) - .await - .context("delete_index")?; + // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait + // for a flush to a persistent deletion list so that we may be sure deletion will occur. + self.deletion_queue_client.flush_immediate().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( @@ -950,7 +941,7 @@ impl RemoteTimelineClient { ))? }); - info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); + info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json"); Ok(()) } @@ -1140,21 +1131,16 @@ impl RemoteTimelineClient { } res } - UploadOp::Delete(delete) => { - let path = &self - .conf - .timeline_path(&self.tenant_id, &self.timeline_id) - .join(delete.layer_file_name.file_name()); - delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - delete.file_kind, - RemoteOpKind::Delete, - Arc::clone(&self.metrics), - ) - .await - } + UploadOp::Delete(delete) => self + .deletion_queue_client + .push_layers( + self.tenant_id, + self.timeline_id, + self.generation, + delete.layers.clone(), + ) + .await + .map_err(|e| anyhow::anyhow!(e)), UploadOp::Barrier(_) => { // unreachable. Barrier operations are handled synchronously in // launch_queued_tasks @@ -1210,18 +1196,12 @@ impl RemoteTimelineClient { } // The task has completed successfully. Remove it from the in-progress list. - { + let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(stopped) => { - // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion) - // then stop() took care of it so we just return. - // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc. - match &task.op { - UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion), - _ => None - } + UploadQueue::Stopped(_stopped) => { + None }, UploadQueue::Initialized(qi) => { Some(qi) } }; @@ -1236,23 +1216,51 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); - match task.op { + let lsn_update = match task.op { UploadOp::UploadLayer(_, _) => { upload_queue.num_inprogress_layer_uploads -= 1; + None } UploadOp::UploadMetadata(_, lsn) => { upload_queue.num_inprogress_metadata_uploads -= 1; - upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check? + // XXX monotonicity check? + + upload_queue.projected_remote_consistent_lsn = Some(lsn); + if self.generation.is_none() { + // Legacy mode: skip validating generation + upload_queue.visible_remote_consistent_lsn.store(lsn); + None + } else { + Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) + } } UploadOp::Delete(_) => { upload_queue.num_inprogress_deletions -= 1; + None } UploadOp::Barrier(_) => unreachable!(), }; // Launch any queued tasks that were unblocked by this one. self.launch_queued_tasks(upload_queue); + lsn_update + }; + + if let Some((lsn, slot)) = lsn_update { + // Updates to the remote_consistent_lsn we advertise to pageservers + // are all routed through the DeletionQueue, to enforce important + // data safety guarantees (see docs/rfcs/025-generation-numbers.md) + self.deletion_queue_client + .update_remote_consistent_lsn( + self.tenant_id, + self.timeline_id, + self.generation, + lsn, + slot, + ) + .await; } + self.calls_unfinished_metric_end(&task.op); } @@ -1278,8 +1286,8 @@ impl RemoteTimelineClient { reason: "metadata uploads are tiny", }, ), - UploadOp::Delete(delete) => ( - delete.file_kind, + UploadOp::Delete(_delete) => ( + RemoteOpFileKind::Layer, RemoteOpKind::Delete, DontTrackSize { reason: "should we track deletes? positive or negative sign?", @@ -1341,7 +1349,10 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), - last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn, + projected_remote_consistent_lsn: None, + visible_remote_consistent_lsn: initialized + .visible_remote_consistent_lsn + .clone(), num_inprogress_layer_uploads: 0, num_inprogress_metadata_uploads: 0, num_inprogress_deletions: 0, @@ -1405,13 +1416,13 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, layer_file_name: &LayerFileName, - layer_meta: &LayerFileMetadata, + generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", layer_file_name.file_name(), - layer_meta.generation.get_suffix() + generation.get_suffix() ); RemotePath::from_string(&path).expect("Failed to construct path") @@ -1554,7 +1565,6 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { - // Use a current-thread runtime in the test let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = harness.load().await; @@ -1580,6 +1590,7 @@ mod tests { timeline_id: TIMELINE_ID, generation, storage_impl: self.harness.remote_storage.clone(), + deletion_queue_client: self.harness.deletion_queue.new_client(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( &self.harness.tenant_id, @@ -1749,7 +1760,7 @@ mod tests { ) .unwrap(); client - .schedule_layer_file_deletion(&[layer_file_name_1.clone()]) + .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec()) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -1775,6 +1786,7 @@ mod tests { // Finish them client.wait_completion().await.unwrap(); + harness.deletion_queue.pump().await; assert_remote_files( &[ diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs deleted file mode 100644 index 7324559223..0000000000 --- a/pageserver/src/tenant/remote_timeline_client/delete.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! Helper functions to delete files from remote storage with a RemoteStorage -use anyhow::Context; -use std::path::Path; -use tracing::debug; - -use remote_storage::GenericRemoteStorage; - -use crate::{ - config::PageServerConf, - tenant::{remote_timeline_client::remote_path, Generation}, -}; - -pub(super) async fn delete_layer<'a>( - conf: &'static PageServerConf, - storage: &'a GenericRemoteStorage, - local_layer_path: &'a Path, - generation: Generation, -) -> anyhow::Result<()> { - fail::fail_point!("before-delete-layer", |_| { - anyhow::bail!("failpoint before-delete-layer") - }); - debug!("Deleting layer from remote storage: {local_layer_path:?}",); - - let path_to_delete = remote_path(conf, local_layer_path, generation)?; - - // We don't want to print an error if the delete failed if the file has - // already been deleted. Thankfully, in this situation S3 already - // does not yield an error. While OS-provided local file system APIs do yield - // errors, we avoid them in the `LocalFs` wrapper. - storage - .delete(&path_to_delete) - .await - .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}")) -} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 9863215529..5c173c613f 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>( .timeline_path(&tenant_id, &timeline_id) .join(layer_file_name.file_name()); - let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata); + let remote_path = remote_layer_path( + &tenant_id, + &timeline_id, + layer_file_name, + layer_metadata.generation, + ); // Perform a rename inspired by durable_rename from file_utils.c. // The sequence: diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 78ac1338db..9b62ba1c50 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime}; use crate::context::{ AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, }; +use crate::deletion_queue::DeletionQueueClient; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ @@ -143,6 +144,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: Option, + pub deletion_queue_client: DeletionQueueClient, } pub struct Timeline { @@ -521,9 +523,23 @@ impl Timeline { self.disk_consistent_lsn.load() } - pub fn get_remote_consistent_lsn(&self) -> Option { + /// remote_consistent_lsn from the perspective of the tenant's current generation, + /// not validated with control plane yet. + /// See [`Self::get_remote_consistent_lsn_visible`]. + pub fn get_remote_consistent_lsn_projected(&self) -> Option { if let Some(remote_client) = &self.remote_client { - remote_client.last_uploaded_consistent_lsn() + remote_client.remote_consistent_lsn_projected() + } else { + None + } + } + + /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, + /// i.e. a value of remote_consistent_lsn_projected which has undergone + /// generation validation in the deletion queue. + pub fn get_remote_consistent_lsn_visible(&self) -> Option { + if let Some(remote_client) = &self.remote_client { + remote_client.remote_consistent_lsn_visible() } else { None } @@ -543,7 +559,7 @@ impl Timeline { } pub fn resident_physical_size(&self) -> u64 { - self.metrics.resident_physical_size_gauge.get() + self.metrics.resident_physical_size_get() } /// @@ -1293,10 +1309,7 @@ impl Timeline { // will treat the file as a local layer again, count it towards resident size, // and it'll be like the layer removal never happened. // The bump in resident size is perhaps unexpected but overall a robust behavior. - self.metrics - .resident_physical_size_gauge - .sub(layer_file_size); - + self.metrics.resident_physical_size_sub(layer_file_size); self.metrics.evictions.inc(); if let Some(delta) = local_layer_residence_duration { @@ -1820,7 +1833,7 @@ impl Timeline { for (layer, m) in needs_upload { rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?; } - rtc.schedule_layer_file_deletion(&needs_cleanup)?; + rtc.schedule_layer_file_deletion(needs_cleanup)?; rtc.schedule_index_upload_for_file_changes()?; // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. @@ -1830,9 +1843,7 @@ impl Timeline { "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size ); - self.metrics - .resident_physical_size_gauge - .set(total_physical_size); + self.metrics.resident_physical_size_set(total_physical_size); timer.stop_and_record(); Ok(()) @@ -3875,7 +3886,7 @@ impl Timeline { // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + remote_client.schedule_layer_file_deletion(layer_names_to_delete)?; } Ok(()) @@ -4210,7 +4221,7 @@ impl Timeline { } if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + remote_client.schedule_layer_file_deletion(layer_names_to_delete)?; } apply.flush(); @@ -4382,7 +4393,7 @@ impl Timeline { // XXX the temp file is still around in Err() case // and consumes space until we clean up upon pageserver restart. - self_clone.metrics.resident_physical_size_gauge.add(*size); + self_clone.metrics.resident_physical_size_add(*size); // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 18588cf0fd..7d55388f44 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,6 +14,7 @@ use utils::{ use crate::{ config::PageServerConf, + deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, @@ -407,6 +408,7 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: Option, + deletion_queue_client: DeletionQueueClient, init_order: Option<&InitializationOrder>, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. @@ -416,7 +418,10 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { remote_client }, + TimelineResources { + remote_client, + deletion_queue_client, + }, init_order, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 3c88d31f24..0a387bd779 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -263,7 +263,7 @@ impl LayerManager { let desc = layer.layer_desc(); if !layer.is_remote_layer() { layer.delete_resident_layer_file()?; - metrics.resident_physical_size_gauge.sub(desc.file_size); + metrics.resident_physical_size_sub(desc.file_size); } // TODO Removing from the bottom of the layer map is expensive. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7d1e9b4a39..0831b9ceda 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection( })?; if let Some(last_lsn) = status_update { - let timeline_remote_consistent_lsn = - timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let timeline_remote_consistent_lsn = timeline + .get_remote_consistent_lsn_visible() + .unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. let last_received_lsn = last_lsn; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 28822335b0..08b1cb8866 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,5 +1,3 @@ -use crate::metrics::RemoteOpFileKind; - use super::storage_layer::LayerFileName; use super::Generation; use crate::tenant::metadata::TimelineMetadata; @@ -11,6 +9,7 @@ use std::fmt::Debug; use chrono::NaiveDateTime; use std::sync::Arc; use tracing::info; +use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; @@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized { /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. /// Safekeeper can rely on it to make decisions for WAL storage. - pub(crate) last_uploaded_consistent_lsn: Lsn, + /// + /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// the control plane (unlesss a timeline's generation is None, in which case + /// we skip validation) + pub(crate) projected_remote_consistent_lsn: Option, + pub(crate) visible_remote_consistent_lsn: Arc, // Breakdown of different kinds of tasks currently in-progress pub(crate) num_inprogress_layer_uploads: usize, @@ -81,6 +85,14 @@ impl UploadQueueInitialized { pub(super) fn no_pending_work(&self) -> bool { self.inprogress_tasks.is_empty() && self.queued_operations.is_empty() } + + pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn { + self.visible_remote_consistent_lsn.load() + } + + pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { + self.projected_remote_consistent_lsn + } } #[derive(Clone, Copy)] @@ -114,9 +126,8 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), - // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent - // safekeepers from garbage-collecting anything. - last_uploaded_consistent_lsn: Lsn(0), + projected_remote_consistent_lsn: None, + visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, num_inprogress_layer_uploads: 0, @@ -158,7 +169,10 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), - last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(), + projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), + visible_remote_consistent_lsn: Arc::new( + index_part.metadata.disk_consistent_lsn().into(), + ), // what follows are boring default initializations task_counter: 0, num_inprogress_layer_uploads: 0, @@ -201,12 +215,11 @@ pub(crate) struct UploadTask { pub(crate) op: UploadOp, } +/// A deletion of some layers within the lifetime of a timeline. This is not used +/// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) file_kind: RemoteOpFileKind, - pub(crate) layer_file_name: LayerFileName, - pub(crate) scheduled_from_timeline_delete: bool, - pub(crate) generation: Generation, + pub(crate) layers: Vec<(LayerFileName, Generation)>, } #[derive(Debug)] @@ -217,7 +230,7 @@ pub(crate) enum UploadOp { /// Upload the metadata file UploadMetadata(IndexPart, Lsn), - /// Delete a layer file + /// Delete layer files Delete(Delete), /// Barrier. When the barrier operation is reached, @@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp { UploadOp::UploadMetadata(_, lsn) => { write!(f, "UploadMetadata(lsn: {})", lsn) } - UploadOp::Delete(delete) => write!( - f, - "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})", - delete.layer_file_name.file_name(), - delete.scheduled_from_timeline_delete, - delete.generation - ), + UploadOp::Delete(delete) => { + write!(f, "Delete({} layers)", delete.layers.len(),) + } UploadOp::Barrier(_) => write!(f, "Barrier"), } } diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index debbbce117..8b0035b8e8 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -741,6 +741,13 @@ NeonProcessUtility( break; case T_DropdbStmt: HandleDropDb(castNode(DropdbStmt, parseTree)); + /* + * We do this here to hack around the fact that Postgres performs the drop + * INSIDE of standard_ProcessUtility, which means that if we try to + * abort the drop normally it'll be too late. DROP DATABASE can't be inside + * of a transaction block anyway, so this should be fine to do. + */ + NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL); break; case T_CreateRoleStmt: HandleCreateRole(castNode(CreateRoleStmt, parseTree)); diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 4be75e1dad..22f20a4c0b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -14,7 +14,6 @@ */ #include -#include #include #include @@ -38,9 +37,6 @@ #include "storage/fd.h" #include "storage/pg_shmem.h" #include "storage/buf_internals.h" -#include "storage/procsignal.h" -#include "postmaster/bgworker.h" -#include "postmaster/interrupt.h" /* * Local file cache is used to temporary store relations pages in local file system. @@ -66,9 +62,6 @@ #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) -#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */ -#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */ - typedef struct FileCacheEntry { BufferTag key; @@ -91,14 +84,12 @@ static int lfc_desc = 0; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; -static int lfc_free_space_watermark; static char* lfc_path; static FileCacheControl* lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif -static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */ void FileCacheMonitorMain(Datum main_arg); @@ -254,80 +245,6 @@ lfc_change_limit_hook(int newval, void *extra) LWLockRelease(lfc_lock); } -/* - * Local file system state monitor check available free space. - * If it is lower than lfc_free_space_watermark then we shrink size of local cache - * but throwing away least recently accessed chunks. - * First time low space watermark is reached cache size is divided by two, - * second time by four,... Finally we remove all chunks from local cache. - * - * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler. - * We only throw away cached chunks but do not prevent from filling cache by new chunks. - * - * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark - * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second. - * Calling statvfs each second should not add any noticeable overhead. - */ -void -FileCacheMonitorMain(Datum main_arg) -{ - /* - * Choose file system state monitor interval so that space can not be exosted - * during this period but not longer than MAX_MONITOR_INTERVAL (10 sec) - */ - uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE); - - /* Establish signal handlers. */ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, SignalHandlerForShutdownRequest); - BackgroundWorkerUnblockSignals(); - - /* Periodically dump buffers until terminated. */ - while (!ShutdownRequestPending) - { - if (lfc_size_limit != 0) - { - struct statvfs sfs; - if (statvfs(lfc_path, &sfs) < 0) - { - elog(WARNING, "Failed to obtain status of %s: %m", lfc_path); - } - else - { - if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB) - { - if (lfc_shrinking_factor < 31) { - lfc_shrinking_factor += 1; - } - lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL); - } - else - lfc_shrinking_factor = 0; /* reset to initial value */ - } - } - pg_usleep(monitor_interval); - } -} - -static void -lfc_register_free_space_monitor(void) -{ - BackgroundWorker bgw; - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} - void lfc_init(void) { @@ -364,19 +281,6 @@ lfc_init(void) lfc_change_limit_hook, NULL); - DefineCustomIntVariable("neon.free_space_watermark", - "Minimal free space in local file system after reaching which local file cache will be truncated", - NULL, - &lfc_free_space_watermark, - 1024, /* 1GB */ - 0, - INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MB, - NULL, - NULL, - NULL); - DefineCustomStringVariable("neon.file_cache_path", "Path to local file cache (can be raw device)", NULL, @@ -391,9 +295,6 @@ lfc_init(void) if (lfc_max_size == 0) return; - if (lfc_free_space_watermark != 0) - lfc_register_free_space_monitor(); - prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = lfc_shmem_startup; #if PG_VERSION_NUM>=150000 diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index cbab0c6f07..92498d3ecd 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,6 +42,7 @@ reqwest-middleware.workspace = true reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true +rustc-hash.workspace = true rustls-pemfile.workspace = true rustls.workspace = true scopeguard.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index ff73f2b625..03c9029862 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -160,6 +160,19 @@ impl BackendType<'_, ClientCredentials<'_>> { Test(_) => Some("test".to_owned()), } } + + /// Get username from the credentials. + pub fn get_user(&self) -> &str { + use BackendType::*; + + match self { + Console(_, creds) => creds.user, + Postgres(_, creds) => creds.user, + Link(_) => "link", + Test(_) => "test", + } + } + /// Authenticate the client via the requested backend, possibly using credentials. #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index e771e5d7ed..a7ef15d342 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -17,11 +17,12 @@ use std::{ use tokio::time; use tokio_postgres::AsyncMessage; -use crate::{auth, console}; +use crate::{ + auth, console, + metrics::{Ids, MetricCounter, USAGE_METRICS}, +}; use crate::{compute, config}; -use super::sql_over_http::MAX_RESPONSE_SIZE; - use crate::proxy::ConnectMechanism; use tracing::{error, warn}; @@ -400,7 +401,6 @@ async fn connect_to_compute_once( .user(&conn_info.username) .password(&conn_info.password) .dbname(&conn_info.dbname) - .max_backend_message_size(MAX_RESPONSE_SIZE) .connect_timeout(timeout) .connect(tokio_postgres::NoTls) .await?; @@ -412,6 +412,10 @@ async fn connect_to_compute_once( span.in_scope(|| { info!(%conn_info, %session, "new connection"); }); + let ids = Ids { + endpoint_id: node_info.aux.endpoint_id.to_string(), + branch_id: node_info.aux.branch_id.to_string(), + }; tokio::spawn( poll_fn(move |cx| { @@ -450,10 +454,18 @@ async fn connect_to_compute_once( Ok(Client { inner: client, session: tx, + ids, }) } pub struct Client { pub inner: tokio_postgres::Client, session: tokio::sync::watch::Sender, + ids: Ids, +} + +impl Client { + pub fn metrics(&self) -> Arc { + USAGE_METRICS.register(self.ids.clone()) + } } diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index fe57096105..b74b3e9646 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -3,10 +3,12 @@ use std::sync::Arc; use anyhow::bail; use futures::pin_mut; use futures::StreamExt; -use hashbrown::HashMap; use hyper::body::HttpBody; +use hyper::header; use hyper::http::HeaderName; use hyper::http::HeaderValue; +use hyper::Response; +use hyper::StatusCode; use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Map; @@ -16,7 +18,11 @@ use tokio_postgres::types::Type; use tokio_postgres::GenericClient; use tokio_postgres::IsolationLevel; use tokio_postgres::Row; +use tracing::error; +use tracing::instrument; use url::Url; +use utils::http::error::ApiError; +use utils::http::json::json_response; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; @@ -39,7 +45,6 @@ enum Payload { Batch(BatchQueryData), } -pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); @@ -182,7 +187,45 @@ pub async fn handle( sni_hostname: Option, conn_pool: Arc, session_id: uuid::Uuid, -) -> anyhow::Result<(Value, HashMap)> { +) -> Result, ApiError> { + let result = handle_inner(request, sni_hostname, conn_pool, session_id).await; + + let mut response = match result { + Ok(r) => r, + Err(e) => { + let message = format!("{:?}", e); + let code = match e.downcast_ref::() { + Some(e) => match e.code() { + Some(e) => serde_json::to_value(e.code()).unwrap(), + None => Value::Null, + }, + None => Value::Null, + }; + error!( + ?code, + "sql-over-http per-client task finished with an error: {e:#}" + ); + // TODO: this shouldn't always be bad request. + json_response( + StatusCode::BAD_REQUEST, + json!({ "message": message, "code": code }), + )? + } + }; + response.headers_mut().insert( + "Access-Control-Allow-Origin", + hyper::http::HeaderValue::from_static("*"), + ); + Ok(response) +} + +#[instrument(name = "sql-over-http", skip_all)] +async fn handle_inner( + request: Request, + sni_hostname: Option, + conn_pool: Arc, + session_id: uuid::Uuid, +) -> anyhow::Result> { // // Determine the destination and connection params // @@ -233,13 +276,18 @@ pub async fn handle( let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?; + let mut response = Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/json"); + // // Now execute the query and return the result // + let mut size = 0; let result = match payload { - Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode) - .await - .map(|x| (x, HashMap::default())), + Payload::Single(query) => { + query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await + } Payload::Batch(batch_query) => { let mut results = Vec::new(); let mut builder = client.inner.build_transaction(); @@ -254,7 +302,8 @@ pub async fn handle( } let transaction = builder.start().await?; for query in batch_query.queries { - let result = query_to_json(&transaction, query, raw_output, array_mode).await; + let result = + query_to_json(&transaction, query, &mut size, raw_output, array_mode).await; match result { Ok(r) => results.push(r), Err(e) => { @@ -264,26 +313,27 @@ pub async fn handle( } } transaction.commit().await?; - let mut headers = HashMap::default(); if txn_read_only { - headers.insert( + response = response.header( TXN_READ_ONLY.clone(), HeaderValue::try_from(txn_read_only.to_string())?, ); } if txn_deferrable { - headers.insert( + response = response.header( TXN_DEFERRABLE.clone(), HeaderValue::try_from(txn_deferrable.to_string())?, ); } if let Some(txn_isolation_level) = txn_isolation_level_raw { - headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); } - Ok((json!({ "results": results }), headers)) + Ok(json!({ "results": results })) } }; + let metrics = client.metrics(); + if allow_pool { let current_span = tracing::Span::current(); // return connection to the pool @@ -293,12 +343,30 @@ pub async fn handle( }); } - result + match result { + Ok(value) => { + // how could this possibly fail + let body = serde_json::to_string(&value).expect("json serialization should not fail"); + let len = body.len(); + let response = response + .body(Body::from(body)) + // only fails if invalid status code or invalid header/values are given. + // these are not user configurable so it cannot fail dynamically + .expect("building response payload should not fail"); + + // count the egress bytes - we miss the TLS and header overhead but oh well... + // moving this later in the stack is going to be a lot of effort and ehhhh + metrics.record_egress(len as u64); + Ok(response) + } + Err(e) => Err(e), + } } async fn query_to_json( client: &T, data: QueryData, + current_size: &mut usize, raw_output: bool, array_mode: bool, ) -> anyhow::Result { @@ -312,16 +380,10 @@ async fn query_to_json( // big. pin_mut!(row_stream); let mut rows: Vec = Vec::new(); - let mut current_size = 0; while let Some(row) = row_stream.next().await { let row = row?; - current_size += row.body_len(); + *current_size += row.body_len(); rows.push(row); - if current_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!( - "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" - )); - } } // grab the command tag and number of rows affected diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index fa66df0469..994a7de764 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -7,7 +7,6 @@ use crate::{ }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; -use hashbrown::HashMap; use hyper::{ server::{ accept, @@ -18,7 +17,6 @@ use hyper::{ }; use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; -use serde_json::{json, Value}; use std::{ convert::Infallible, @@ -204,44 +202,7 @@ async fn ws_handler( // TODO: that deserves a refactor as now this function also handles http json client besides websockets. // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id) - .instrument(info_span!("sql-over-http")) - .await; - let status_code = match result { - Ok(_) => StatusCode::OK, - Err(_) => StatusCode::BAD_REQUEST, - }; - let (json, headers) = match result { - Ok(r) => r, - Err(e) => { - let message = format!("{:?}", e); - let code = match e.downcast_ref::() { - Some(e) => match e.code() { - Some(e) => serde_json::to_value(e.code()).unwrap(), - None => Value::Null, - }, - None => Value::Null, - }; - error!( - ?code, - "sql-over-http per-client task finished with an error: {e:#}" - ); - ( - json!({ "message": message, "code": code }), - HashMap::default(), - ) - } - }; - json_response(status_code, json).map(|mut r| { - r.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); - for (k, v) in headers { - r.headers_mut().insert(k, v); - } - r - }) + sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") @@ -253,7 +214,7 @@ async fn ws_handler( .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code .body(Body::empty()) - .map_err(|e| ApiError::BadRequest(e.into())) + .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 9279002eb3..cfeec5622b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -3,9 +3,18 @@ use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use serde::Serialize; -use std::{collections::HashMap, convert::Infallible, time::Duration}; -use tracing::{error, info, instrument, trace, warn}; +use dashmap::{mapref::entry::Entry, DashMap}; +use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicU64, AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; +use tracing::{error, info, instrument, trace}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -18,12 +27,95 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// Both the proxy and the ingestion endpoint will live in the same region (or cell) /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. -#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)] +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { pub endpoint_id: String, pub branch_id: String, } +#[derive(Debug)] +pub struct MetricCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, +} + +impl MetricCounter { + /// Record that some bytes were sent from the proxy to the client + pub fn record_egress(&self, bytes: u64) { + self.transmitted.fetch_add(bytes, Ordering::AcqRel); + } + + /// extract the value that should be reported + fn should_report(self: &Arc) -> Option { + // heuristic to see if the branch is still open + // if a clone happens while we are observing, the heuristic will be incorrect. + // + // Worst case is that we won't report an event for this endpoint. + // However, for the strong count to be 1 it must have occured that at one instant + // all the endpoints were closed, so missing a report because the endpoints are closed is valid. + let is_open = Arc::strong_count(self) > 1; + let opened = self.opened_connections.swap(0, Ordering::AcqRel); + + // update cached metrics eagerly, even if they can't get sent + // (to avoid sending the same metrics twice) + // see the relevant discussion on why to do so even if the status is not success: + // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 + let value = self.transmitted.swap(0, Ordering::AcqRel); + + // Our only requirement is that we report in every interval if there was an open connection + // if there were no opened connections since, then we don't need to report + if value == 0 && !is_open && opened == 0 { + None + } else { + Some(value) + } + } + + /// Determine whether the counter should be cleared from the global map. + fn should_clear(self: &mut Arc) -> bool { + // we can't clear this entry if it's acquired elsewhere + let Some(counter) = Arc::get_mut(self) else { + return false; + }; + let opened = *counter.opened_connections.get_mut(); + let value = *counter.transmitted.get_mut(); + // clear if there's no data to report + value == 0 && opened == 0 + } +} + +// endpoint and branch IDs are not user generated so we don't run the risk of hash-dos +type FastHasher = std::hash::BuildHasherDefault; + +#[derive(Default)] +pub struct Metrics { + endpoints: DashMap, FastHasher>, +} + +impl Metrics { + /// Register a new byte metrics counter for this endpoint + pub fn register(&self, ids: Ids) -> Arc { + let entry = if let Some(entry) = self.endpoints.get(&ids) { + entry.clone() + } else { + self.endpoints + .entry(ids) + .or_insert_with(|| { + Arc::new(MetricCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + + entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry + } +} + +pub static USAGE_METRICS: Lazy = Lazy::new(Metrics::default); + pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result { info!("metrics collector config: {config:?}"); scopeguard::defer! { @@ -31,145 +123,83 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result)> = HashMap::new(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + let mut prev = Utc::now(); let mut ticker = tokio::time::interval(config.interval); loop { ticker.tick().await; - let res = collect_metrics_iteration( + let now = Utc::now(); + collect_metrics_iteration( + &USAGE_METRICS, &http_client, - &mut cached_metrics, &config.endpoint, &hostname, + prev, + now, ) .await; - - match res { - Err(e) => error!("failed to send consumption metrics: {e} "), - Ok(_) => trace!("periodic metrics collection completed successfully"), - } + prev = now; } } -fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { - let mut current_metrics: Vec<(Ids, (u64, DateTime))> = Vec::new(); - let metrics = prometheus::default_registry().gather(); - - for m in metrics { - if m.get_name() == "proxy_io_bytes_per_client" { - for ms in m.get_metric() { - let direction = ms - .get_label() - .iter() - .find(|l| l.get_name() == "direction") - .unwrap() - .get_value(); - - // Only collect metric for outbound traffic - if direction == "tx" { - let endpoint_id = ms - .get_label() - .iter() - .find(|l| l.get_name() == "endpoint_id") - .unwrap() - .get_value(); - let branch_id = ms - .get_label() - .iter() - .find(|l| l.get_name() == "branch_id") - .unwrap() - .get_value(); - - let value = ms.get_counter().get_value() as u64; - - // Report if the metric value is suspiciously large - if value > (1u64 << 40) { - warn!( - "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}", - branch_id, endpoint_id, value - ); - } - - current_metrics.push(( - Ids { - endpoint_id: endpoint_id.to_string(), - branch_id: branch_id.to_string(), - }, - (value, Utc::now()), - )); - } - } - } - } - - current_metrics -} - #[instrument(skip_all)] async fn collect_metrics_iteration( + metrics: &Metrics, client: &http::ClientWithMiddleware, - cached_metrics: &mut HashMap)>, metric_collection_endpoint: &reqwest::Url, hostname: &str, -) -> anyhow::Result<()> { + prev: DateTime, + now: DateTime, +) { info!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", metric_collection_endpoint ); - let current_metrics = gather_proxy_io_bytes_per_client(); + let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec> = current_metrics + let metrics_to_send: Vec<(Ids, u64)> = metrics + .endpoints .iter() - .filter_map(|(curr_key, (curr_val, curr_time))| { - let mut start_time = *curr_time; - let mut value = *curr_val; - - if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) { - // Only send metrics updates if the metric has increased - if curr_val > prev_val { - value = curr_val - prev_val; - start_time = *prev_time; - } else { - if curr_val < prev_val { - error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}", - prev_val, curr_val, curr_key); - } - return None; - } + .filter_map(|counter| { + let key = counter.key().clone(); + let Some(value) = counter.should_report() else { + metrics_to_clear.push(key); + return None; }; - - Some(Event { - kind: EventType::Incremental { - start_time, - stop_time: *curr_time, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value, - extra: Ids { - endpoint_id: curr_key.endpoint_id.clone(), - branch_id: curr_key.branch_id.clone(), - }, - }) + Some((key, value)) }) .collect(); if metrics_to_send.is_empty() { trace!("no new metrics to send"); - return Ok(()); } // Send metrics. // Split into chunks of 1000 metrics to avoid exceeding the max request size for chunk in metrics_to_send.chunks(CHUNK_SIZE) { + let events = chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: Ids { + endpoint_id: ids.endpoint_id.clone(), + branch_id: ids.branch_id.clone(), + }, + }) + .collect(); + let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { - events: chunk.into(), - }) + .json(&EventChunk { events }) .send() .await; @@ -183,34 +213,113 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) { + for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } - // update cached metrics after they were sent - // (to avoid sending the same metrics twice) - // see the relevant discussion on why to do so even if the status is not success: - // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - for send_metric in chunk { - let stop_time = match send_metric.kind { - EventType::Incremental { stop_time, .. } => stop_time, - _ => unreachable!(), - }; + } - cached_metrics - .entry(Ids { - endpoint_id: send_metric.extra.endpoint_id.clone(), - branch_id: send_metric.extra.branch_id.clone(), - }) - // update cached value (add delta) and time - .and_modify(|e| { - e.0 = e.0.saturating_add(send_metric.value); - e.1 = stop_time - }) - // cache new metric - .or_insert((send_metric.value, stop_time)); + for metric in metrics_to_clear { + match metrics.endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} } } - Ok(()) +} + +#[cfg(test)] +mod tests { + use std::{ + net::TcpListener, + sync::{Arc, Mutex}, + }; + + use anyhow::Error; + use chrono::Utc; + use consumption_metrics::{Event, EventChunk}; + use hyper::{ + service::{make_service_fn, service_fn}, + Body, Response, + }; + use url::Url; + + use super::{collect_metrics_iteration, Ids, Metrics}; + use crate::http; + + #[tokio::test] + async fn metrics() { + let listener = TcpListener::bind("0.0.0.0:0").unwrap(); + + let reports = Arc::new(Mutex::new(vec![])); + let reports2 = reports.clone(); + + let server = hyper::server::Server::from_tcp(listener) + .unwrap() + .serve(make_service_fn(move |_| { + let reports = reports.clone(); + async move { + Ok::<_, Error>(service_fn(move |req| { + let reports = reports.clone(); + async move { + let bytes = hyper::body::to_bytes(req.into_body()).await?; + let events: EventChunk<'static, Event> = + serde_json::from_slice(&bytes)?; + reports.lock().unwrap().push(events); + Ok::<_, Error>(Response::new(Body::from(vec![]))) + } + })) + } + })); + let addr = server.local_addr(); + tokio::spawn(server); + + let metrics = Metrics::default(); + let client = http::new_client(); + let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); + let now = Utc::now(); + + // no counters have been registered + collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + let r = std::mem::take(&mut *reports2.lock().unwrap()); + assert!(r.is_empty()); + + // register a new counter + let counter = metrics.register(Ids { + endpoint_id: "e1".to_string(), + branch_id: "b1".to_string(), + }); + + // the counter should be observed despite 0 egress + collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + let r = std::mem::take(&mut *reports2.lock().unwrap()); + assert_eq!(r.len(), 1); + assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events[0].value, 0); + + // record egress + counter.record_egress(1); + + // egress should be observered + collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + let r = std::mem::take(&mut *reports2.lock().unwrap()); + assert_eq!(r.len(), 1); + assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events[0].value, 1); + + // release counter + drop(counter); + + // we do not observe the counter + collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + let r = std::mem::take(&mut *reports2.lock().unwrap()); + assert!(r.is_empty()); + + // counter is unregistered + assert!(metrics.endpoints.is_empty()); + } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f9da145859..71e00ed58f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ use crate::{ compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, + metrics::{Ids, USAGE_METRICS}, protocol2::WithClientIp, stream::{PqStream, Stream}, }; @@ -602,6 +603,11 @@ pub async fn proxy_pass( compute: impl AsyncRead + AsyncWrite + Unpin, aux: &MetricsAuxInfo, ) -> anyhow::Result<()> { + let usage = USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id.to_string(), + branch_id: aux.branch_id.to_string(), + }); + let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx")); let mut client = MeasuredStream::new( client, @@ -609,6 +615,7 @@ pub async fn proxy_pass( |cnt| { // Number of bytes we sent to the client (outbound). m_sent.inc_by(cnt as u64); + usage.record_egress(cnt as u64); }, ); @@ -690,7 +697,14 @@ impl Client<'_, S> { .await { Ok(auth_result) => auth_result, - Err(e) => return stream.throw_error(e).await, + Err(e) => { + let user = creds.get_user(); + let db = params.get("database"); + let app = params.get("application_name"); + let params_span = tracing::info_span!("", ?user, ?db, ?app); + + return stream.throw_error(e).instrument(params_span).await; + } }; let AuthSuccess { diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 1254c4e779..6fbaa08512 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -105,6 +105,8 @@ class NeonCompare(PgCompare): self._pg_bin = pg_bin self.pageserver_http_client = self.env.pageserver.http_client() + # note that neon_simple_env now uses LOCAL_FS remote storage + # Create tenant tenant_conf: Dict[str, str] = {} if False: # TODO add pytest setting for this diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0667403ba3..92e7cd06cd 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -460,9 +460,11 @@ class NeonEnvBuilder: ), "Unexpectedly instantiated from outside a test function" self.test_name = test_name - def init_configs(self) -> NeonEnv: + def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" + if default_remote_storage_if_missing and self.pageserver_remote_storage is None: + self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) self.env = NeonEnv(self) return self.env @@ -470,8 +472,19 @@ class NeonEnvBuilder: assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() - def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv: - env = self.init_configs() + def init_start( + self, + initial_tenant_conf: Optional[Dict[str, str]] = None, + default_remote_storage_if_missing: bool = True, + ) -> NeonEnv: + """ + Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline. + + To avoid creating initial_tenant, call init_configs to setup the environment. + + Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one. + """ + env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing) self.start() # Prepare the default branch to start the postgres on later. @@ -546,7 +559,7 @@ class NeonEnvBuilder: user: RemoteStorageUser, bucket_name: Optional[str] = None, bucket_region: Optional[str] = None, - ) -> Optional[RemoteStorage]: + ) -> RemoteStorage: ret = kind.configure( self.repo_dir, self.mock_s3_server, @@ -889,6 +902,8 @@ def _shared_simple_env( """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES is set, this is shared by all tests using `neon_simple_env`. + + This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ if os.environ.get("TEST_SHARED_FIXTURES") is None: @@ -1481,6 +1496,16 @@ class NeonAttachmentService: self.running = False return self + def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int: + response = requests.post( + f"{self.env.control_plane_api}/attach_hook", + json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id}, + ) + response.raise_for_status() + gen = response.json()["gen"] + assert isinstance(gen, int) + return gen + def __enter__(self) -> "NeonAttachmentService": return self @@ -1689,12 +1714,7 @@ class NeonPageserver(PgProtocol): to call into the pageserver HTTP client. """ if self.env.attachment_service is not None: - response = requests.post( - f"{self.env.control_plane_api}/attach_hook", - json={"tenant_id": str(tenant_id), "pageserver_id": self.id}, - ) - response.raise_for_status() - generation = response.json()["gen"] + generation = self.env.attachment_service.attach_hook(tenant_id, self.id) else: generation = None diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 9373073abf..9fdcd22bc2 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -620,3 +620,8 @@ class PageserverHttpClient(requests.Session): }, ) self.verbose_error(res) + + def deletion_queue_flush(self, execute: bool = False): + self.put( + f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" + ).raise_for_status() diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 2e5d75a0fc..70c2a06a07 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional def list_prefix( - neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None + neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. @@ -287,7 +287,7 @@ def list_prefix( # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. response = remote.client.list_objects_v2( - Delimiter="/", + Delimiter=delimiter, Bucket=remote.bucket_name, Prefix=prefix, ) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index f7cddbc821..535f8c2fe7 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -202,9 +202,6 @@ class RemoteStorageKind(str, enum.Enum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" - # Pass to tests that are generic to remote storage - # to ensure the test pass with or without the remote storage - NOOP = "noop" def configure( self, @@ -215,10 +212,7 @@ class RemoteStorageKind(str, enum.Enum): user: RemoteStorageUser, bucket_name: Optional[str] = None, bucket_region: Optional[str] = None, - ) -> Optional[RemoteStorage]: - if self == RemoteStorageKind.NOOP: - return None - + ) -> RemoteStorage: if self == RemoteStorageKind.LOCAL_FS: return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user)) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index d0462844f0..c80f2d8360 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -4,7 +4,12 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + wait_for_last_flush_lsn, +) from fixtures.types import TenantId, TimelineId @@ -26,17 +31,18 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = [] - for _ in range(4): + for _ in range(3): tenant_id, timeline_id = env.neon_cli.create_tenant() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop() tenant_timelines.append((tenant_id, timeline_id, endpoint)) - # Stop the pageserver + # Stop the pageserver -- this has to be not immediate or we need to wait for uploads env.pageserver.stop() # Leave the first timeline alone, but corrupt the others in different ways @@ -45,30 +51,21 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): (tenant1, timeline1, pg1) = tenant_timelines[1] metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" - f = open(metadata_path, "w") - f.write("overwritten with garbage!") - f.close() + with open(metadata_path, "w") as f: + f.write("overwritten with garbage!") log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") (tenant2, timeline2, pg2) = tenant_timelines[2] timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" - for filename in os.listdir(timeline_path): - if filename.startswith("00000"): - # Looks like a layer file. Remove it - os.remove(f"{timeline_path}/{filename}") - log.info( - f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)" - ) - - (tenant3, timeline3, pg3) = tenant_timelines[3] - timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Corrupt it - f = open(f"{timeline_path}/{filename}", "w") - f.write("overwritten with garbage!") - f.close() - log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled") + p = f"{timeline_path}/{filename}" + size = os.path.getsize(p) + with open(p, "wb") as f: + f.truncate(0) + f.truncate(size) + log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled") env.pageserver.start() @@ -87,22 +84,13 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" ) - # Second timeline has no ancestors, only the metadata file and no layer files locally, - # and we don't have the remote storage enabled. It is loaded into memory, but getting - # the basebackup from it will fail. - with pytest.raises( - Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken" - ) as err: - pg2.start() - log.info(f"As expected, compute startup failed for timeline with missing layers: {err}") - - # Third timeline will also fail during basebackup, because the layer file is corrupt. + # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) with pytest.raises(Exception, match="Failed to load delta layer") as err: - pg3.start() + pg2.start() log.info( - f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}" + f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" ) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 740e489759..d4cf1b4739 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -211,4 +211,12 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.wait() ddl.failures(False) + cur.execute("CREATE DATABASE failure WITH OWNER=cork") + ddl.wait() + with pytest.raises(psycopg2.InternalError): + ddl.failures(True) + cur.execute("DROP DATABASE failure") + ddl.wait() + ddl.pg.connect(dbname="failure") # Ensure we can connect after a failed drop + conn.close() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index ae62fdf4a4..f3f3a1ddf3 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -74,11 +74,13 @@ class EvictionEnv: pgbench_init_lsns: Dict[TenantId, Lsn] def timelines_du(self) -> Tuple[int, int, int]: - return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines]) + return poor_mans_du( + self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False + ) def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]: return { - (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0] + (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0] for tid, tlid in self.timelines } @@ -89,7 +91,21 @@ class EvictionEnv: """ lsn = self.pgbench_init_lsns[tenant_id] with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint: - self.pg_bin.run(["pgbench", "-S", endpoint.connstr()]) + # instead of using pgbench --select-only which does point selects, + # run full table scans for all tables + with endpoint.connect() as conn: + cur = conn.cursor() + + tables_cols = { + "pgbench_accounts": "abalance", + "pgbench_tellers": "tbalance", + "pgbench_branches": "bbalance", + "pgbench_history": "delta", + } + + for table, column in tables_cols.items(): + cur.execute(f"select avg({column}) from {table}") + _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( self, period, max_usage_pct, min_avail_bytes, mock_behavior @@ -127,6 +143,19 @@ class EvictionEnv: self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") +def human_bytes(amt: float) -> str: + suffixes = ["", "Ki", "Mi", "Gi"] + + last = suffixes[-1] + + for name in suffixes: + if amt < 1024 or name == last: + return f"{int(round(amt))} {name}B" + amt = amt / 1024 + + raise RuntimeError("unreachable") + + @pytest.fixture def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: """ @@ -215,8 +244,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): healthy_tenant_id, healthy_timeline_id = env.timelines[1] - broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) - healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + broken_size_pre, _, _ = poor_mans_du( + env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True + ) + healthy_size_pre, _, _ = poor_mans_du( + env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True + ) # try to evict everything, then validate that broken tenant wasn't touched target = broken_size_pre + healthy_size_pre @@ -224,8 +257,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) log.info(f"{response}") - broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) - healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + broken_size_post, _, _ = poor_mans_du( + env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True + ) + healthy_size_post, _, _ = poor_mans_du( + env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True + ) assert broken_size_pre == broken_size_post, "broken tenant should not be touched" assert healthy_size_post < healthy_size_pre @@ -366,18 +403,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): du_by_timeline = env.du_by_timeline() # pick any tenant - [our_tenant, other_tenant] = list(du_by_timeline.keys()) - (tenant_id, timeline_id) = our_tenant + [warm, cold] = list(du_by_timeline.keys()) + (tenant_id, timeline_id) = warm - # make our tenant more recently used than the other one + # make picked tenant more recently used than the other one env.warm_up_tenant(tenant_id) # Build up enough pressure to require evictions from both tenants, # but not enough to fall into global LRU. - # So, set target to all occipied space, except 2*env.layer_size per tenant - target = ( - du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size - ) + # So, set target to all occupied space, except 2*env.layer_size per tenant + target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) log.info(f"{response}") @@ -392,22 +427,33 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" + warm_size = later_du_by_timeline[warm] + + # bounds for warmed_size + warm_lower = 0.5 * du_by_timeline[warm] + + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + warm_upper = warm_lower + 3 * env.layer_size + + cold_size = later_du_by_timeline[cold] + cold_upper = 2 * env.layer_size + + log.info( + f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + ) + log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + + assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" + assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + assert ( - later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant] - ), "our warmed up tenant should be at about half capacity, part 1" - assert ( - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - later_du_by_timeline[our_tenant] - < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size - ), "our warmed up tenant should be at about half capacity, part 2" - assert ( - later_du_by_timeline[other_tenant] < 2 * env.layer_size - ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size" + cold_size < cold_upper + ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" def poor_mans_du( - env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]] + env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False ) -> Tuple[int, int, int]: """ Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples; @@ -430,9 +476,11 @@ def poor_mans_du( smallest_layer = min(smallest_layer, size) else: smallest_layer = size - log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}") + if verbose: + log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})") - log.info(f"{tenant_id}/{timeline_id}: sum {total}") + if verbose: + log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})") total_on_disk += total assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0 diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py new file mode 100644 index 0000000000..81d38ac934 --- /dev/null +++ b/test_runner/regress/test_pageserver_generations.py @@ -0,0 +1,352 @@ +""" + +Tests in this module exercise the pageserver's behavior around generation numbers, +as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require +of the pageserver are: +- Do not start a tenant without a generation number if control_plane_api is set +- Remote objects must be suffixed with generation +- Deletions may only be executed after validating generation +- Updates to remote_consistent_lsn may only be made visible after validating generation +""" + + +import re +import time +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import list_prefix +from fixtures.remote_storage import ( + RemoteStorageKind, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import print_gc_result, wait_until + +# A tenant configuration that is convenient for generating uploads and deletions +# without a large amount of postgres traffic. +TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", +} + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. + """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 20000) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + +def get_metric_or_0(ps_http, metric: str) -> int: + v = ps_http.get_metric_value(metric) + return 0 if v is None else int(v) + + +def get_deletion_queue_executed(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total") + + +def get_deletion_queue_submitted(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total") + + +def get_deletion_queue_dropped(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total") + + +def get_deletion_queue_unexpected_errors(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total") + + +def get_deletion_queue_dropped_lsn_updates(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total") + + +def get_deletion_queue_depth(ps_http) -> int: + """ + Queue depth if at least one deletion has been submitted, else None + """ + submitted = get_deletion_queue_submitted(ps_http) + executed = get_deletion_queue_executed(ps_http) + dropped = get_deletion_queue_dropped(ps_http) + depth = submitted - executed - dropped + log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})") + + assert depth >= 0 + return int(depth) + + +def assert_deletion_queue(ps_http, size_fn) -> None: + v = get_deletion_queue_depth(ps_http) + assert v is not None + assert size_fn(v) is True + + +def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): + """ + Validate behavior when a pageserver is run without generation support enabled, + then started again after activating it: + - Before upgrade, no objects should have generation suffixes + - After upgrade, the bucket should contain a mixture. + - In both cases, postgres I/O should work. + """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + + env = neon_env_builder.init_configs() + env.broker.try_start() + for sk in env.safekeepers: + sk.start() + assert env.attachment_service is not None + env.attachment_service.start() + + env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + + env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline + ) + generate_uploads_and_deletions(env) + + def parse_generation_suffix(key): + m = re.match(".+-([0-9a-zA-Z]{8})$", key) + if m is None: + return None + else: + log.info(f"match: {m}") + log.info(f"group: {m.group(1)}") + return int(m.group(1), 16) + + pre_upgrade_keys = list( + [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + ) + for key in pre_upgrade_keys: + assert parse_generation_suffix(key) is None + + env.pageserver.stop() + + # Starting without the override that disabled control_plane_api + env.pageserver.start() + + generate_uploads_and_deletions(env, init=False) + + legacy_objects: list[str] = [] + suffixed_objects = [] + post_upgrade_keys = list( + [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + ) + for key in post_upgrade_keys: + log.info(f"post-upgrade key: {key}") + if parse_generation_suffix(key) is not None: + suffixed_objects.append(key) + else: + legacy_objects.append(key) + + # Bucket now contains a mixture of suffixed and non-suffixed objects + assert len(suffixed_objects) > 0 + assert len(legacy_objects) > 0 + + assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 + + +def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + some_other_pageserver = 1234 + ps_http = env.pageserver.http_client() + + generate_uploads_and_deletions(env) + + # Flush: pending deletions should all complete + assert_deletion_queue(ps_http, lambda n: n > 0) + ps_http.deletion_queue_flush(execute=True) + assert_deletion_queue(ps_http, lambda n: n == 0) + assert get_deletion_queue_dropped(ps_http) == 0 + + # Our visible remote_consistent_lsn should match projected + timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + + # Now advance the generation in the control plane: subsequent validations + # from the running pageserver will fail. No more deletions should happen. + env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + generate_uploads_and_deletions(env, init=False) + + assert_deletion_queue(ps_http, lambda n: n > 0) + queue_depth_before = get_deletion_queue_depth(ps_http) + executed_before = get_deletion_queue_executed(ps_http) + ps_http.deletion_queue_flush(execute=True) + + # Queue drains to zero because we dropped deletions + assert_deletion_queue(ps_http, lambda n: n == 0) + # The executed counter has not incremented + assert get_deletion_queue_executed(ps_http) == executed_before + # The dropped counter has incremented to consume all of the deletions that were previously enqueued + assert get_deletion_queue_dropped(ps_http) == queue_depth_before + + # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot + # because generation validation fails. + timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"] + assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0 + + # TODO: list bucket and confirm all objects have a generation suffix. + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + + +@pytest.mark.parametrize("keep_attachment", [True, False]) +def test_deletion_queue_recovery( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool +): + """ + :param keep_attachment: If true, we re-attach after restart. Else, we act as if some other + node took the attachment while we were restarting. + """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + ps_http = env.pageserver.http_client() + + # Prevent deletion lists from being executed, to build up some backlog of deletions + ps_http.configure_failpoints( + [ + ("deletion-queue-before-execute", "return"), + ] + ) + + generate_uploads_and_deletions(env) + + # There should be entries in the deletion queue + assert_deletion_queue(ps_http, lambda n: n > 0) + ps_http.deletion_queue_flush() + before_restart_depth = get_deletion_queue_depth(ps_http) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") + env.pageserver.stop(immediate=True) + + if not keep_attachment: + some_other_pageserver = 101010 + assert env.attachment_service is not None + env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + + env.pageserver.start() + + def assert_deletions_submitted(n: int): + assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n + + # After restart, issue a flush to kick the deletion frontend to do recovery. + # It should recover all the operations we submitted before the restart. + ps_http.deletion_queue_flush(execute=False) + wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + + # The queue should drain through completely if we flush it + ps_http.deletion_queue_flush(execute=True) + wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + + if keep_attachment: + # If we kept the attachment, then our pre-restart deletions should have executed + # successfully + assert get_deletion_queue_executed(ps_http) == before_restart_depth + else: + # If we lost the attachment, we should have dropped our pre-restart deletions. + assert get_deletion_queue_dropped(ps_http) == before_restart_depth + env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + # Restart again + env.pageserver.stop(immediate=True) + env.pageserver.start() + + # No deletion lists should be recovered: this demonstrates that deletion lists + # were cleaned up after being executed or dropped in the previous process lifetime. + time.sleep(1) + assert_deletion_queue(ps_http, lambda n: n == 0) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index dae39d2752..b76dbbee03 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -5,7 +5,6 @@ from pathlib import Path from queue import SimpleQueue from typing import Any, Dict, Set -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -17,15 +16,13 @@ from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP + -@pytest.mark.parametrize( - "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS] -) def test_metric_collection( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address, - remote_storage_kind: RemoteStorageKind, ): (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -55,7 +52,7 @@ def test_metric_collection( synthetic_size_calculation_interval="3s" """ - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") @@ -68,6 +65,14 @@ def test_metric_collection( env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) # httpserver is shut down before pageserver during passing run env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") + # we have a fast rate of calculation, these can happen at shutdown + env.pageserver.allowed_errors.append( + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" + ) + env.pageserver.allowed_errors.append( + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) @@ -98,17 +103,14 @@ def test_metric_collection( total += sample[2] return int(total) - remote_uploaded = 0 - # upload some data to remote storage - if remote_storage_kind == RemoteStorageKind.LOCAL_FS: - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - pageserver_http = env.pageserver.http_client() - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) - remote_uploaded = get_num_remote_ops("index", "upload") - assert remote_uploaded > 0 + remote_uploaded = get_num_remote_ops("index", "upload") + assert remote_uploaded > 0 # we expect uploads at 1Hz, on busy runners this could be too optimistic, # so give 5s we only want to get the following upload after "ready" value. @@ -211,6 +213,14 @@ def test_metric_collection_cleans_up_tempfile( # httpserver is shut down before pageserver during passing run env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") + # we have a fast rate of calculation, these can happen at shutdown + env.pageserver.allowed_errors.append( + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" + ) + env.pageserver.allowed_errors.append( + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + ) + tenant_id = env.initial_tenant timeline_id = env.initial_timeline endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c6ddb54ee6..3a56ca51a6 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -30,9 +30,7 @@ from fixtures.types import TenantId from fixtures.utils import run_pg_bench_small -@pytest.mark.parametrize( - "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()] -) +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_tenant_delete_smoke( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -43,6 +41,12 @@ def test_tenant_delete_smoke( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + ] + ) # lucky race with stopping from flushing a layer we fail to schedule any uploads env.pageserver.allowed_errors.append( @@ -138,18 +142,12 @@ FAILPOINTS_BEFORE_BACKGROUND = [ def combinations(): result = [] - remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + remotes = [RemoteStorageKind.MOCK_S3] if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): remotes.append(RemoteStorageKind.REAL_S3) for remote_storage_kind in remotes: for delete_failpoint in FAILPOINTS: - if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in ( - "timeline-delete-before-index-delete", - ): - # the above failpoint are not relevant for config without remote storage - continue - # Simulate failures for only one type of remote storage # to avoid log pollution and make tests run faster if remote_storage_kind is RemoteStorageKind.MOCK_S3: @@ -195,27 +193,32 @@ def test_delete_tenant_exercise_crash_safety_failpoints( ] ) + if simulate_failures: + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + ] + ) + ps_http = env.pageserver.http_client() timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: # generate enough layers run_pg_bench_small(pg_bin, endpoint.connstr()) - if remote_storage_kind is RemoteStorageKind.NOOP: - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - else: - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(tenant_id), - ) - ), - ) + if remote_storage_kind in available_s3_storages(): + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) ps_http.configure_failpoints((failpoint, "return")) @@ -246,12 +249,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( env.pageserver.stop() env.pageserver.start() - if ( - remote_storage_kind is RemoteStorageKind.NOOP - and failpoint == "tenant-delete-before-create-local-mark" - ): - tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - elif failpoint in ( + if failpoint in ( "tenant-delete-before-shutdown", "tenant-delete-before-create-remote-mark", ): @@ -383,6 +381,7 @@ def test_tenant_delete_is_resumed_on_attach( assert not tenant_path.exists() if remote_storage_kind in available_s3_storages(): + ps_http.deletion_queue_flush(execute=True) assert_prefix_empty( neon_env_builder, prefix="/".join( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8ccbcf551d..a20523b1f3 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -519,11 +519,8 @@ def test_detach_while_attaching( # * restart the pageserver and verify that ignored tenant is still not loaded # * `load` the same tenant # * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines -@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]) -def test_ignored_tenant_reattach( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) +def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 49a6ca5a53..7cea301a9c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -15,7 +15,7 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) -from fixtures.pg_version import PgVersion, xfail_on_postgres +from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId @@ -532,7 +532,24 @@ def test_single_branch_get_tenant_size_grows( assert size_after == prev, "size after restarting pageserver should not have changed" -@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15") +def assert_size_approx_equal(size_a, size_b): + """ + Tests that evaluate sizes are checking the pageserver space consumption + that sits many layers below the user input. The exact space needed + varies slightly depending on postgres behavior. + + Rather than expecting postgres to be determinstic and occasionally + failing the test, we permit sizes for the same data to vary by a few pages. + """ + + # Determined empirically from examples of equality failures: they differ + # by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid + # failing on outliers from that observed range. + threshold = 4 * 8272 + + assert size_a == pytest.approx(size_b, abs=threshold) + + def test_get_tenant_size_with_multiple_branches( neon_env_builder: NeonEnvBuilder, test_output_dir: Path ): @@ -573,7 +590,7 @@ def test_get_tenant_size_with_multiple_branches( ) size_after_first_branch = http_client.tenant_size(tenant_id) - assert size_after_first_branch == size_at_branch + assert_size_approx_equal(size_after_first_branch, size_at_branch) first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id) @@ -599,7 +616,7 @@ def test_get_tenant_size_with_multiple_branches( "second-branch", main_branch_name, tenant_id ) size_after_second_branch = http_client.tenant_size(tenant_id) - assert size_after_second_branch == size_after_continuing_on_main + assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main) second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id) @@ -635,7 +652,7 @@ def test_get_tenant_size_with_multiple_branches( # tenant_size but so far this has been reliable, even though at least gc # and tenant_size race for the same locks size_after = http_client.tenant_size(tenant_id) - assert size_after == size_after_thinning_branch + assert_size_approx_equal(size_after, size_after_thinning_branch) size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w") size_debug = http_client.tenant_size_debug(tenant_id) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 677c0d18e8..40dff194aa 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -12,7 +12,6 @@ from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, PAGESERVER_PER_TENANT_METRICS, - PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, parse_metrics, ) from fixtures.neon_fixtures import ( @@ -232,17 +231,10 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): assert value -@pytest.mark.parametrize( - "remote_storage_kind", - # exercise both the code paths where remote_storage=None and remote_storage=Some(...) - [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3], -) -def test_pageserver_metrics_removed_after_detach( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): +def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) neon_env_builder.num_safekeepers = 3 @@ -282,9 +274,6 @@ def test_pageserver_metrics_removed_after_detach( for tenant in [tenant_1, tenant_2]: pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) expected = set(PAGESERVER_PER_TENANT_METRICS) - if remote_storage_kind == RemoteStorageKind.NOOP: - # if there's no remote storage configured, we don't expose the remote timeline client metrics - expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS) assert pre_detach_samples == expected env.pageserver.http_client().tenant_detach(tenant) @@ -294,9 +283,7 @@ def test_pageserver_metrics_removed_after_detach( # Check that empty tenants work with or without the remote storage -@pytest.mark.parametrize( - "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP] -) +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_pageserver_with_empty_tenants( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 0e4df21d83..3af144c31c 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -12,7 +12,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, last_flush_lsn_upload, - wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( @@ -145,19 +144,12 @@ DELETE_FAILPOINTS = [ def combinations(): result = [] - remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + remotes = [RemoteStorageKind.MOCK_S3] if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): remotes.append(RemoteStorageKind.REAL_S3) for remote_storage_kind in remotes: for delete_failpoint in DELETE_FAILPOINTS: - if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in ( - "timeline-delete-before-index-delete", - "timeline-delete-after-index-delete", - ): - # the above failpoints are not relevant for config without remote storage - continue - result.append((remote_storage_kind, delete_failpoint)) return result @@ -205,23 +197,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints( with env.endpoints.create_start("delete") as endpoint: # generate enough layers run_pg_bench_small(pg_bin, endpoint.connstr()) - if remote_storage_kind is RemoteStorageKind.NOOP: - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id) - else: - last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) - if remote_storage_kind in available_s3_storages(): - assert_prefix_not_empty( - neon_env_builder, - prefix="/".join( - ( - "tenants", - str(env.initial_tenant), - "timelines", - str(timeline_id), - ) - ), - ) + last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) + + if remote_storage_kind in available_s3_storages(): + assert_prefix_not_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") # It appears when we stopped flush loop during deletion and then pageserver is stopped @@ -807,6 +797,8 @@ def test_delete_orphaned_objects( reason = timeline_info["state"]["Broken"]["reason"] assert reason.endswith(f"failpoint: {failpoint}"), reason + ps_http.deletion_queue_flush(execute=True) + for orphan in orphans: assert not orphan.exists() assert env.pageserver.log_contains( diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f856b26c6e..c2e93c48c7 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -301,12 +301,8 @@ def test_timeline_initial_logical_size_calculation_cancellation( # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_timeline_physical_size_init( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] -): - if remote_storage_kind is not None: - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) +def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -337,17 +333,12 @@ def test_timeline_physical_size_init( ) assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), - remote_storage_kind, + get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_timeline_physical_size_post_checkpoint( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] -): - if remote_storage_kind is not None: - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) +def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -369,19 +360,14 @@ def test_timeline_physical_size_post_checkpoint( def check(): assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), - remote_storage_kind, + get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) wait_until(10, 1, check) -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_timeline_physical_size_post_compaction( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] -): - if remote_storage_kind is not None: - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) +def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed @@ -420,21 +406,15 @@ def test_timeline_physical_size_post_compaction( pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) - if remote_storage_kind is not None: - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), - remote_storage_kind, + get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_timeline_physical_size_post_gc( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] -): - if remote_storage_kind is not None: - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) +def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed @@ -471,12 +451,10 @@ def test_timeline_physical_size_post_gc( pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) - if remote_storage_kind is not None: - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), - remote_storage_kind, + get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) @@ -560,14 +538,10 @@ def test_timeline_size_metrics( assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024) -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_tenant_physical_size( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] -): +def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder): random.seed(100) - if remote_storage_kind is not None: - neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -575,12 +549,10 @@ def test_tenant_physical_size( client = env.pageserver.http_client() tenant, timeline = env.neon_cli.create_tenant() - if remote_storage_kind is not None: - wait_for_upload_queue_empty(pageserver_http, tenant, timeline) def get_timeline_resident_physical_size(timeline: TimelineId): - sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind) - assert_physical_size_invariants(sizes, remote_storage_kind) + sizes = get_physical_size_values(env, tenant, timeline) + assert_physical_size_invariants(sizes) return sizes.prometheus_resident_physical timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline) @@ -600,8 +572,7 @@ def test_tenant_physical_size( wait_for_last_flush_lsn(env, endpoint, tenant, timeline) pageserver_http.timeline_checkpoint(tenant, timeline) - if remote_storage_kind is not None: - wait_for_upload_queue_empty(pageserver_http, tenant, timeline) + wait_for_upload_queue_empty(pageserver_http, tenant, timeline) timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) @@ -630,7 +601,6 @@ def get_physical_size_values( env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId, - remote_storage_kind: Optional[RemoteStorageKind], ) -> TimelinePhysicalSizeValues: res = TimelinePhysicalSizeValues() @@ -646,12 +616,9 @@ def get_physical_size_values( res.prometheus_resident_physical = metrics.query_one( "pageserver_resident_physical_size", metrics_filter ).value - if remote_storage_kind is not None: - res.prometheus_remote_physical = metrics.query_one( - "pageserver_remote_physical_size", metrics_filter - ).value - else: - res.prometheus_remote_physical = None + res.prometheus_remote_physical = metrics.query_one( + "pageserver_remote_physical_size", metrics_filter + ).value detail = client.timeline_detail( tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True @@ -664,20 +631,15 @@ def get_physical_size_values( return res -def assert_physical_size_invariants( - sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind] -): +def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # resident phyiscal size is defined as assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum # we don't do layer eviction, so, all layers are resident assert sizes.api_current_physical == sizes.prometheus_resident_physical - if remote_storage_kind is not None: - assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical - # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS - else: - assert sizes.prometheus_remote_physical is None + assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical + # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS # Timeline logical size initialization is an asynchronous background task that runs once,