diff --git a/CODEOWNERS b/CODEOWNERS
index b8ca54bc7e..e384dc39f1 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -5,7 +5,7 @@
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers
diff --git a/Cargo.lock b/Cargo.lock
index 2055f001af..36e7069eb1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1780,18 +1780,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
-version = "0.2.6"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "hermit-abi"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "hex"
@@ -2053,7 +2044,7 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
"libc",
"windows-sys 0.48.0",
]
@@ -2070,7 +2061,7 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
"io-lifetimes",
"rustix 0.37.19",
"windows-sys 0.48.0",
@@ -2444,11 +2435,11 @@ dependencies = [
[[package]]
name = "num_cpus"
-version = "1.15.0"
+version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
+checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
- "hermit-abi 0.2.6",
+ "hermit-abi",
"libc",
]
@@ -3246,6 +3237,7 @@ dependencies = [
"reqwest-tracing",
"routerify",
"rstest",
+ "rustc-hash",
"rustls",
"rustls-pemfile",
"scopeguard",
@@ -3417,6 +3409,7 @@ dependencies = [
"metrics",
"once_cell",
"pin-project-lite",
+ "rand",
"scopeguard",
"serde",
"serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 4fe3069822..b0bcf69039 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,6 +107,7 @@ reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
+rustc-hash = "1.1.0"
rustls = "0.21"
rustls-pemfile = "1"
rustls-split = "0.3"
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 55eb9b7411..7e34b66d68 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -615,11 +615,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
#########################################################################################
#
# Layer "rust extensions"
-# This layer is used to build `pgx` deps
-#
-# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
-# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
-# dependency on all the rust extension that depend on it, too.
+# This layer is used to build `pgrx` deps
#
#########################################################################################
FROM build-deps AS rust-extensions-build
@@ -635,22 +631,12 @@ USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
-RUN case "${PG_VERSION}" in \
- "v14" | "v15") \
- ;; \
- "v16") \
- echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
- ;; \
- *) \
- echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
- ;; \
- esac && \
- curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
- cargo install --locked --version 0.7.3 cargo-pgx && \
- /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+ cargo install --locked --version 0.10.2 cargo-pgrx && \
+ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
@@ -664,23 +650,11 @@ USER root
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
-# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
-# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN case "${PG_VERSION}" in \
- "v14" | "v15") \
- ;; \
- "v16") \
- echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
- ;; \
- *) \
- echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
- ;; \
- esac && \
- wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
- echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
+ echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
- sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
- cargo pgx install --release && \
+ sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+ cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
#########################################################################################
@@ -693,26 +667,11 @@ RUN case "${PG_VERSION}" in \
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
-# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
-# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in
-# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
-# same 1.1 version we've used before.
-RUN case "${PG_VERSION}" in \
- "v14" | "v15") \
- ;; \
- "v16") \
- echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
- ;; \
- *) \
- echo "unexpected PostgreSQL version" && exit 1 \
- ;; \
- esac && \
- wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
- echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
+ echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
- sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
- sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
- cargo pgx install --release && \
+ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+ cargo pgrx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -727,21 +686,11 @@ RUN case "${PG_VERSION}" in \
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
-# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN case "${PG_VERSION}" in \
- "v14" | "v15") \
- ;; \
- "v16") \
- echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
- ;; \
- *) \
- echo "unexpected PostgreSQL version" && exit 1 \
- ;; \
- esac && \
- wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
- echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
- cargo pgx install --release && \
+ cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
@@ -754,21 +703,15 @@ RUN case "${PG_VERSION}" in \
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
-RUN case "${PG_VERSION}" in \
- "v14" | "v15") \
- ;; \
- "v16") \
- echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
- ;; \
- *) \
- echo "unexpected PostgreSQL version" && exit 1 \
- ;; \
- esac && \
- wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
- echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
+ echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
- sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
- cargo pgx install --release && \
+ echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
+ wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+ patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+ echo "********************************************************************************************************" && \
+ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+ cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs
index e879646b63..d4bca59c7b 100644
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request
) -> Result, Ap
if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1;
}
+ tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?;
diff --git a/deny.toml b/deny.toml
index 55c581ce3a..f4ea0d4dac 100644
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
-ignore = ["RUSTSEC-2023-0052"]
+ignore = []
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md
new file mode 100644
index 0000000000..f708f641aa
--- /dev/null
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -0,0 +1,599 @@
+# Seamless tenant migration
+
+- Author: john@neon.tech
+- Created on 2023-08-11
+- Implemented on ..
+
+## Summary
+
+The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
+migration safe". Following that,
+this RFC is about how those migrations are to be done:
+
+1. Seamlessly (without interruption to client availability)
+2. Quickly (enabling faster operations)
+3. Efficiently (minimizing I/O and $ cost)
+
+These points are in priority order: if we have to sacrifice
+efficiency to make a migration seamless for clients, we will
+do so, etc.
+
+This is accomplished by introducing two high level changes:
+
+- A dual-attached state for tenants, used in a control-plane-orchestrated
+ migration procedure that preserves availability during a migration.
+- Warm secondary locations for tenants, where on-disk content is primed
+ for a fast migration of the tenant from its current attachment to this
+ secondary location.
+
+## Motivation
+
+Migrating tenants between pageservers is essential to operating a service
+at scale, in several contexts:
+
+1. Responding to a pageserver node failure by migrating tenants to other pageservers
+2. Balancing load and capacity across pageservers, for example when a user expands their
+ database and they need to migrate to a pageserver with more capacity.
+3. Restarting pageservers for upgrades and maintenance
+
+The current situation steps for migration are:
+
+- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
+- attach to new node
+- re-configure endpoints to use the new node
+
+Once [generation numbers](025-generation-numbers.md) are implemented,
+the detach step is no longer critical for correctness. So, we can
+
+- attach to a new node,
+- re-configure endpoints to use the new node, and then
+- detach from the old node.
+
+However, this still does not meet our seamless/fast/efficient goals:
+
+- Not fast: The new node will have to download potentially large amounts
+ of data from S3, which may take many minutes.
+- Not seamless: If we attach to a new pageserver before detaching an old one,
+ the new one might delete some objects that interrupt availability of reads on the old one.
+- Not efficient: the old pageserver will continue uploading
+ S3 content during the migration that will never be read.
+
+The user expectations for availability are:
+
+- For planned maintenance, there should be zero availability
+ gap. This expectation is fulfilled by this RFC.
+- For unplanned changes (e.g. node failures), there should be
+ minimal availability gap. This RFC provides the _mechanism_
+ to fail over quickly, but does not provide the failure _detection_
+ nor failover _policy_.
+
+## Non Goals
+
+- Defining service tiers with different storage strategies: the same
+ level of HA & overhead will apply to all tenants. This doesn't rule out
+ adding such tiers in future.
+- Enabling pageserver failover in the absence of a control plane: the control
+ plane will remain the source of truth for what should be attached where.
+- Totally avoiding availability gaps on unplanned migrations during
+ a failure (we expect a small, bounded window of
+ read unavailability of very recent LSNs)
+- Workload balancing: this RFC defines the mechanism for moving tenants
+ around, not the higher level logic for deciding who goes where.
+- Defining all possible configuration flows for tenants: the migration process
+ defined in this RFC demonstrates the sufficiency of the pageserver API, but
+ is not the only kind of configuration change the control plane will ever do.
+ The APIs defined here should let the control plane move tenants around in
+ whatever way is needed while preserving data safety and read availability.
+
+## Impacted components
+
+Pageserver, control plane
+
+## Terminology
+
+- **Attachment**: a tenant is _attached_ to a pageserver if it has
+ been issued a generation number, and is running an instance of
+ the `Tenant` type, ingesting the WAL, and available to serve
+ page reads.
+- **Location**: locations are a superset of attachments. A location
+ is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
+
+- **Secondary location**: a location which is not currently attached.
+- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
+
+## Implementation (high level)
+
+### Warm secondary locations
+
+To enable faster migrations, we will identify at least one _secondary location_
+for each tenant. This secondary location will keep a warm cache of layers
+for the tenant, so that if it is later attached, it can catch up with the
+latest LSN quickly: rather than downloading everything, it only has to replay
+the recent part of the WAL to advance from the remote_consistent_offset to the
+most recent LSN in the WAL.
+
+The control plane is responsible for selecting secondary locations, and
+calling into pageservers to configure tenants into a secondary mode at this
+new location, as well as attaching the tenant in its existing primary location.
+
+The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
+to advise secondaries of which layers should be downloaded.
+
+### Location modes
+
+Currently, we consider a tenant to be in one of two states on a pageserver:
+
+- Attached: active `Tenant` object, and layers on local disk
+- Detached: no layers on local disk, no runtime state.
+
+We will extend this with finer-grained modes, whose purpose will become
+clear in later sections:
+
+- **AttachedSingle**: equivalent the existing attached state.
+- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
+ does not do deletions.
+- **AttachedStale**: like AttachedSingle, holds a stale generation,
+ do not do any remote storage operations.
+- **Secondary**: keep local state on disk, periodically update from S3.
+- **Detached**: equivalent to existing detached state.
+
+To control these finer grained states, a new pageserver API endpoint will be added.
+
+### Cutover procedure
+
+Define old location and new location as "Node A" and "Node B". Consider
+the case where both nodes are available, and Node B was previously configured
+as a secondary location for the tenant we are migrating.
+
+The cutover procedure is orchestrated by the control plane, calling into
+the pageservers' APIs:
+
+1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
+2. Increment generation, and call to Node B requesting it to enter AttachedMulti
+ state with the new generation.
+3. Call to Node B, requesting it to download the latest hot layers from remote storage,
+ according to the latest heatmap flushed by Node A.
+4. Wait for Node B's WAL ingestion to catch up with node A's
+5. Update endpoints to use node B instead of node A
+6. Call to node B requesting it to enter state AttachedSingle.
+7. Call to node A requesting it to enter state Secondary
+
+The following table summarizes how the state of the system advances:
+
+| Step | Node A | Node B | Node used by endpoints |
+| :-----------: | :------------: | :------------: | :--------------------: |
+| 1 (_initial_) | AttachedSingle | Secondary | A |
+| 2 | AttachedStale | AttachedMulti | A |
+| 3 | AttachedStale | AttachedMulti | A |
+| 4 | AttachedStale | AttachedMulti | A |
+| 5 (_cutover_) | AttachedStale | AttachedMulti | B |
+| 6 | AttachedStale | AttachedSingle | B |
+| 7 (_final_) | Secondary | AttachedSingle | B |
+
+The procedure described for a clean handover from a live node to a secondary
+is also used for failure cases and for migrations to a location that is not
+configured as a secondary, by simply skipping irrelevant steps, as described in
+the following sections.
+
+#### Migration from an unresponsive node
+
+If node A is unavailable, then all calls into
+node A are skipped and we don't wait for B to catch up before
+switching updating the endpoints to use B.
+
+#### Migration to a location that is not a secondary
+
+If node B is initially in Detached state, the procedure is identical. Since Node B
+is coming from a Detached state rather than Secondary, the download of layers and
+catch up with WAL will take much longer.
+
+We might do this if:
+
+- Attached and secondary locations are both critically low on disk, and we need
+ to migrate to a third node with more resources available.
+- We are migrating a tenant which does not use secondary locations to save on cost.
+
+#### Permanent migration away from a node
+
+In the final step of the migration, we generally request the original node to enter a Secondary
+state. This is typical if we are doing a planned migration during maintenance, or to
+balance CPU/network load away from a node.
+
+One might also want to permanently migrate away: this can be done by simply removing the secondary
+location after the migration is complete, or as an optimization by substituting the Detached state
+for the Secondary state in the final step.
+
+#### Cutover diagram
+
+```mermaid
+sequenceDiagram
+participant CP as Control plane
+participant A as Node A
+participant B as Node B
+participant E as Endpoint
+
+CP->>A: PUT Flush & go to AttachedStale
+note right of A: A continues to ingest WAL
+CP->>B: PUT AttachedMulti
+CP->>B: PUT Download layers from latest heatmap
+note right of B: B downloads from S3
+loop Poll until download complete
+CP->>B: GET download status
+end
+activate B
+note right of B: B ingests WAL
+loop Poll until catch up
+CP->>B: GET visible WAL
+CP->>A: GET visible WAL
+end
+deactivate B
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+CP->>B: PUT AttachedSingle
+CP->>A: PUT Secondary
+```
+
+#### Cutover from an unavailable pageserver
+
+This case is far simpler: we may skip straight to our intended
+end state.
+
+```mermaid
+sequenceDiagram
+participant A as Node A
+participant CP as Control plane
+participant B as Node B
+participant E as Endpoint
+
+note right of A: Node A offline
+activate A
+CP->>B: PUT AttachedSingle
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+deactivate A
+```
+
+## Implementation (detail)
+
+### Purpose of AttachedMulti, AttachedStale
+
+#### AttachedMulti
+
+Ordinarily, an attached pageserver whose generation is the latest may delete
+layers at will (e.g. during compaction). If a previous generation pageserver
+is also still attached, and in use by endpoints, then this layer deletion could
+lead to a loss of availability for the endpoint when reading from the previous
+generation pageserver.
+
+The _AttachedMulti_ state simply disables deletions. These will be enqueued
+in `RemoteTimelineClient` until the control plane transitions the
+node into AttachedSingle, which unblocks deletions. Other remote storage operations
+such as uploads are not blocked.
+
+AttachedMulti is not required for data safety, only to preserve availability
+on pageservers running with stale generations.
+
+A node enters AttachedMulti only when explicitly asked to by the control plane. It should
+only remain in this state for the duration of a migration.
+
+If a control plane bug leaves
+the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
+deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
+threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
+it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
+by timeline deletion.
+
+During AttachedMulti, the Tenant is free to drop layers from local disk in response to
+disk pressure: only the deletion of remote layers is blocked.
+
+#### AttachedStale
+
+Currently, a pageserver with a stale generation number will continue to
+upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
+will not be read back by future generations of pageservers.
+
+The _AttachedStale_ state disables S3 uploads. The stale pageserver
+will continue to ingest the WAL and write layers to local disk, but not to
+do any uploads to S3.
+
+A node may enter AttachedStale in two ways:
+
+- Explicitly, when control plane calls into the node at the start of a migration.
+- Implicitly, when the node tries to validate some deletions and discovers
+ that its generation is stale.
+
+The AttachedStale state also disables sending consumption metrics from
+that location: it is interpreted as an indication that some other pageserver
+is already attached or is about to be attached, and that new pageserver will
+be responsible for sending consumption metrics.
+
+#### Disk Pressure & AttachedStale
+
+Over long periods of time, a tenant location in AttachedStale will accumulate data
+on local disk, as it cannot evict any layers written since it entered the
+AttachStale state. We rely on the control plane to revert the location to
+Secondary or Detached at the end of a migration.
+
+This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
+since _all_ the attached tenants will go into AttachedStale, we will be doing no
+uploads at all, therefore ingested data will cause disk usage to increase continuously.
+Under nominal conditions, the available disk space on pageservers should be sufficient
+to complete the evacuation before this becomes a problem, but we must also handle
+the case where we hit a low disk situation while in this state.
+
+The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
+touches each Tenant when it determines that a low-disk condition requires
+some layer eviction. Having selected layers for eviction, the eviction
+task calls `Timeline::evict_layers`.
+
+**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
+layers are not yet uploaded to S3, then the block on uploads will be lifted. This
+will result in leaking some objects once a migration is complete, but will enable
+the node to manage its disk space properly: if a node is left with some tenants
+in AttachedStale indefinitely due to a network partition or control plane bug,
+these tenants will not cause a full disk condition.
+
+### Warm secondary updates
+
+#### Layer heatmap
+
+The secondary location's job is to serve reads **with the same quality of service as the original location
+was serving them around the time of a migration**. This does not mean the secondary
+location needs the whole set of layers: inactive layers that might soon
+be evicted on the attached pageserver need not be downloaded by the
+secondary. A totally idle tenant only needs to maintain enough on-disk
+state to enable a fast cold start (i.e. the most recent image layers are
+typically sufficient).
+
+To enable this, we introduce the concept of a _layer heatmap_, which
+acts as an advisory input to secondary locations to decide which
+layers to download from S3.
+
+#### Attached pageserver
+
+The attached pageserver, if in state AttachedSingle, periodically
+uploads a serialized heat map to S3. It may skip this if there
+is no change since the last time it uploaded (e.g. if the tenant
+is totally idle).
+
+Additionally, when the tenant is flushed to remote storage prior to a migration
+(the first step in [cutover procedure](#cutover-procedure)),
+the heatmap is written out. This enables a future attached pageserver
+to get an up to date view when deciding which layers to download.
+
+#### Secondary location behavior
+
+Secondary warm locations run a simple loop, implemented separately from
+the main `Tenant` type, which represents attached tenants:
+
+- Download the layer heatmap
+- Select any "hot enough" layers to download, if there is sufficient
+ free disk space.
+- Download layers, if they were not previously evicted (see below)
+- Download the latest index_part.json
+- Check if any layers currently on disk are no longer referenced by
+ IndexPart & delete them
+
+Note that the heatmap is only advisory: if a secondary location has plenty
+of disk space, it may choose to retain layers that aren't referenced
+by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
+if a node is very low on disk space, it might opt to raise the heat threshold required
+to both downloading a layer, until more disk space is available.
+
+#### Secondary locations & disk pressure
+
+Secondary locations are subject to eviction on disk pressure, just as
+attached locations are. For eviction purposes, the access time of a
+layer in a secondary location will be the access time given in the heatmap,
+rather than the literal time at which the local layer file was accessed.
+
+The heatmap will indicate which layers are in local storage on the attached
+location. The secondary will always attempt to get back to having that
+set of layers on disk, but to avoid flapping, it will remember the access
+time of the layer it was most recently asked to evict, and layers whose
+access time is below that will not be re-downloaded.
+
+The resulting behavior is that after a layer is evicted from a secondary
+location, it is only re-downloaded once the attached pageserver accesses
+the layer and uploads a heatmap reflecting that access time. On a pageserver
+restart, the secondary location will attempt to download all layers in
+the heatmap again, if they are not on local disk.
+
+This behavior will be slightly different when secondary locations are
+used for "low energy tenants", but that is beyond the scope of this RFC.
+
+### Location configuration API
+
+Currently, the `/tenant//config` API defines various
+tunables like compaction settings, which apply to the tenant irrespective
+of which pageserver it is running on.
+
+A new "location config" structure will be introduced, which defines
+configuration which is per-tenant, but local to a particular pageserver,
+such as the attachment mode and whether it is a secondary.
+
+The pageserver will expose a new per-tenant API for setting
+the state: `/tenant//location/config`.
+
+Body content:
+
+```
+{
+ state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
+ generation: Option,
+ configuration: `Option`
+ flush: bool
+}
+```
+
+Existing `/attach` and `/detach` endpoint will have the same
+behavior as calling `/location/config` with `AttachedSingle` and `Detached`
+states respectively. These endpoints will be deprecated and later
+removed.
+
+The generation attribute is mandatory for entering `AttachedSingle` or
+`AttachedMulti`.
+
+The configuration attribute is mandatory when entering any state other
+than `Detached`. This configuration is the same as the body for
+the existing `/tenant//config` endpoint.
+
+The `flush` argument indicates whether the pageservers should flush
+to S3 before proceeding: this only has any effect if the node is
+currently in AttachedSingle or AttachedMulti. This is used
+during the first phase of migration, when transitioning the
+old pageserver to AttachedSingle.
+
+The `/re-attach` API response will be extended to include a `state` as
+well as a `generation`, enabling the pageserver to enter the
+correct state for each tenant on startup.
+
+### Database schema for locations
+
+A new table `ProjectLocation`:
+
+- pageserver_id: int
+- tenant_id: TenantId
+- generation: Option
+- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
+
+Notes:
+
+- It is legacy for a Project to have zero `ProjectLocation`s
+- The `pageserver` column in `Project` now means "to which pageserver should
+ endpoints connect", rather than simply which pageserver is attached.
+- The `generation` column in `Project` remains, and is incremented and used
+ to set the generation of `ProjectLocation` rows when they are set into
+ an attached state.
+- The `Detached` state is implicitly represented as the absence of
+ a `ProjectLocation`.
+
+### Executing migrations
+
+Migrations will be implemented as Go functions, within the
+existing `Operation` framework in the control plane. These
+operations are persistent, such that they will always keep
+trying until completion: this property is important to avoid
+leaving garbage behind on pageservers, such as AttachedStale
+locations.
+
+### Recovery from failures during migration
+
+During migration, the control plane may encounter failures of either
+the original or new pageserver, or both:
+
+- If the original fails, skip past waiting for the new pageserver
+ to catch up, and put it into AttachedSingle immediately.
+- If the new node fails, put the old pageserver into Secondary
+ and then back into AttachedSingle (this has the effect of
+ retaining on-disk state and granting it a fresh generation number).
+- If both nodes fail, keep trying until one of them is available
+ again.
+
+### Control plane -> Pageserver reconciliation
+
+A migration may be done while the old node is unavailable,
+in which case the old node may still be running in an AttachedStale
+state.
+
+In this case, it is undesirable to have the migration `Operation`
+stay alive until the old node eventually comes back online
+and can be cleaned up. To handle this, the control plane
+should run a background reconciliation process to compare
+a pageserver's attachments with the database, and clean up
+any that shouldn't be there any more.
+
+Note that there will be no work to do if the old node was really
+offline, as during startup it will call into `/re-attach` and
+be updated that way. The reconciliation will only be needed
+if the node was unavailable but still running.
+
+## Alternatives considered
+
+### Only enabling secondary locations for tenants on a higher service tier
+
+This will make sense in future, especially for tiny databases that may be
+downloaded from S3 in milliseconds when needed.
+
+However, it is not wise to do it immediately, because pageservers contain
+a mixture of higher and lower tier workloads. If we had 1 tenant with
+a secondary location and 9 without, then those other 9 tenants will do
+a lot of I/O as they try to recover from S3, which may degrade the
+service of the tenant which had a secondary location.
+
+Until we segregate tenant on different service tiers on different pageserver
+nodes, or implement & test QoS to ensure that tenants with secondaries are
+not harmed by tenants without, we should use the same failover approach
+for all the tenants.
+
+### Hot secondary locations (continuous WAL replay)
+
+Instead of secondary locations populating their caches from S3, we could
+have them consume the WAL from safekeepers. The downsides of this would be:
+
+- Double load on safekeepers, which are a less scalable service than S3
+- Secondary locations' on-disk state would end up subtly different to
+ the remote state, which would make synchronizing with S3 more complex/expensive
+ when going into attached state.
+
+The downside of only updating secondary locations from S3 is that we will
+have a delay during migration from replaying the LSN range between what's
+in S3 and what's in the pageserver. This range will be very small on
+planned migrations, as we have the old pageserver flush to S3 immediately
+before attaching the new pageserver. On unplanned migrations (old pageserver
+is unavailable), the range of LSNs to replay is bounded by the flush frequency
+on the old pageserver. However, the migration doesn't have to wait for the
+replay: it's just that not-yet-replayed LSNs will be unavailable for read
+until the new pageserver catches up.
+
+We expect that pageserver reads of the most recent LSNs will be relatively
+rare, as for an active endpoint those pages will usually still be in the postgres
+page cache: this leads us to prefer synchronizing from S3 on secondary
+locations, rather than consuming the WAL from safekeepers.
+
+### Cold secondary locations
+
+It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
+we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
+
+Warm caches on secondary locations are necessary to meet
+our availability goals.
+
+### Pageserver-granularity failover
+
+Instead of migrating tenants individually, we could have entire spare nodes,
+and on a node death, move all its work to one of these spares.
+
+This approach is avoided for several reasons:
+
+- we would still need fine-grained tenant migration for other
+ purposes such as balancing load
+- by sharing the spare capacity over many peers rather than one spare node,
+ these peers may use the capacity for other purposes, until it is needed
+ to handle migrated tenants. e.g. for keeping a deeper cache of their
+ attached tenants.
+
+### Readonly during migration
+
+We could simplify migrations by making both previous and new nodes go into a
+readonly state, then flush remote content from the previous node, then activate
+attachment on the secondary node.
+
+The downside to this approach is a potentially large gap in readability of
+recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
+to incur the extra cost of double-replaying the WAL onto old and new nodes' local
+storage during a migration.
+
+### Peer-to-peer pageserver communication
+
+Rather than uploading the heatmap to S3, attached pageservers could make it
+available to peers.
+
+Currently, pageservers have no peer to peer communication, so adding this
+for heatmaps would incur significant overhead in deployment and configuration
+of the service, and ensuring that when a new pageserver is deployed, other
+pageservers are updated to be aware of it.
+
+As well as simplifying implementation, putting heatmaps in S3 will be useful
+for future analytics purposes -- gathering aggregated statistics on activity
+pattersn across many tenants may be done directly from data in S3.
diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index 7b133c61af..9e89327e84 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
pub struct EventChunk<'a, T: Clone> {
pub events: std::borrow::Cow<'a, [T]>,
}
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index f354296be2..68620787bb 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -363,8 +363,15 @@ pub struct TimelineInfo {
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
+
+ /// The LSN that we have succesfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
+
+ /// The LSN that we are advertizing to safekeepers
+ #[serde_as(as = "DisplayFromStr")]
+ pub remote_consistent_lsn_visible: Lsn,
+
pub current_logical_size: Option, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index a4adae6146..2b808779f4 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -29,3 +29,4 @@ workspace_hack.workspace = true
[dev-dependencies]
tempfile.workspace = true
test-context.workspace = true
+rand.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 1ddd156a08..a92b87632b 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -20,6 +20,7 @@ use std::{
use anyhow::{bail, Context};
+use serde::{Deserialize, Serialize};
use tokio::io;
use toml_edit::Item;
use tracing::info;
@@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
///
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None;
+/// As defined in S3 docs
+pub const MAX_KEYS_PER_DELETE: usize = 1000;
+
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
/// Path on the remote storage, relative to some inner prefix.
@@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
+impl Serialize for RemotePath {
+ fn serialize(&self, serializer: S) -> Result
+ where
+ S: serde::Serializer,
+ {
+ serializer.collect_str(self)
+ }
+}
+
+impl<'de> Deserialize<'de> for RemotePath {
+ fn deserialize(deserializer: D) -> Result
+ where
+ D: serde::Deserializer<'de>,
+ {
+ let str = String::deserialize(deserializer)?;
+ Ok(Self(PathBuf::from(&str)))
+ }
+}
+
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
@@ -88,6 +111,10 @@ impl RemotePath {
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
+
+ pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+ self.0.strip_prefix(&p.0)
+ }
}
/// Storage (potentially remote) API to manage its state.
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 9262f1e88f..fc6d7fa61b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -33,11 +33,10 @@ use tracing::debug;
use super::StorageMetadata;
use crate::{
- Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
+ Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+ REMOTE_STORAGE_PREFIX_SEPARATOR,
};
-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
pub(super) mod metrics;
use self::metrics::{AttemptOutcome, RequestKind};
@@ -48,10 +47,47 @@ pub struct S3Bucket {
bucket_name: String,
prefix_in_bucket: Option,
max_keys_per_list_response: Option,
+ concurrency_limiter: ConcurrencyLimiter,
+}
+
+struct ConcurrencyLimiter {
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
// The helps to ensure we don't exceed the thresholds.
- concurrency_limiter: Arc,
+ write: Arc,
+ read: Arc,
+}
+
+impl ConcurrencyLimiter {
+ fn for_kind(&self, kind: RequestKind) -> &Arc {
+ match kind {
+ RequestKind::Get => &self.read,
+ RequestKind::Put => &self.write,
+ RequestKind::List => &self.read,
+ RequestKind::Delete => &self.write,
+ }
+ }
+
+ async fn acquire(
+ &self,
+ kind: RequestKind,
+ ) -> Result, tokio::sync::AcquireError> {
+ self.for_kind(kind).acquire().await
+ }
+
+ async fn acquire_owned(
+ &self,
+ kind: RequestKind,
+ ) -> Result {
+ Arc::clone(self.for_kind(kind)).acquire_owned().await
+ }
+
+ fn new(limit: usize) -> ConcurrencyLimiter {
+ Self {
+ read: Arc::new(Semaphore::new(limit)),
+ write: Arc::new(Semaphore::new(limit)),
+ }
+ }
}
#[derive(Default)]
@@ -118,7 +154,7 @@ impl S3Bucket {
bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
- concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+ concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
})
}
@@ -157,7 +193,7 @@ impl S3Bucket {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
- .acquire()
+ .acquire(kind)
.await
.expect("semaphore is never closed");
@@ -173,8 +209,7 @@ impl S3Bucket {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
- .clone()
- .acquire_owned()
+ .acquire_owned(kind)
.await
.expect("semaphore is never closed");
@@ -500,7 +535,7 @@ impl RemoteStorage for S3Bucket {
delete_objects.push(obj_id);
}
- for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+ for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
let started_at = start_measuring_requests(kind);
let resp = self
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 982c01a9be..b220349829 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -378,21 +378,30 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
fn create_s3_client(
max_keys_per_list_response: Option,
) -> anyhow::Result> {
+ use rand::Rng;
+
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
- let random_prefix_part = std::time::SystemTime::now()
+
+ // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+ // millis is just a debugging aid for easier finding the prefix later.
+ let millis = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
- .as_nanos();
+ .as_millis();
+
+ // because nanos can be the same for two threads so can millis, add randomness
+ let random = rand::thread_rng().gen::();
+
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,
- prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+ prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 163c8c0467..88d50905c6 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -89,6 +89,22 @@ impl Generation {
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
+
+ pub fn next(&self) -> Generation {
+ match self {
+ Self::Valid(n) => Self::Valid(*n + 1),
+ Self::None => Self::Valid(1),
+ Self::Broken => panic!("Attempted to use a broken generation"),
+ }
+ }
+
+ pub fn into(self) -> Option {
+ if let Self::Valid(v) = self {
+ Some(v)
+ } else {
+ None
+ }
+ }
}
impl Serialize for Generation {
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 527e486fd0..dd54cd6ecd 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,6 +24,9 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box),
+ #[error("Shutting down")]
+ ShuttingDown,
+
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -52,6 +55,10 @@ impl ApiError {
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
+ ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
+ "Shutting down".to_string(),
+ StatusCode::SERVICE_UNAVAILABLE,
+ ),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index f69c0603d5..7f17970c4c 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,6 +216,24 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
}
}
+/// When you will store a secret but want to make sure it won't
+/// be accidentally logged, wrap it in a SecretString, whose Debug
+/// implementation does not expose the contents.
+#[derive(Clone, Eq, PartialEq)]
+pub struct SecretString(String);
+
+impl SecretString {
+ pub fn get_contents(&self) -> &str {
+ self.0.as_str()
+ }
+}
+
+impl std::fmt::Debug for SecretString {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "[SECRET]")
+ }
+}
+
#[cfg(test)]
mod tests {
use metrics::{core::Opts, IntCounterVec};
diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs
index 3254fa4501..15e972505e 100644
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -431,14 +431,14 @@ impl CgroupWatcher {
.context("failed to request upscale")?;
let memory_high =
- self.get_high_bytes().context("failed to get memory.high")?;
+ self.get_memory_high_bytes().context("failed to get memory.high")?;
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
info!(
current_high_bytes = memory_high,
new_high_bytes = new_high,
"updating memory.high"
);
- self.set_high_bytes(new_high)
+ self.set_memory_high_bytes(new_high)
.context("failed to set memory.high")?;
last_memory_high_increase_at = Some(Instant::now());
continue;
@@ -556,14 +556,6 @@ impl CgroupWatcher {
}
}
-/// Represents a set of limits we apply to a cgroup to control memory usage.
-///
-/// Setting these values also affects the thresholds for receiving usage alerts.
-#[derive(Debug)]
-pub struct MemoryLimits {
- pub high: u64,
-}
-
// Methods for manipulating the actual cgroup
impl CgroupWatcher {
/// Get a handle on the freezer subsystem.
@@ -624,50 +616,29 @@ impl CgroupWatcher {
}
/// Set cgroup memory.high threshold.
- pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+ pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+ self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
+ }
+
+ /// Set the cgroup's memory.high to 'max', disabling it.
+ pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+ self.set_memory_high_internal(MaxValue::Max)
+ }
+
+ fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
self.memory()
.context("failed to get memory subsystem")?
.set_mem(cgroups_rs::memory::SetMemory {
low: None,
- high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
+ high: Some(value),
min: None,
max: None,
})
- .context("failed to set memory.high")
- }
-
- /// Set cgroup memory.high and memory.max.
- pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
- info!(limits.high, path = self.path(), "writing new memory limits",);
- self.memory()
- .context("failed to get memory subsystem while setting memory limits")?
- .set_mem(cgroups_rs::memory::SetMemory {
- min: None,
- low: None,
- high: Some(MaxValue::Value(
- u64::min(limits.high, i64::MAX as u64) as i64
- )),
- max: None,
- })
- .context("failed to set memory limits")
- }
-
- /// Given some amount of available memory, set the desired cgroup memory limits
- pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
- let new_high = self.config.calculate_memory_high_value(available_memory);
- let limits = MemoryLimits { high: new_high };
- info!(
- path = self.path(),
- memory = ?limits,
- "setting cgroup memory",
- );
- self.set_limits(&limits)
- .context("failed to set cgroup memory limits")?;
- Ok(())
+ .map_err(anyhow::Error::from)
}
/// Get memory.high threshold.
- pub fn get_high_bytes(&self) -> anyhow::Result {
+ pub fn get_memory_high_bytes(&self) -> anyhow::Result {
let high = self
.memory()
.context("failed to get memory subsystem while getting memory statistics")?
diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs
index 376017d784..09863c8936 100644
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
-use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::cgroup::{CgroupWatcher, Sequenced};
use crate::dispatcher::Dispatcher;
use crate::filecache::{FileCacheConfig, FileCacheState};
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,6 +106,51 @@ impl Runner {
kill,
};
+ // If we have both the cgroup and file cache integrations enabled, it's possible for
+ // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+ // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+ // we *do* still want to determine the file cache size before setting the cgroup's
+ // memory.high, so it's not as simple as just swapping the order.
+ //
+ // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+ // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+ // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+ // of a hacky solution, but helps with reliability.
+ if let Some(name) = &args.cgroup {
+ // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+ // now, and then set limits later.
+ info!("initializing cgroup");
+
+ let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+ .context("failed to create cgroup manager")?;
+
+ info!("temporarily unsetting memory.high");
+
+ // Temporarily un-set cgroup memory.high; see above.
+ cgroup
+ .unset_memory_high()
+ .context("failed to unset memory.high")?;
+
+ let cgroup = Arc::new(cgroup);
+
+ let cgroup_clone = Arc::clone(&cgroup);
+ spawn_with_cancel(
+ token.clone(),
+ |_| error!("cgroup watcher terminated"),
+ async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+ );
+
+ state.cgroup = Some(cgroup);
+ } else {
+ // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
+ // This allows us to poll it in `Monitor::run` regardless of whether we
+ // are managing a cgroup or not. If we don't forget it, all receives will
+ // immediately return an error because the sender is droped and it will
+ // claim all select! statements, effectively turning `Monitor::run` into
+ // `loop { fail to receive }`.
+ mem::forget(requesting_send);
+ }
+
let mut file_cache_reserved_bytes = 0;
let mem = get_total_system_memory();
@@ -119,7 +164,7 @@ impl Runner {
false => FileCacheConfig::default_in_memory(),
};
- let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+ let mut file_cache = FileCacheState::new(connstr, config, token)
.await
.context("failed to create file cache")?;
@@ -152,35 +197,15 @@ impl Runner {
state.filecache = Some(file_cache);
}
- if let Some(name) = &args.cgroup {
- let (mut cgroup, cgroup_event_stream) =
- CgroupWatcher::new(name.clone(), requesting_send)
- .context("failed to create cgroup manager")?;
-
+ if let Some(cgroup) = &state.cgroup {
let available = mem - file_cache_reserved_bytes;
+ let value = cgroup.config.calculate_memory_high_value(available);
+
+ info!(value, "setting memory.high");
cgroup
- .set_memory_limits(available)
- .context("failed to set cgroup memory limits")?;
-
- let cgroup = Arc::new(cgroup);
-
- // Some might call this . . . cgroup v2
- let cgroup_clone = Arc::clone(&cgroup);
-
- spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
- cgroup_clone.watch(notified_recv, cgroup_event_stream).await
- });
-
- state.cgroup = Some(cgroup);
- } else {
- // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
- // This allows us to poll it in `Monitor::run` regardless of whether we
- // are managing a cgroup or not. If we don't forget it, all receives will
- // immediately return an error because the sender is droped and it will
- // claim all select! statements, effectively turning `Monitor::run` into
- // `loop { fail to receive }`.
- mem::forget(requesting_send);
+ .set_memory_high_bytes(value)
+ .context("failed to set cgroup memory.high")?;
}
Ok(state)
@@ -257,14 +282,11 @@ impl Runner {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
- let limits = MemoryLimits {
- // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
- // since it is properly initialized in the previous cgroup if let block
- high: new_cgroup_mem_high,
- };
+ // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
+ // since it is properly initialized in the previous cgroup if let block
cgroup
- .set_limits(&limits)
- .context("failed to set cgroup memory limits")?;
+ .set_memory_high_bytes(new_cgroup_mem_high)
+ .context("failed to set cgroup memory.high")?;
let message = format!(
"set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -327,12 +349,9 @@ impl Runner {
name = cgroup.path(),
"updating cgroup memory.high",
);
- let limits = MemoryLimits {
- high: new_cgroup_mem_high,
- };
cgroup
- .set_limits(&limits)
- .context("failed to set file cache size")?;
+ .set_memory_high_bytes(new_cgroup_mem_high)
+ .context("failed to set cgroup memory.high")?;
}
Ok(())
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index b6a2117f9c..d8a00b677b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -20,6 +21,7 @@ use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
context::{DownloadBehavior, RequestContext},
+ deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -346,9 +348,22 @@ fn start_pageserver(
}
};
+ // Top-level cancellation token for the process
+ let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
+ // Set up deletion queue
+ let (deletion_queue, deletion_workers) = DeletionQueue::new(
+ remote_storage.clone(),
+ ControlPlaneClient::new(conf, &shutdown_pageserver),
+ conf,
+ );
+ if let Some(deletion_workers) = deletion_workers {
+ deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
+ }
+
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
@@ -379,13 +394,13 @@ fn start_pageserver(
};
// Scan the local 'tenants/' directory and start loading the tenants
- let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
+ let deletion_queue_client = deletion_queue.new_client();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
+ deletion_queue_client,
},
order,
shutdown_pageserver.clone(),
@@ -481,9 +496,10 @@ fn start_pageserver(
http::routes::State::new(
conf,
http_auth.clone(),
- remote_storage,
+ remote_storage.clone(),
broker_client.clone(),
disk_usage_eviction_state,
+ deletion_queue.new_client(),
)
.context("Failed to initialize router state")?,
);
@@ -611,7 +627,12 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
- BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
+ let bg_remote_storage = remote_storage.clone();
+ let bg_deletion_queue = deletion_queue.clone();
+ BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+ bg_remote_storage.map(|_| bg_deletion_queue),
+ 0,
+ ));
unreachable!()
}
})
@@ -623,7 +644,7 @@ fn create_remote_storage_client(
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
- // No remote storage configured.
+ tracing::warn!("no remote storage configured, this is a deprecated configuration");
return Ok(None);
};
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8ee7f28c11..c3f2f14a74 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,6 +11,7 @@ use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
+use utils::logging::SecretString;
use once_cell::sync::OnceCell;
use reqwest::Url;
@@ -207,6 +208,9 @@ pub struct PageServerConf {
pub background_task_maximum_delay: Duration,
pub control_plane_api: Option,
+
+ /// JWT token for use with the control plane API.
+ pub control_plane_api_token: Option,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -283,6 +287,7 @@ struct PageServerConfigBuilder {
background_task_maximum_delay: BuilderValue,
control_plane_api: BuilderValue>,
+ control_plane_api_token: BuilderValue >,
}
impl Default for PageServerConfigBuilder {
@@ -347,6 +352,7 @@ impl Default for PageServerConfigBuilder {
.unwrap()),
control_plane_api: Set(None),
+ control_plane_api_token: Set(None),
}
}
}
@@ -475,8 +481,8 @@ impl PageServerConfigBuilder {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
- pub fn control_plane_api(&mut self, api: Url) {
- self.control_plane_api = BuilderValue::Set(Some(api))
+ pub fn control_plane_api(&mut self, api: Option) {
+ self.control_plane_api = BuilderValue::Set(api)
}
pub fn build(self) -> anyhow::Result {
@@ -567,6 +573,9 @@ impl PageServerConfigBuilder {
control_plane_api: self
.control_plane_api
.ok_or(anyhow!("missing control_plane_api"))?,
+ control_plane_api_token: self
+ .control_plane_api_token
+ .ok_or(anyhow!("missing control_plane_api_token"))?,
})
}
}
@@ -580,6 +589,27 @@ impl PageServerConf {
self.workdir.join(TENANTS_SEGMENT_NAME)
}
+ pub fn deletion_prefix(&self) -> PathBuf {
+ self.workdir.join("deletion")
+ }
+
+ pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+ // Encode a version in the filename, so that if we ever switch away from JSON we can
+ // increment this.
+ const VERSION: u8 = 1;
+
+ self.deletion_prefix()
+ .join(format!("{sequence:016x}-{VERSION:02x}.list"))
+ }
+
+ pub fn deletion_header_path(&self) -> PathBuf {
+ // Encode a version in the filename, so that if we ever switch away from JSON we can
+ // increment this.
+ const VERSION: u8 = 1;
+
+ self.deletion_prefix().join(format!("header-{VERSION:02x}"))
+ }
+
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenants_path().join(tenant_id.to_string())
}
@@ -747,7 +777,14 @@ impl PageServerConf {
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
- "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
+ "control_plane_api" => {
+ let parsed = parse_toml_string(key, item)?;
+ if parsed.is_empty() {
+ builder.control_plane_api(None)
+ } else {
+ builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+ }
+ },
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -917,6 +954,7 @@ impl PageServerConf {
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::ZERO,
control_plane_api: None,
+ control_plane_api_token: None,
}
}
}
@@ -1140,7 +1178,8 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: humantime::parse_duration(
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
- control_plane_api: None
+ control_plane_api: None,
+ control_plane_api_token: None
},
"Correct defaults should be used when no config values are provided"
);
@@ -1196,7 +1235,8 @@ background_task_maximum_delay = '334 s'
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
- control_plane_api: None
+ control_plane_api: None,
+ control_plane_api_token: None
},
"Should be able to parse all basic config values correctly"
);
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 192eb16789..3375392373 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,7 +1,9 @@
use std::collections::HashMap;
-use hyper::StatusCode;
-use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
+use pageserver_api::control_api::{
+ ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+};
+use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{
@@ -12,25 +14,34 @@ use utils::{
use crate::config::PageServerConf;
-// Backoffs when control plane requests do not succeed: compromise between reducing load
-// on control plane, and retrying frequently when we are blocked on a control plane
-// response to make progress.
-const BACKOFF_INCREMENT: f64 = 0.1;
-const BACKOFF_MAX: f64 = 10.0;
-
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub(crate) struct ControlPlaneClient {
+pub struct ControlPlaneClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
cancel: CancellationToken,
}
+/// Represent operations which internally retry on all errors other than
+/// cancellation token firing: the only way they can fail is ShuttingDown.
+pub enum RetryForeverError {
+ ShuttingDown,
+}
+
+#[async_trait::async_trait]
+pub trait ControlPlaneGenerationsApi {
+ async fn re_attach(&self) -> Result, RetryForeverError>;
+ async fn validate(
+ &self,
+ tenants: Vec<(TenantId, Generation)>,
+ ) -> Result, RetryForeverError>;
+}
+
impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
- pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option {
+ pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
@@ -42,39 +53,78 @@ impl ControlPlaneClient {
segs.pop_if_empty().push("");
}
- let client = reqwest::ClientBuilder::new()
- .build()
- .expect("Failed to construct http client");
+ let mut client = reqwest::ClientBuilder::new();
+
+ if let Some(jwt) = &conf.control_plane_api_token {
+ let mut headers = hyper::HeaderMap::new();
+ headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+ client = client.default_headers(headers);
+ }
Some(Self {
- http_client: client,
+ http_client: client.build().expect("Failed to construct HTTP client"),
base_url: url,
node_id: conf.id,
cancel: cancel.clone(),
})
}
- async fn try_re_attach(
+ async fn retry_http_forever(
&self,
- url: Url,
- request: &ReAttachRequest,
- ) -> anyhow::Result {
- match self.http_client.post(url).json(request).send().await {
- Err(e) => Err(anyhow::Error::from(e)),
- Ok(r) => {
- if r.status() == StatusCode::OK {
- r.json::()
- .await
- .map_err(anyhow::Error::from)
- } else {
- Err(anyhow::anyhow!("Unexpected status {}", r.status()))
- }
+ url: &url::Url,
+ request: R,
+ ) -> Result
+ where
+ R: Serialize,
+ T: DeserializeOwned,
+ {
+ #[derive(thiserror::Error, Debug)]
+ enum RemoteAttemptError {
+ #[error("shutdown")]
+ Shutdown,
+ #[error("remote: {0}")]
+ Remote(reqwest::Error),
+ }
+
+ match backoff::retry(
+ || async {
+ let response = self
+ .http_client
+ .post(url.clone())
+ .json(&request)
+ .send()
+ .await
+ .map_err(RemoteAttemptError::Remote)?;
+
+ response
+ .error_for_status_ref()
+ .map_err(RemoteAttemptError::Remote)?;
+ response
+ .json::()
+ .await
+ .map_err(RemoteAttemptError::Remote)
+ },
+ |_| false,
+ 3,
+ u32::MAX,
+ "calling control plane generation validation API",
+ backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+ )
+ .await
+ {
+ Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+ Err(RemoteAttemptError::Remote(_)) => {
+ panic!("We retry forever, this should never be reached");
}
+ Ok(r) => Ok(r),
}
}
+}
- /// Block until we get a successful response
- pub(crate) async fn re_attach(&self) -> anyhow::Result> {
+#[async_trait::async_trait]
+impl ControlPlaneGenerationsApi for ControlPlaneClient {
+ /// Block until we get a successful response, or error out if we are shut down
+ async fn re_attach(&self) -> Result, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -83,37 +133,47 @@ impl ControlPlaneClient {
node_id: self.node_id,
};
- let mut attempt = 0;
- loop {
- let result = self.try_re_attach(re_attach_path.clone(), &request).await;
- match result {
- Ok(res) => {
- tracing::info!(
- "Received re-attach response with {} tenants",
- res.tenants.len()
- );
+ let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+ tracing::info!(
+ "Received re-attach response with {} tenants",
+ response.tenants.len()
+ );
- return Ok(res
- .tenants
- .into_iter()
- .map(|t| (t.id, Generation::new(t.generation)))
- .collect::>());
- }
- Err(e) => {
- tracing::error!("Error re-attaching tenants, retrying: {e:#}");
- backoff::exponential_backoff(
- attempt,
- BACKOFF_INCREMENT,
- BACKOFF_MAX,
- &self.cancel,
- )
- .await;
- if self.cancel.is_cancelled() {
- return Err(anyhow::anyhow!("Shutting down"));
- }
- attempt += 1;
- }
- }
- }
+ Ok(response
+ .tenants
+ .into_iter()
+ .map(|t| (t.id, Generation::new(t.generation)))
+ .collect::>())
+ }
+
+ /// Block until we get a successful response, or error out if we are shut down
+ async fn validate(
+ &self,
+ tenants: Vec<(TenantId, Generation)>,
+ ) -> Result, RetryForeverError> {
+ let re_attach_path = self
+ .base_url
+ .join("validate")
+ .expect("Failed to build validate path");
+
+ let request = ValidateRequest {
+ tenants: tenants
+ .into_iter()
+ .map(|(id, gen)| ValidateRequestTenant {
+ id,
+ gen: gen
+ .into()
+ .expect("Generation should always be valid for a Tenant doing deletions"),
+ })
+ .collect(),
+ };
+
+ let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
+
+ Ok(response
+ .tenants
+ .into_iter()
+ .map(|rt| (rt.id, rt.valid))
+ .collect())
}
}
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
new file mode 100644
index 0000000000..4c0d399789
--- /dev/null
+++ b/pageserver/src/deletion_queue.rs
@@ -0,0 +1,1312 @@
+mod deleter;
+mod list_writer;
+mod validator;
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::VirtualFile;
+use anyhow::Context;
+use hex::FromHex;
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use serde::Deserialize;
+use serde::Serialize;
+use serde_with::serde_as;
+use thiserror::Error;
+use tokio;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+use tracing::{self, debug, error};
+use utils::crashsafe::path_with_suffix_extension;
+use utils::generation::Generation;
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::AtomicLsn;
+use utils::lsn::Lsn;
+
+use self::deleter::Deleter;
+use self::list_writer::DeletionOp;
+use self::list_writer::ListWriter;
+use self::list_writer::RecoverOp;
+use self::validator::Validator;
+use deleter::DeleterMessage;
+use list_writer::ListWriterQueueMessage;
+use validator::ValidatorQueueMessage;
+
+use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
+
+// TODO: adminstrative "panic button" config property to disable all deletions
+// TODO: configurable for how long to wait before executing deletions
+
+/// We aggregate object deletions from many tenants in one place, for several reasons:
+/// - Coalesce deletions into fewer DeleteObjects calls
+/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
+/// to flush any outstanding deletions.
+/// - Globally control throughput of deletions, as these are a low priority task: do
+/// not compete with the same S3 clients/connections used for higher priority uploads.
+/// - Enable gating deletions on validation of a tenant's generation number, to make
+/// it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md)
+///
+/// There are two kinds of deletion: deferred and immediate. A deferred deletion
+/// may be intentionally delayed to protect passive readers of S3 data, and is
+/// subject to a generation number validation step. An immediate deletion is
+/// ready to execute immediately, and is only queued up so that it can be coalesced
+/// with other deletions in flight.
+///
+/// Deferred deletions pass through three steps:
+/// - ListWriter: accumulate deletion requests from Timelines, and batch them up into
+/// DeletionLists, which are persisted to disk.
+/// - Validator: accumulate deletion lists, and validate them en-masse prior to passing
+/// the keys in the list onward for actual deletion. Also validate remote_consistent_lsn
+/// updates for running timelines.
+/// - Deleter: accumulate object keys that the validator has validated, and execute them in
+/// batches of 1000 keys via DeleteObjects.
+///
+/// Non-deferred deletions, such as during timeline deletion, bypass the first
+/// two stages and are passed straight into the Deleter.
+///
+/// Internally, each stage is joined by a channel to the next. On disk, there is only
+/// one queue (of DeletionLists), which is written by the frontend and consumed
+/// by the backend.
+#[derive(Clone)]
+pub struct DeletionQueue {
+ client: DeletionQueueClient,
+
+ // Parent cancellation token for the tokens passed into background workers
+ cancel: CancellationToken,
+}
+
+/// Opaque wrapper around individual worker tasks, to avoid making the
+/// worker objects themselves public
+pub struct DeletionQueueWorkers
+where
+ C: ControlPlaneGenerationsApi + Send + Sync,
+{
+ frontend: ListWriter,
+ backend: Validator,
+ executor: Deleter,
+}
+
+impl DeletionQueueWorkers
+where
+ C: ControlPlaneGenerationsApi + Send + Sync + 'static,
+{
+ pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> {
+ let jh_frontend = runtime.spawn(async move {
+ self.frontend
+ .background()
+ .instrument(tracing::info_span!(parent:None, "deletion frontend"))
+ .await
+ });
+ let jh_backend = runtime.spawn(async move {
+ self.backend
+ .background()
+ .instrument(tracing::info_span!(parent:None, "deletion backend"))
+ .await
+ });
+ let jh_executor = runtime.spawn(async move {
+ self.executor
+ .background()
+ .instrument(tracing::info_span!(parent:None, "deletion executor"))
+ .await
+ });
+
+ runtime.spawn({
+ async move {
+ jh_frontend.await.expect("error joining frontend worker");
+ jh_backend.await.expect("error joining backend worker");
+ drop(jh_executor.await.expect("error joining executor worker"));
+ }
+ })
+ }
+}
+
+/// A FlushOp is just a oneshot channel, where we send the transmit side down
+/// another channel, and the receive side will receive a message when the channel
+/// we're flushing has reached the FlushOp we sent into it.
+///
+/// The only extra behavior beyond the channel is that the notify() method does not
+/// return an error when the receive side has been dropped, because in this use case
+/// it is harmless (the code that initiated the flush no longer cares about the result).
+#[derive(Debug)]
+struct FlushOp {
+ tx: tokio::sync::oneshot::Sender<()>,
+}
+
+impl FlushOp {
+ fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) {
+ let (tx, rx) = tokio::sync::oneshot::channel::<()>();
+ (Self { tx }, rx)
+ }
+
+ fn notify(self) {
+ if self.tx.send(()).is_err() {
+ // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
+ debug!("deletion queue flush from dropped client");
+ };
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct DeletionQueueClient {
+ tx: tokio::sync::mpsc::Sender,
+ executor_tx: tokio::sync::mpsc::Sender,
+
+ lsn_table: Arc>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct TenantDeletionList {
+ /// For each Timeline, a list of key fragments to append to the timeline remote path
+ /// when reconstructing a full key
+ #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+ timelines: HashMap>,
+
+ /// The generation in which this deletion was emitted: note that this may not be the
+ /// same as the generation of any layers being deleted. The generation of the layer
+ /// has already been absorbed into the keys in `objects`
+ generation: Generation,
+}
+
+impl TenantDeletionList {
+ pub(crate) fn len(&self) -> usize {
+ self.timelines.values().map(|v| v.len()).sum()
+ }
+}
+
+/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
+fn to_hex_map(input: &HashMap, serializer: S) -> Result
+where
+ S: serde::Serializer,
+ V: Serialize,
+ I: AsRef<[u8]>,
+{
+ let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
+
+ transformed
+ .collect::>()
+ .serialize(serializer)
+}
+
+/// For HashMaps using a FromHex key, where we would like to decode the key
+fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result, D::Error>
+where
+ D: serde::de::Deserializer<'de>,
+ V: Deserialize<'de>,
+ I: FromHex + std::hash::Hash + Eq,
+{
+ let hex_map = HashMap::::deserialize(deserializer)?;
+ hex_map
+ .into_iter()
+ .map(|(k, v)| {
+ I::from_hex(k)
+ .map(|k| (k, v))
+ .map_err(|_| serde::de::Error::custom("Invalid hex ID"))
+ })
+ .collect()
+}
+
+/// Files ending with this suffix will be ignored and erased
+/// during recovery as startup.
+const TEMP_SUFFIX: &str = ".tmp";
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+struct DeletionList {
+ /// Serialization version, for future use
+ version: u8,
+
+ /// Used for constructing a unique key for each deletion list we write out.
+ sequence: u64,
+
+ /// To avoid repeating tenant/timeline IDs in every key, we store keys in
+ /// nested HashMaps by TenantTimelineID. Each Tenant only appears once
+ /// with one unique generation ID: if someone tries to push a second generation
+ /// ID for the same tenant, we will start a new DeletionList.
+ #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+ tenants: HashMap,
+
+ /// Avoid having to walk `tenants` to calculate the number of keys in
+ /// the nested deletion lists
+ size: usize,
+
+ /// Set to true when the list has undergone validation with the control
+ /// plane and the remaining contents of `tenants` are valid. A list may
+ /// also be implicitly marked valid by DeletionHeader.validated_sequence
+ /// advancing to >= DeletionList.sequence
+ #[serde(default)]
+ #[serde(skip_serializing_if = "std::ops::Not::not")]
+ validated: bool,
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+struct DeletionHeader {
+ /// Serialization version, for future use
+ version: u8,
+
+ /// The highest sequence number (inclusive) that has been validated. All deletion
+ /// lists on disk with a sequence <= this value are safe to execute.
+ validated_sequence: u64,
+}
+
+impl DeletionHeader {
+ const VERSION_LATEST: u8 = 1;
+
+ fn new(validated_sequence: u64) -> Self {
+ Self {
+ version: Self::VERSION_LATEST,
+ validated_sequence,
+ }
+ }
+
+ async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
+ debug!("Saving deletion list header {:?}", self);
+ let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
+ let header_path = conf.deletion_header_path();
+ let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
+ VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
+ .await
+ .map_err(Into::into)
+ }
+}
+
+impl DeletionList {
+ const VERSION_LATEST: u8 = 1;
+ fn new(sequence: u64) -> Self {
+ Self {
+ version: Self::VERSION_LATEST,
+ sequence,
+ tenants: HashMap::new(),
+ size: 0,
+ validated: false,
+ }
+ }
+
+ fn is_empty(&self) -> bool {
+ self.tenants.is_empty()
+ }
+
+ fn len(&self) -> usize {
+ self.size
+ }
+
+ /// Returns true if the push was accepted, false if the caller must start a new
+ /// deletion list.
+ fn push(
+ &mut self,
+ tenant: &TenantId,
+ timeline: &TimelineId,
+ generation: Generation,
+ objects: &mut Vec,
+ ) -> bool {
+ if objects.is_empty() {
+ // Avoid inserting an empty TimelineDeletionList: this preserves the property
+ // that if we have no keys, then self.objects is empty (used in Self::is_empty)
+ return true;
+ }
+
+ let tenant_entry = self
+ .tenants
+ .entry(*tenant)
+ .or_insert_with(|| TenantDeletionList {
+ timelines: HashMap::new(),
+ generation,
+ });
+
+ if tenant_entry.generation != generation {
+ // Only one generation per tenant per list: signal to
+ // caller to start a new list.
+ return false;
+ }
+
+ let timeline_entry = tenant_entry
+ .timelines
+ .entry(*timeline)
+ .or_insert_with(Vec::new);
+
+ let timeline_remote_path = remote_timeline_path(tenant, timeline);
+
+ self.size += objects.len();
+ timeline_entry.extend(objects.drain(..).map(|p| {
+ p.strip_prefix(&timeline_remote_path)
+ .expect("Timeline paths always start with the timeline prefix")
+ .to_string_lossy()
+ .to_string()
+ }));
+ true
+ }
+
+ fn into_remote_paths(self) -> Vec {
+ let mut result = Vec::new();
+ for (tenant, tenant_deletions) in self.tenants.into_iter() {
+ for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
+ let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
+ result.extend(
+ timeline_layers
+ .into_iter()
+ .map(|l| timeline_remote_path.join(&PathBuf::from(l))),
+ );
+ }
+ }
+
+ result
+ }
+
+ async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
+ let path = conf.deletion_list_path(self.sequence);
+ let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
+
+ let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
+ VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
+ .await
+ .map_err(Into::into)
+ }
+}
+
+impl std::fmt::Display for DeletionList {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(
+ f,
+ "DeletionList",
+ self.sequence,
+ self.tenants.len(),
+ self.size
+ )
+ }
+}
+
+struct PendingLsn {
+ projected: Lsn,
+ result_slot: Arc,
+}
+
+struct TenantLsnState {
+ timelines: HashMap,
+
+ // In what generation was the most recent update proposed?
+ generation: Generation,
+}
+
+#[derive(Default)]
+struct VisibleLsnUpdates {
+ tenants: HashMap,
+}
+
+impl VisibleLsnUpdates {
+ fn new() -> Self {
+ Self {
+ tenants: HashMap::new(),
+ }
+ }
+}
+
+impl std::fmt::Debug for VisibleLsnUpdates {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len())
+ }
+}
+
+#[derive(Error, Debug)]
+pub enum DeletionQueueError {
+ #[error("Deletion queue unavailable during shutdown")]
+ ShuttingDown,
+}
+
+impl DeletionQueueClient {
+ pub(crate) fn broken() -> Self {
+ // Channels whose receivers are immediately dropped.
+ let (tx, _rx) = tokio::sync::mpsc::channel(1);
+ let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
+ Self {
+ tx,
+ executor_tx,
+ lsn_table: Arc::default(),
+ }
+ }
+
+ /// This is cancel-safe. If you drop the future before it completes, the message
+ /// is not pushed, although in the context of the deletion queue it doesn't matter: once
+ /// we decide to do a deletion the decision is always final.
+ async fn do_push(
+ &self,
+ queue: &tokio::sync::mpsc::Sender,
+ msg: T,
+ ) -> Result<(), DeletionQueueError> {
+ match queue.send(msg).await {
+ Ok(_) => Ok(()),
+ Err(e) => {
+ // This shouldn't happen, we should shut down all tenants before
+ // we shut down the global delete queue. If we encounter a bug like this,
+ // we may leak objects as deletions won't be processed.
+ error!("Deletion queue closed while pushing, shutting down? ({e})");
+ Err(DeletionQueueError::ShuttingDown)
+ }
+ }
+ }
+
+ pub(crate) async fn recover(
+ &self,
+ attached_tenants: HashMap,
+ ) -> Result<(), DeletionQueueError> {
+ self.do_push(
+ &self.tx,
+ ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
+ )
+ .await
+ }
+
+ /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
+ /// world, it must validate its generation number before doing so. Rather than do this synchronously,
+ /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most
+ /// recently validated separately.
+ ///
+ /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates. The
+ /// backend will later wake up and notice that the tenant's generation requires validation.
+ pub(crate) async fn update_remote_consistent_lsn(
+ &self,
+ tenant_id: TenantId,
+ timeline_id: TimelineId,
+ current_generation: Generation,
+ lsn: Lsn,
+ result_slot: Arc,
+ ) {
+ let mut locked = self
+ .lsn_table
+ .write()
+ .expect("Lock should never be poisoned");
+
+ let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
+ timelines: HashMap::new(),
+ generation: current_generation,
+ });
+
+ if tenant_entry.generation != current_generation {
+ // Generation might have changed if we were detached and then re-attached: in this case,
+ // state from the previous generation cannot be trusted.
+ tenant_entry.timelines.clear();
+ tenant_entry.generation = current_generation;
+ }
+
+ tenant_entry.timelines.insert(
+ timeline_id,
+ PendingLsn {
+ projected: lsn,
+ result_slot,
+ },
+ );
+ }
+
+ /// Submit a list of layers for deletion: this function will return before the deletion is
+ /// persistent, but it may be executed at any time after this function enters: do not push
+ /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
+ /// references them).
+ ///
+ /// The `current_generation` is the generation of this pageserver's current attachment. The
+ /// generations in `layers` are the generations in which those layers were written.
+ pub(crate) async fn push_layers(
+ &self,
+ tenant_id: TenantId,
+ timeline_id: TimelineId,
+ current_generation: Generation,
+ layers: Vec<(LayerFileName, Generation)>,
+ ) -> Result<(), DeletionQueueError> {
+ if current_generation.is_none() {
+ debug!("Enqueuing deletions in legacy mode, skipping queue");
+ let mut layer_paths = Vec::new();
+ for (layer, generation) in layers {
+ layer_paths.push(remote_layer_path(
+ &tenant_id,
+ &timeline_id,
+ &layer,
+ generation,
+ ));
+ }
+ self.push_immediate(layer_paths).await?;
+ return self.flush_immediate().await;
+ }
+
+ metrics::DELETION_QUEUE
+ .keys_submitted
+ .inc_by(layers.len() as u64);
+ self.do_push(
+ &self.tx,
+ ListWriterQueueMessage::Delete(DeletionOp {
+ tenant_id,
+ timeline_id,
+ layers,
+ generation: current_generation,
+ objects: Vec::new(),
+ }),
+ )
+ .await
+ }
+
+ /// This is cancel-safe. If you drop the future the flush may still happen in the background.
+ async fn do_flush(
+ &self,
+ queue: &tokio::sync::mpsc::Sender,
+ msg: T,
+ rx: tokio::sync::oneshot::Receiver<()>,
+ ) -> Result<(), DeletionQueueError> {
+ self.do_push(queue, msg).await?;
+ if rx.await.is_err() {
+ // This shouldn't happen if tenants are shut down before deletion queue. If we
+ // encounter a bug like this, then a flusher will incorrectly believe it has flushed
+ // when it hasn't, possibly leading to leaking objects.
+ error!("Deletion queue dropped flush op while client was still waiting");
+ Err(DeletionQueueError::ShuttingDown)
+ } else {
+ Ok(())
+ }
+ }
+
+ /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
+ ///
+ /// This is cancel-safe. If you drop the future the flush may still happen in the background.
+ pub async fn flush(&self) -> Result<(), DeletionQueueError> {
+ let (flush_op, rx) = FlushOp::new();
+ self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx)
+ .await
+ }
+
+ // Wait until all previous deletions are executed
+ pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
+ debug!("flush_execute: flushing to deletion lists...");
+ // Flush any buffered work to deletion lists
+ self.flush().await?;
+
+ // Flush the backend into the executor of deletion lists
+ let (flush_op, rx) = FlushOp::new();
+ debug!("flush_execute: flushing backend...");
+ self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx)
+ .await?;
+ debug!("flush_execute: finished flushing backend...");
+
+ // Flush any immediate-mode deletions (the above backend flush will only flush
+ // the executor if deletions had flowed through the backend)
+ debug!("flush_execute: flushing execution...");
+ let (flush_op, rx) = FlushOp::new();
+ self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
+ .await?;
+ debug!("flush_execute: finished flushing execution...");
+ Ok(())
+ }
+
+ /// This interface bypasses the persistent deletion queue, and any validation
+ /// that this pageserver is still elegible to execute the deletions. It is for
+ /// use in timeline deletions, where the control plane is telling us we may
+ /// delete everything in the timeline.
+ ///
+ /// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
+ pub(crate) async fn push_immediate(
+ &self,
+ objects: Vec,
+ ) -> Result<(), DeletionQueueError> {
+ metrics::DELETION_QUEUE
+ .keys_submitted
+ .inc_by(objects.len() as u64);
+ self.executor_tx
+ .send(DeleterMessage::Delete(objects))
+ .await
+ .map_err(|_| DeletionQueueError::ShuttingDown)
+ }
+
+ /// Companion to push_immediate. When this returns Ok, all prior objects sent
+ /// into push_immediate have been deleted from remote storage.
+ pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
+ let (flush_op, rx) = FlushOp::new();
+ self.executor_tx
+ .send(DeleterMessage::Flush(flush_op))
+ .await
+ .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+ rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+ }
+}
+
+impl DeletionQueue {
+ pub fn new_client(&self) -> DeletionQueueClient {
+ self.client.clone()
+ }
+
+ /// Caller may use the returned object to construct clients with new_client.
+ /// Caller should tokio::spawn the background() members of the two worker objects returned:
+ /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
+ ///
+ /// If remote_storage is None, then the returned workers will also be None.
+ pub fn new(
+ remote_storage: Option,
+ control_plane_client: Option,
+ conf: &'static PageServerConf,
+ ) -> (Self, Option>)
+ where
+ C: ControlPlaneGenerationsApi + Send + Sync,
+ {
+ // Deep channel: it consumes deletions from all timelines and we do not want to block them
+ let (tx, rx) = tokio::sync::mpsc::channel(16384);
+
+ // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
+ let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
+
+ // Shallow channel: it carries lists of paths, and we expect the main queueing to
+ // happen in the backend (persistent), not in this queue.
+ let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
+
+ let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new()));
+
+ // The deletion queue has an independent cancellation token to
+ // the general pageserver shutdown token, because it stays alive a bit
+ // longer to flush after Tenants have all been torn down.
+ let cancel = CancellationToken::new();
+
+ let remote_storage = match remote_storage {
+ None => {
+ return (
+ Self {
+ client: DeletionQueueClient {
+ tx,
+ executor_tx,
+ lsn_table: lsn_table.clone(),
+ },
+ cancel,
+ },
+ None,
+ )
+ }
+ Some(r) => r,
+ };
+
+ (
+ Self {
+ client: DeletionQueueClient {
+ tx,
+ executor_tx: executor_tx.clone(),
+ lsn_table: lsn_table.clone(),
+ },
+ cancel: cancel.clone(),
+ },
+ Some(DeletionQueueWorkers {
+ frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
+ backend: Validator::new(
+ conf,
+ backend_rx,
+ executor_tx,
+ control_plane_client,
+ lsn_table.clone(),
+ cancel.clone(),
+ ),
+ executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
+ }),
+ )
+ }
+
+ pub async fn shutdown(&mut self, timeout: Duration) {
+ self.cancel.cancel();
+
+ match tokio::time::timeout(timeout, self.client.flush()).await {
+ Ok(Ok(())) => {
+ tracing::info!("Deletion queue flushed successfully on shutdown")
+ }
+ Ok(Err(DeletionQueueError::ShuttingDown)) => {
+ // This is not harmful for correctness, but is unexpected: the deletion
+ // queue's workers should stay alive as long as there are any client handles instantiated.
+ tracing::warn!("Deletion queue stopped prematurely");
+ }
+ Err(_timeout) => {
+ tracing::warn!("Timed out flushing deletion queue on shutdown")
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use hex_literal::hex;
+ use std::{
+ io::ErrorKind,
+ path::{Path, PathBuf},
+ time::Duration,
+ };
+ use tracing::info;
+
+ use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
+ use tokio::task::JoinHandle;
+
+ use crate::{
+ control_plane_client::RetryForeverError,
+ repository::Key,
+ tenant::{
+ harness::TenantHarness, remote_timeline_client::remote_timeline_path,
+ storage_layer::DeltaFileName,
+ },
+ };
+
+ use super::*;
+ pub const TIMELINE_ID: TimelineId =
+ TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+ pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
+ key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
+ lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
+ });
+
+ // When you need a second layer in a test.
+ pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
+ key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
+ lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
+ });
+
+ struct TestSetup {
+ harness: TenantHarness,
+ remote_fs_dir: PathBuf,
+ storage: GenericRemoteStorage,
+ mock_control_plane: MockControlPlane,
+ deletion_queue: DeletionQueue,
+ worker_join: JoinHandle<()>,
+ }
+
+ impl TestSetup {
+ /// Simulate a pageserver restart by destroying and recreating the deletion queue
+ async fn restart(&mut self) {
+ let (deletion_queue, workers) = DeletionQueue::new(
+ Some(self.storage.clone()),
+ Some(self.mock_control_plane.clone()),
+ self.harness.conf,
+ );
+
+ tracing::debug!("Spawning worker for new queue queue");
+ let worker_join = workers
+ .unwrap()
+ .spawn_with(&tokio::runtime::Handle::current());
+
+ let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
+ let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);
+
+ tracing::debug!("Joining worker from previous queue");
+ old_deletion_queue.cancel.cancel();
+ old_worker_join
+ .await
+ .expect("Failed to join workers for previous deletion queue");
+ }
+
+ fn set_latest_generation(&self, gen: Generation) {
+ let tenant_id = self.harness.tenant_id;
+ self.mock_control_plane
+ .latest_generation
+ .lock()
+ .unwrap()
+ .insert(tenant_id, gen);
+ }
+
+ /// Returns remote layer file name, suitable for use in assert_remote_files
+ fn write_remote_layer(
+ &self,
+ file_name: LayerFileName,
+ gen: Generation,
+ ) -> anyhow::Result {
+ let tenant_id = self.harness.tenant_id;
+ let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+ let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
+ std::fs::create_dir_all(&remote_timeline_path)?;
+ let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
+
+ let content: Vec = format!("placeholder contents of {file_name}").into();
+
+ std::fs::write(
+ remote_timeline_path.join(remote_layer_file_name.clone()),
+ content,
+ )?;
+
+ Ok(remote_layer_file_name)
+ }
+ }
+
+ #[derive(Debug, Clone)]
+ struct MockControlPlane {
+ pub latest_generation: std::sync::Arc>>,
+ }
+
+ impl MockControlPlane {
+ fn new() -> Self {
+ Self {
+ latest_generation: Arc::default(),
+ }
+ }
+ }
+
+ #[async_trait::async_trait]
+ impl ControlPlaneGenerationsApi for MockControlPlane {
+ #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
+ async fn re_attach(&self) -> Result, RetryForeverError> {
+ unimplemented!()
+ }
+ async fn validate(
+ &self,
+ tenants: Vec<(TenantId, Generation)>,
+ ) -> Result, RetryForeverError> {
+ let mut result = HashMap::new();
+
+ let latest_generation = self.latest_generation.lock().unwrap();
+
+ for (tenant_id, generation) in tenants {
+ if let Some(latest) = latest_generation.get(&tenant_id) {
+ result.insert(tenant_id, *latest == generation);
+ }
+ }
+
+ Ok(result)
+ }
+ }
+
+ fn setup(test_name: &str) -> anyhow::Result {
+ let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
+ let harness = TenantHarness::create(test_name)?;
+
+ // We do not load() the harness: we only need its config and remote_storage
+
+ // Set up a GenericRemoteStorage targetting a directory
+ let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+ std::fs::create_dir_all(remote_fs_dir)?;
+ let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+ let storage_config = RemoteStorageConfig {
+ max_concurrent_syncs: std::num::NonZeroUsize::new(
+ remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+ )
+ .unwrap(),
+ max_sync_errors: std::num::NonZeroU32::new(
+ remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+ )
+ .unwrap(),
+ storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+ };
+ let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+ let mock_control_plane = MockControlPlane::new();
+
+ let (deletion_queue, worker) = DeletionQueue::new(
+ Some(storage.clone()),
+ Some(mock_control_plane.clone()),
+ harness.conf,
+ );
+
+ let worker = worker.unwrap();
+ let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
+
+ Ok(TestSetup {
+ harness,
+ remote_fs_dir,
+ storage,
+ mock_control_plane,
+ deletion_queue,
+ worker_join,
+ })
+ }
+
+ // TODO: put this in a common location so that we can share with remote_timeline_client's tests
+ fn assert_remote_files(expected: &[&str], remote_path: &Path) {
+ let mut expected: Vec = expected.iter().map(|x| String::from(*x)).collect();
+ expected.sort();
+
+ let mut found: Vec = Vec::new();
+ let dir = match std::fs::read_dir(remote_path) {
+ Ok(d) => d,
+ Err(e) => {
+ if e.kind() == ErrorKind::NotFound {
+ if expected.is_empty() {
+ // We are asserting prefix is empty: it is expected that the dir is missing
+ return;
+ } else {
+ assert_eq!(expected, Vec::::new());
+ unreachable!();
+ }
+ } else {
+ panic!(
+ "Unexpected error listing {}: {e}",
+ remote_path.to_string_lossy()
+ );
+ }
+ }
+ };
+
+ for entry in dir.flatten() {
+ let entry_name = entry.file_name();
+ let fname = entry_name.to_str().unwrap();
+ found.push(String::from(fname));
+ }
+ found.sort();
+
+ assert_eq!(expected, found);
+ }
+
+ fn assert_local_files(expected: &[&str], directory: &Path) {
+ let dir = match std::fs::read_dir(directory) {
+ Ok(d) => d,
+ Err(_) => {
+ assert_eq!(expected, &Vec::::new());
+ return;
+ }
+ };
+ let mut found = Vec::new();
+ for dentry in dir {
+ let dentry = dentry.unwrap();
+ let file_name = dentry.file_name();
+ let file_name_str = file_name.to_string_lossy();
+ found.push(file_name_str.to_string());
+ }
+ found.sort();
+ assert_eq!(expected, found);
+ }
+
+ #[tokio::test]
+ async fn deletion_queue_smoke() -> anyhow::Result<()> {
+ // Basic test that the deletion queue processes the deletions we pass into it
+ let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
+ let client = ctx.deletion_queue.new_client();
+ client.recover(HashMap::new()).await?;
+
+ let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+ let tenant_id = ctx.harness.tenant_id;
+
+ let content: Vec = "victim1 contents".into();
+ let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+ let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+ let deletion_prefix = ctx.harness.conf.deletion_prefix();
+
+ // Exercise the distinction between the generation of the layers
+ // we delete, and the generation of the running Tenant.
+ let layer_generation = Generation::new(0xdeadbeef);
+ let now_generation = Generation::new(0xfeedbeef);
+
+ let remote_layer_file_name_1 =
+ format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
+
+ // Set mock control plane state to valid for our generation
+ ctx.set_latest_generation(now_generation);
+
+ // Inject a victim file to remote storage
+ info!("Writing");
+ std::fs::create_dir_all(&remote_timeline_path)?;
+ std::fs::write(
+ remote_timeline_path.join(remote_layer_file_name_1.clone()),
+ content,
+ )?;
+ assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+
+ // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
+ info!("Pushing");
+ client
+ .push_layers(
+ tenant_id,
+ TIMELINE_ID,
+ now_generation,
+ [(layer_file_name_1.clone(), layer_generation)].to_vec(),
+ )
+ .await?;
+ assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+
+ assert_local_files(&[], &deletion_prefix);
+
+ // File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
+ info!("Flushing");
+ client.flush().await?;
+ assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+ assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
+
+ // File should go away when we execute
+ info!("Flush-executing");
+ client.flush_execute().await?;
+ assert_remote_files(&[], &remote_timeline_path);
+ assert_local_files(&["header-01"], &deletion_prefix);
+
+ // Flushing on an empty queue should succeed immediately, and not write any lists
+ info!("Flush-executing on empty");
+ client.flush_execute().await?;
+ assert_local_files(&["header-01"], &deletion_prefix);
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn deletion_queue_validation() -> anyhow::Result<()> {
+ let ctx = setup("deletion_queue_validation").expect("Failed test setup");
+ let client = ctx.deletion_queue.new_client();
+ client.recover(HashMap::new()).await?;
+
+ // Generation that the control plane thinks is current
+ let latest_generation = Generation::new(0xdeadbeef);
+ // Generation that our DeletionQueue thinks the tenant is running with
+ let stale_generation = latest_generation.previous();
+ // Generation that our example layer file was written with
+ let layer_generation = stale_generation.previous();
+
+ ctx.set_latest_generation(latest_generation);
+
+ let tenant_id = ctx.harness.tenant_id;
+ let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+ let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+
+ // Initial state: a remote layer exists
+ let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
+ assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
+
+ tracing::debug!("Pushing...");
+ client
+ .push_layers(
+ tenant_id,
+ TIMELINE_ID,
+ stale_generation,
+ [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+ )
+ .await?;
+
+ // We enqueued the operation in a stale generation: it should have failed validation
+ tracing::debug!("Flushing...");
+ tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
+ assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
+
+ tracing::debug!("Pushing...");
+ client
+ .push_layers(
+ tenant_id,
+ TIMELINE_ID,
+ latest_generation,
+ [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+ )
+ .await?;
+
+ // We enqueued the operation in a fresh generation: it should have passed validation
+ tracing::debug!("Flushing...");
+ tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
+ assert_remote_files(&[], &remote_timeline_path);
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn deletion_queue_recovery() -> anyhow::Result<()> {
+ // Basic test that the deletion queue processes the deletions we pass into it
+ let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
+ let client = ctx.deletion_queue.new_client();
+ client.recover(HashMap::new()).await?;
+
+ let tenant_id = ctx.harness.tenant_id;
+
+ let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+ let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+ let deletion_prefix = ctx.harness.conf.deletion_prefix();
+
+ let layer_generation = Generation::new(0xdeadbeef);
+ let now_generation = Generation::new(0xfeedbeef);
+
+ // Inject a deletion in the generation before generation_now: after restart,
+ // this deletion should _not_ get executed (only the immediately previous
+ // generation gets that treatment)
+ let remote_layer_file_name_historical =
+ ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
+ client
+ .push_layers(
+ tenant_id,
+ TIMELINE_ID,
+ now_generation.previous(),
+ [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+ )
+ .await?;
+
+ // Inject a deletion in the generation before generation_now: after restart,
+ // this deletion should get executed, because we execute deletions in the
+ // immediately previous generation on the same node.
+ let remote_layer_file_name_previous =
+ ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
+ client
+ .push_layers(
+ tenant_id,
+ TIMELINE_ID,
+ now_generation,
+ [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+ )
+ .await?;
+
+ client.flush().await?;
+ assert_remote_files(
+ &[
+ &remote_layer_file_name_historical,
+ &remote_layer_file_name_previous,
+ ],
+ &remote_timeline_path,
+ );
+
+ // Different generatinos for the same tenant will cause two separate
+ // deletion lists to be emitted.
+ assert_local_files(
+ &["0000000000000001-01.list", "0000000000000002-01.list"],
+ &deletion_prefix,
+ );
+
+ // Simulate a node restart: the latest generation advances
+ let now_generation = now_generation.next();
+ ctx.set_latest_generation(now_generation);
+
+ // Restart the deletion queue
+ drop(client);
+ ctx.restart().await;
+ let client = ctx.deletion_queue.new_client();
+ client
+ .recover(HashMap::from([(tenant_id, now_generation)]))
+ .await?;
+
+ info!("Flush-executing");
+ client.flush_execute().await?;
+ // The deletion from immediately prior generation was executed, the one from
+ // an older generation was not.
+ assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path);
+ Ok(())
+ }
+}
+
+/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
+/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
+#[cfg(test)]
+pub(crate) mod mock {
+ use tracing::info;
+
+ use crate::tenant::remote_timeline_client::remote_layer_path;
+
+ use super::*;
+ use std::sync::{
+ atomic::{AtomicUsize, Ordering},
+ Arc,
+ };
+
+ pub struct ConsumerState {
+ rx: tokio::sync::mpsc::Receiver,
+ executor_rx: tokio::sync::mpsc::Receiver,
+ }
+
+ impl ConsumerState {
+ async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
+ let mut executed = 0;
+
+ info!("Executing all pending deletions");
+
+ // Transform all executor messages to generic frontend messages
+ while let Ok(msg) = self.executor_rx.try_recv() {
+ match msg {
+ DeleterMessage::Delete(objects) => {
+ for path in objects {
+ match remote_storage.delete(&path).await {
+ Ok(_) => {
+ debug!("Deleted {path}");
+ }
+ Err(e) => {
+ error!("Failed to delete {path}, leaking object! ({e})");
+ }
+ }
+ executed += 1;
+ }
+ }
+ DeleterMessage::Flush(flush_op) => {
+ flush_op.notify();
+ }
+ }
+ }
+
+ while let Ok(msg) = self.rx.try_recv() {
+ match msg {
+ ListWriterQueueMessage::Delete(op) => {
+ let mut objects = op.objects;
+ for (layer, generation) in op.layers {
+ objects.push(remote_layer_path(
+ &op.tenant_id,
+ &op.timeline_id,
+ &layer,
+ generation,
+ ));
+ }
+
+ for path in objects {
+ info!("Executing deletion {path}");
+ match remote_storage.delete(&path).await {
+ Ok(_) => {
+ debug!("Deleted {path}");
+ }
+ Err(e) => {
+ error!("Failed to delete {path}, leaking object! ({e})");
+ }
+ }
+ executed += 1;
+ }
+ }
+ ListWriterQueueMessage::Flush(op) => {
+ op.notify();
+ }
+ ListWriterQueueMessage::FlushExecute(op) => {
+ // We have already executed all prior deletions because mock does them inline
+ op.notify();
+ }
+ ListWriterQueueMessage::Recover(_) => {
+ // no-op in mock
+ }
+ }
+ info!("All pending deletions have been executed");
+ }
+
+ executed
+ }
+ }
+
+ pub struct MockDeletionQueue {
+ tx: tokio::sync::mpsc::Sender,
+ executor_tx: tokio::sync::mpsc::Sender,
+ executed: Arc,
+ remote_storage: Option,
+ consumer: std::sync::Mutex,
+ lsn_table: Arc>,
+ }
+
+ impl MockDeletionQueue {
+ pub fn new(remote_storage: Option) -> Self {
+ let (tx, rx) = tokio::sync::mpsc::channel(16384);
+ let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);
+
+ let executed = Arc::new(AtomicUsize::new(0));
+
+ Self {
+ tx,
+ executor_tx,
+ executed,
+ remote_storage,
+ consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
+ lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
+ }
+ }
+
+ pub fn get_executed(&self) -> usize {
+ self.executed.load(Ordering::Relaxed)
+ }
+
+ #[allow(clippy::await_holding_lock)]
+ pub async fn pump(&self) {
+ if let Some(remote_storage) = &self.remote_storage {
+ // Permit holding mutex across await, because this is only ever
+ // called once at a time in tests.
+ let mut locked = self.consumer.lock().unwrap();
+ let count = locked.consume(remote_storage).await;
+ self.executed.fetch_add(count, Ordering::Relaxed);
+ }
+ }
+
+ pub(crate) fn new_client(&self) -> DeletionQueueClient {
+ DeletionQueueClient {
+ tx: self.tx.clone(),
+ executor_tx: self.executor_tx.clone(),
+ lsn_table: self.lsn_table.clone(),
+ }
+ }
+ }
+}
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
new file mode 100644
index 0000000000..5c6e7dc9d7
--- /dev/null
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -0,0 +1,156 @@
+//! The deleter is the final stage in the deletion queue. It accumulates remote
+//! paths to delete, and periodically executes them in batches of up to 1000
+//! using the DeleteObjects request.
+//!
+//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
+//! number of full-sized DeleteObjects requests, rather than a larger number of
+//! smaller requests.
+
+use remote_storage::GenericRemoteStorage;
+use remote_storage::RemotePath;
+use remote_storage::MAX_KEYS_PER_DELETE;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use tracing::warn;
+
+use crate::metrics;
+
+use super::DeletionQueueError;
+use super::FlushOp;
+
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+pub(super) enum DeleterMessage {
+ Delete(Vec),
+ Flush(FlushOp),
+}
+
+/// Non-persistent deletion queue, for coalescing multiple object deletes into
+/// larger DeleteObjects requests.
+pub(super) struct Deleter {
+ // Accumulate up to 1000 keys for the next deletion operation
+ accumulator: Vec,
+
+ rx: tokio::sync::mpsc::Receiver,
+
+ cancel: CancellationToken,
+ remote_storage: GenericRemoteStorage,
+}
+
+impl Deleter {
+ pub(super) fn new(
+ remote_storage: GenericRemoteStorage,
+ rx: tokio::sync::mpsc::Receiver,
+ cancel: CancellationToken,
+ ) -> Self {
+ Self {
+ remote_storage,
+ rx,
+ cancel,
+ accumulator: Vec::new(),
+ }
+ }
+
+ /// Wrap the remote `delete_objects` with a failpoint
+ async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+ fail::fail_point!("deletion-queue-before-execute", |_| {
+ info!("Skipping execution, failpoint set");
+ metrics::DELETION_QUEUE
+ .remote_errors
+ .with_label_values(&["failpoint"])
+ .inc();
+ Err(anyhow::anyhow!("failpoint hit"))
+ });
+
+ self.remote_storage.delete_objects(&self.accumulator).await
+ }
+
+ /// Block until everything in accumulator has been executed
+ async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+ while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+ match self.remote_delete().await {
+ Ok(()) => {
+ // Note: we assume that the remote storage layer returns Ok(()) if some
+ // or all of the deleted objects were already gone.
+ metrics::DELETION_QUEUE
+ .keys_executed
+ .inc_by(self.accumulator.len() as u64);
+ info!(
+ "Executed deletion batch {}..{}",
+ self.accumulator
+ .first()
+ .expect("accumulator should be non-empty"),
+ self.accumulator
+ .last()
+ .expect("accumulator should be non-empty"),
+ );
+ self.accumulator.clear();
+ }
+ Err(e) => {
+ warn!("DeleteObjects request failed: {e:#}, will retry");
+ metrics::DELETION_QUEUE
+ .remote_errors
+ .with_label_values(&["execute"])
+ .inc();
+ }
+ };
+ }
+ if self.cancel.is_cancelled() {
+ // Expose an error because we may not have actually flushed everything
+ Err(DeletionQueueError::ShuttingDown)
+ } else {
+ Ok(())
+ }
+ }
+
+ pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
+ self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+
+ loop {
+ if self.cancel.is_cancelled() {
+ return Err(DeletionQueueError::ShuttingDown);
+ }
+
+ let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+ Ok(Some(m)) => m,
+ Ok(None) => {
+ // All queue senders closed
+ info!("Shutting down");
+ return Err(DeletionQueueError::ShuttingDown);
+ }
+ Err(_) => {
+ // Timeout, we hit deadline to execute whatever we have in hand. These functions will
+ // return immediately if no work is pending
+ self.flush().await?;
+
+ continue;
+ }
+ };
+
+ match msg {
+ DeleterMessage::Delete(mut list) => {
+ while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
+ if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+ self.flush().await?;
+ // If we have received this number of keys, proceed with attempting to execute
+ assert_eq!(self.accumulator.len(), 0);
+ }
+
+ let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+ let take_count = std::cmp::min(available_slots, list.len());
+ for path in list.drain(list.len() - take_count..) {
+ self.accumulator.push(path);
+ }
+ }
+ }
+ DeleterMessage::Flush(flush_op) => {
+ // If flush() errors, we drop the flush_op and the caller will get
+ // an error recv()'ing their oneshot channel.
+ self.flush().await?;
+ flush_op.notify();
+ }
+ }
+ }
+ }
+}
diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs
new file mode 100644
index 0000000000..618a59f8fe
--- /dev/null
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -0,0 +1,487 @@
+//! The list writer is the first stage in the deletion queue. It accumulates
+//! layers to delete, and periodically writes out these layers into a persistent
+//! DeletionList.
+//!
+//! The purpose of writing DeletionLists is to decouple the decision to
+//! delete an object from the validation required to execute it: even if
+//! validation is not possible, e.g. due to a control plane outage, we can
+//! still persist our intent to delete an object, in a way that would
+//! survive a restart.
+//!
+//! DeletionLists are passed onwards to the Validator.
+
+use super::DeletionHeader;
+use super::DeletionList;
+use super::FlushOp;
+use super::ValidatorQueueMessage;
+
+use std::collections::HashMap;
+use std::fs::create_dir_all;
+use std::time::Duration;
+
+use regex::Regex;
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+use utils::generation::Generation;
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use crate::config::PageServerConf;
+use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::storage_layer::LayerFileName;
+
+// The number of keys in a DeletionList before we will proactively persist it
+// (without reaching a flush deadline). This aims to deliver objects of the order
+// of magnitude 1MB when we are under heavy delete load.
+const DELETION_LIST_TARGET_SIZE: usize = 16384;
+
+// Ordinarily, we only flush to DeletionList periodically, to bound the window during
+// which we might leak objects from not flushing a DeletionList after
+// the objects are already unlinked from timeline metadata.
+const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
+
+// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
+// more objects before doing the flush.
+const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
+
+#[derive(Debug)]
+pub(super) struct DeletionOp {
+ pub(super) tenant_id: TenantId,
+ pub(super) timeline_id: TimelineId,
+ // `layers` and `objects` are both just lists of objects. `layers` is used if you do not
+ // have a config object handy to project it to a remote key, and need the consuming worker
+ // to do it for you.
+ pub(super) layers: Vec<(LayerFileName, Generation)>,
+ pub(super) objects: Vec,
+
+ /// The _current_ generation of the Tenant attachment in which we are enqueuing
+ /// this deletion.
+ pub(super) generation: Generation,
+}
+
+#[derive(Debug)]
+pub(super) struct RecoverOp {
+ pub(super) attached_tenants: HashMap,
+}
+
+#[derive(Debug)]
+pub(super) enum ListWriterQueueMessage {
+ Delete(DeletionOp),
+ // Wait until all prior deletions make it into a persistent DeletionList
+ Flush(FlushOp),
+ // Wait until all prior deletions have been executed (i.e. objects are actually deleted)
+ FlushExecute(FlushOp),
+ // Call once after re-attaching to control plane, to notify the deletion queue about
+ // latest attached generations & load any saved deletion lists from disk.
+ Recover(RecoverOp),
+}
+
+pub(super) struct ListWriter {
+ conf: &'static PageServerConf,
+
+ // Incoming frontend requests to delete some keys
+ rx: tokio::sync::mpsc::Receiver,
+
+ // Outbound requests to the backend to execute deletion lists we have composed.
+ tx: tokio::sync::mpsc::Sender,
+
+ // The list we are currently building, contains a buffer of keys to delete
+ // and our next sequence number
+ pending: DeletionList,
+
+ // These FlushOps should notify the next time we flush
+ pending_flushes: Vec,
+
+ // Worker loop is torn down when this fires.
+ cancel: CancellationToken,
+
+ // Safety guard to do recovery exactly once
+ recovered: bool,
+}
+
+impl ListWriter {
+ // Initially DeletionHeader.validated_sequence is zero. The place we start our
+ // sequence numbers must be higher than that.
+ const BASE_SEQUENCE: u64 = 1;
+
+ pub(super) fn new(
+ conf: &'static PageServerConf,
+ rx: tokio::sync::mpsc::Receiver,
+ tx: tokio::sync::mpsc::Sender,
+ cancel: CancellationToken,
+ ) -> Self {
+ Self {
+ pending: DeletionList::new(Self::BASE_SEQUENCE),
+ conf,
+ rx,
+ tx,
+ pending_flushes: Vec::new(),
+ cancel,
+ recovered: false,
+ }
+ }
+
+ /// Try to flush `list` to persistent storage
+ ///
+ /// This does not return errors, because on failure to flush we do not lose
+ /// any state: flushing will be retried implicitly on the next deadline
+ async fn flush(&mut self) {
+ if self.pending.is_empty() {
+ for f in self.pending_flushes.drain(..) {
+ f.notify();
+ }
+ return;
+ }
+
+ match self.pending.save(self.conf).await {
+ Ok(_) => {
+ info!(sequence = self.pending.sequence, "Stored deletion list");
+
+ for f in self.pending_flushes.drain(..) {
+ f.notify();
+ }
+
+ // Take the list we've accumulated, replace it with a fresh list for the next sequence
+ let next_list = DeletionList::new(self.pending.sequence + 1);
+ let list = std::mem::replace(&mut self.pending, next_list);
+
+ if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
+ // This is allowed to fail: it will only happen if the backend worker is shut down,
+ // so we can just drop this on the floor.
+ info!("Deletion list dropped, this is normal during shutdown ({e:#})");
+ }
+ }
+ Err(e) => {
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ warn!(
+ sequence = self.pending.sequence,
+ "Failed to write deletion list, will retry later ({e:#})"
+ );
+ }
+ }
+ }
+
+ /// Load the header, to learn the sequence number up to which deletions
+ /// have been validated. We will apply validated=true to DeletionLists
+ /// <= this sequence when loading them.
+ ///
+ /// It is not an error for the header to not exist: we return None, and
+ /// the caller should act as if validated_sequence is 0
+ async fn load_validated_sequence(&self) -> Result, anyhow::Error> {
+ let header_path = self.conf.deletion_header_path();
+ match tokio::fs::read(&header_path).await {
+ Ok(header_bytes) => {
+ match serde_json::from_slice::(&header_bytes) {
+ Ok(h) => Ok(Some(h.validated_sequence)),
+ Err(e) => {
+ warn!(
+ "Failed to deserialize deletion header, ignoring {}: {e:#}",
+ header_path.display()
+ );
+ // This should never happen unless we make a mistake with our serialization.
+ // Ignoring a deletion header is not consequential for correctnes because all deletions
+ // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ Ok(None)
+ }
+ }
+ }
+ Err(e) => {
+ if e.kind() == std::io::ErrorKind::NotFound {
+ debug!(
+ "Deletion header {} not found, first start?",
+ header_path.display()
+ );
+ Ok(None)
+ } else {
+ Err(anyhow::anyhow!(e))
+ }
+ }
+ }
+ }
+
+ async fn recover(
+ &mut self,
+ attached_tenants: HashMap,
+ ) -> Result<(), anyhow::Error> {
+ debug!(
+ "recovering with {} attached tenants",
+ attached_tenants.len()
+ );
+
+ // Load the header
+ let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
+
+ self.pending.sequence = validated_sequence + 1;
+
+ let deletion_directory = self.conf.deletion_prefix();
+ let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+ Ok(d) => d,
+ Err(e) => {
+ warn!(
+ "Failed to open deletion list directory {}: {e:#}",
+ deletion_directory.display(),
+ );
+
+ // Give up: if we can't read the deletion list directory, we probably can't
+ // write lists into it later, so the queue won't work.
+ return Err(e.into());
+ }
+ };
+
+ let list_name_pattern =
+ Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap();
+
+ let header_path = self.conf.deletion_header_path();
+ let mut seqs: Vec = Vec::new();
+ while let Some(dentry) = dir.next_entry().await? {
+ let file_name = dentry.file_name();
+ let dentry_str = file_name.to_string_lossy();
+
+ if Some(file_name.as_os_str()) == header_path.file_name() {
+ // Don't try and parse the header's name like a list
+ continue;
+ }
+
+ if dentry_str.ends_with(TEMP_SUFFIX) {
+ info!("Cleaning up temporary file {dentry_str}");
+ let absolute_path = deletion_directory.join(dentry.file_name());
+ if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+ // Non-fatal error: we will just leave the file behind but not
+ // try and load it.
+ warn!(
+ "Failed to clean up temporary file {}: {e:#}",
+ absolute_path.display()
+ );
+ }
+
+ continue;
+ }
+
+ let file_name = dentry.file_name().to_owned();
+ let basename = file_name.to_string_lossy();
+ let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
+ m.name("sequence")
+ .expect("Non optional group should be present")
+ .as_str()
+ } else {
+ warn!("Unexpected key in deletion queue: {basename}");
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ continue;
+ };
+
+ let seq: u64 = match u64::from_str_radix(seq_part, 16) {
+ Ok(s) => s,
+ Err(e) => {
+ warn!("Malformed key '{basename}': {e}");
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ continue;
+ }
+ };
+ seqs.push(seq);
+ }
+ seqs.sort();
+
+ // Start our next deletion list from after the last location validated by
+ // previous process lifetime, or after the last location found (it is updated
+ // below after enumerating the deletion lists)
+ self.pending.sequence = validated_sequence + 1;
+ if let Some(max_list_seq) = seqs.last() {
+ self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
+ }
+
+ for s in seqs {
+ let list_path = self.conf.deletion_list_path(s);
+
+ let list_bytes = tokio::fs::read(&list_path).await?;
+
+ let mut deletion_list = match serde_json::from_slice::(&list_bytes) {
+ Ok(l) => l,
+ Err(e) => {
+ // Drop the list on the floor: any objects it referenced will be left behind
+ // for scrubbing to clean up. This should never happen unless we have a serialization bug.
+ warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ continue;
+ }
+ };
+
+ if deletion_list.sequence <= validated_sequence {
+ // If the deletion list falls below valid_seq, we may assume that it was
+ // already validated the last time this pageserver ran. Otherwise, we still
+ // load it, as it may still contain content valid in this generation.
+ deletion_list.validated = true;
+ } else {
+ // Special case optimization: if a tenant is still attached, and no other
+ // generation was issued to another node in the interval while we restarted,
+ // then we may treat deletion lists from the previous generation as if they
+ // belong to our currently attached generation, and proceed to validate & execute.
+ for (tenant_id, tenant_list) in &mut deletion_list.tenants {
+ if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+ if attached_gen.previous() == tenant_list.generation {
+ tenant_list.generation = *attached_gen;
+ }
+ }
+ }
+ }
+
+ info!(
+ validated = deletion_list.validated,
+ sequence = deletion_list.sequence,
+ "Recovered deletion list"
+ );
+
+ // We will drop out of recovery if this fails: it indicates that we are shutting down
+ // or the backend has panicked
+ metrics::DELETION_QUEUE
+ .keys_submitted
+ .inc_by(deletion_list.len() as u64);
+ self.tx
+ .send(ValidatorQueueMessage::Delete(deletion_list))
+ .await?;
+ }
+
+ info!(next_sequence = self.pending.sequence, "Replay complete");
+
+ Ok(())
+ }
+
+ /// This is the front-end ingest, where we bundle up deletion requests into DeletionList
+ /// and write them out, for later validation by the backend and execution by the executor.
+ pub(super) async fn background(&mut self) {
+ info!("Started deletion frontend worker");
+
+ // Synchronous, but we only do it once per process lifetime so it's tolerable
+ if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+ tracing::error!(
+ "Failed to create deletion list directory {}, deletions will not be executed ({e})",
+ self.conf.deletion_prefix().display()
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ return;
+ }
+
+ while !self.cancel.is_cancelled() {
+ let timeout = if self.pending_flushes.is_empty() {
+ FRONTEND_DEFAULT_TIMEOUT
+ } else {
+ FRONTEND_FLUSHING_TIMEOUT
+ };
+
+ let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
+ Ok(Some(msg)) => msg,
+ Ok(None) => {
+ // Queue sender destroyed, shutting down
+ break;
+ }
+ Err(_) => {
+ // Hit deadline, flush.
+ self.flush().await;
+ continue;
+ }
+ };
+
+ match msg {
+ ListWriterQueueMessage::Delete(op) => {
+ assert!(
+ self.recovered,
+ "Cannot process deletions before recovery. This is a bug."
+ );
+
+ debug!(
+ "Delete: ingesting {} layers, {} other objects",
+ op.layers.len(),
+ op.objects.len()
+ );
+
+ let mut layer_paths = Vec::new();
+ for (layer, generation) in op.layers {
+ layer_paths.push(remote_layer_path(
+ &op.tenant_id,
+ &op.timeline_id,
+ &layer,
+ generation,
+ ));
+ }
+ layer_paths.extend(op.objects);
+
+ if !self.pending.push(
+ &op.tenant_id,
+ &op.timeline_id,
+ op.generation,
+ &mut layer_paths,
+ ) {
+ self.flush().await;
+ let retry_succeeded = self.pending.push(
+ &op.tenant_id,
+ &op.timeline_id,
+ op.generation,
+ &mut layer_paths,
+ );
+ if !retry_succeeded {
+ // Unexpected: after we flush, we should have
+ // drained self.pending, so a conflict on
+ // generation numbers should be impossible.
+ tracing::error!(
+ "Failed to enqueue deletions, leaking objects. This is a bug."
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ }
+ }
+ }
+ ListWriterQueueMessage::Flush(op) => {
+ if self.pending.is_empty() {
+ // Execute immediately
+ debug!("Flush: No pending objects, flushing immediately");
+ op.notify()
+ } else {
+ // Execute next time we flush
+ debug!("Flush: adding to pending flush list for next deadline flush");
+ self.pending_flushes.push(op);
+ }
+ }
+ ListWriterQueueMessage::FlushExecute(op) => {
+ debug!("FlushExecute: passing through to backend");
+ // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+ if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+ info!("Can't flush, shutting down ({e})");
+ // Caller will get error when their oneshot sender was dropped.
+ }
+ }
+ ListWriterQueueMessage::Recover(op) => {
+ if self.recovered {
+ tracing::error!(
+ "Deletion queue recovery called more than once. This is a bug."
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+ continue;
+ }
+
+ if let Err(e) = self.recover(op.attached_tenants).await {
+ // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+ // queue receiver has been dropped, or something is critically broken with
+ // the local filesystem holding deletion lists.
+ info!(
+ "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ return;
+ } else {
+ self.recovered = true;
+ }
+ }
+ }
+
+ if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+ self.flush().await;
+ }
+ }
+ info!("Deletion queue shut down.");
+ }
+}
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
new file mode 100644
index 0000000000..64603045d2
--- /dev/null
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whethe the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time, do any validation work that is pending,
+// even if we haven't accumulated many keys to delete.
+//
+// This also causes updates to remote_consistent_lsn to be validated, even
+// if there were no deletions enqueued.
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+// If we have received this number of keys, proceed with attempting to execute
+const AUTOFLUSH_KEY_COUNT: usize = 16384;
+
+#[derive(Debug)]
+pub(super) enum ValidatorQueueMessage {
+ Delete(DeletionList),
+ Flush(FlushOp),
+}
+pub(super) struct Validator
+where
+ C: ControlPlaneGenerationsApi,
+{
+ conf: &'static PageServerConf,
+ rx: tokio::sync::mpsc::Receiver,
+ tx: tokio::sync::mpsc::Sender,
+
+ // Client for calling into control plane API for validation of deletes
+ control_plane_client: Option,
+
+ // DeletionLists which are waiting generation validation. Not safe to
+ // execute until [`validate`] has processed them.
+ pending_lists: Vec,
+
+ // DeletionLists which have passed validation and are ready to execute.
+ validated_lists: Vec,
+
+ // Sum of all the lengths of lists in pending_lists
+ pending_key_count: usize,
+
+ // Lsn validation state: we read projected LSNs and write back visible LSNs
+ // after validation. This is the LSN equivalent of `pending_validation_lists`:
+ // it is drained in [`validate`]
+ lsn_table: Arc>,
+
+ // If we failed to rewrite a deletion list due to local filesystem I/O failure,
+ // we must remember that and refuse to advance our persistent validated sequence
+ // number past the failure.
+ list_write_failed: Option,
+
+ cancel: CancellationToken,
+}
+
+impl Validator
+where
+ C: ControlPlaneGenerationsApi,
+{
+ pub(super) fn new(
+ conf: &'static PageServerConf,
+ rx: tokio::sync::mpsc::Receiver,
+ tx: tokio::sync::mpsc::Sender,
+ control_plane_client: Option,
+ lsn_table: Arc>,
+ cancel: CancellationToken,
+ ) -> Self {
+ Self {
+ conf,
+ rx,
+ tx,
+ control_plane_client,
+ lsn_table,
+ pending_lists: Vec::new(),
+ validated_lists: Vec::new(),
+ pending_key_count: 0,
+ list_write_failed: None,
+ cancel,
+ }
+ }
+ /// Process any outstanding validations of generations of pending LSN updates or pending
+ /// DeletionLists.
+ ///
+ /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
+ /// go into the queue of ready-to-execute lists.
+ async fn validate(&mut self) -> Result<(), DeletionQueueError> {
+ let mut tenant_generations = HashMap::new();
+ for list in &self.pending_lists {
+ for (tenant_id, tenant_list) in &list.tenants {
+ // Note: DeletionLists are in logical time order, so generation always
+ // goes up. By doing a simple insert() we will always end up with
+ // the latest generation seen for a tenant.
+ tenant_generations.insert(*tenant_id, tenant_list.generation);
+ }
+ }
+
+ let pending_lsn_updates = {
+ let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
+ std::mem::take(&mut *lsn_table)
+ };
+ for (tenant_id, update) in &pending_lsn_updates.tenants {
+ let entry = tenant_generations
+ .entry(*tenant_id)
+ .or_insert(update.generation);
+ if update.generation > *entry {
+ *entry = update.generation;
+ }
+ }
+
+ if tenant_generations.is_empty() {
+ // No work to do
+ return Ok(());
+ }
+
+ let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
+ match control_plane_client
+ .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
+ .await
+ {
+ Ok(tenants) => tenants,
+ Err(RetryForeverError::ShuttingDown) => {
+ // The only way a validation call returns an error is when the cancellation token fires
+ return Err(DeletionQueueError::ShuttingDown);
+ }
+ }
+ } else {
+ // Control plane API disabled. In legacy mode we consider everything valid.
+ tenant_generations.keys().map(|k| (*k, true)).collect()
+ };
+
+ let mut validated_sequence: Option = None;
+
+ // Apply the validation results to the pending LSN updates
+ for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
+ let validated_generation = tenant_generations
+ .get(&tenant_id)
+ .expect("Map was built from the same keys we're reading");
+
+ let valid = tenants_valid
+ .get(&tenant_id)
+ .copied()
+ // If the tenant was missing from the validation response, it has been deleted.
+ // The Timeline that requested the LSN update is probably already torn down,
+ // or will be torn down soon. In this case, drop the update by setting valid=false.
+ .unwrap_or(false);
+
+ if valid && *validated_generation == tenant_lsn_state.generation {
+ for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+ pending_lsn.result_slot.store(pending_lsn.projected);
+ }
+ } else {
+ // If we failed validation, then do not apply any of the projected updates
+ warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+ metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
+ }
+ }
+
+ // Apply the validation results to the pending deletion lists
+ for list in &mut self.pending_lists {
+ // Filter the list based on whether the server responded valid: true.
+ // If a tenant is omitted in the response, it has been deleted, and we should
+ // proceed with deletion.
+ let mut mutated = false;
+ list.tenants.retain(|tenant_id, tenant| {
+ let validated_generation = tenant_generations
+ .get(tenant_id)
+ .expect("Map was built from the same keys we're reading");
+
+ // If the tenant was missing from the validation response, it has been deleted.
+ // This means that a deletion is valid, but also redundant since the tenant's
+ // objects should have already been deleted. Treat it as invalid to drop the
+ // redundant deletion.
+ let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
+
+ // A list is valid if it comes from the current _or previous_ generation.
+ // - The previous generation case is permitted due to how we store deletion lists locally:
+ // if we see the immediately previous generation in a locally stored deletion list,
+ // it proves that this node's disk was used for both current & previous generations,
+ // and therefore no other node was involved in between: the two generations may be
+ // logically treated as the same.
+ // - In that previous generation case, we rewrote it to the current generation
+ // in recover(), so the comparison here is simply an equality.
+
+ let this_list_valid = valid
+ && (tenant.generation == *validated_generation);
+
+ if !this_list_valid {
+ warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+ metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
+ mutated = true;
+ }
+ this_list_valid
+ });
+ list.validated = true;
+
+ if mutated {
+ // Save the deletion list if we had to make changes due to stale generations. The
+ // saved list is valid for execution.
+ if let Err(e) = list.save(self.conf).await {
+ // Highly unexpected. Could happen if e.g. disk full.
+ // If we didn't save the trimmed list, it is _not_ valid to execute.
+ warn!("Failed to save modified deletion list {list}: {e:#}");
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+
+ // Rather than have a complex retry process, just drop it and leak the objects,
+ // scrubber will clean up eventually.
+ list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
+
+ // We must remember this failure, to prevent later writing out a header that
+ // would imply the unwritable list was valid on disk.
+ if self.list_write_failed.is_none() {
+ self.list_write_failed = Some(list.sequence);
+ }
+ }
+ }
+
+ validated_sequence = Some(list.sequence);
+ }
+
+ if let Some(validated_sequence) = validated_sequence {
+ if let Some(list_write_failed) = self.list_write_failed {
+ // Rare error case: we failed to write out a deletion list to excise invalid
+ // entries, so we cannot advance the header's valid sequence number past that point.
+ //
+ // In this state we will continue to validate, execute and delete deletion lists,
+ // we just cannot update the header. It should be noticed and fixed by a human due to
+ // the nonzero value of our unexpected_errors metric.
+ warn!(
+ sequence_number = list_write_failed,
+ "Cannot write header because writing a deletion list failed earlier",
+ );
+ } else {
+ // Write the queue header to record how far validation progressed. This avoids having
+ // to rewrite each DeletionList to set validated=true in it.
+ let header = DeletionHeader::new(validated_sequence);
+
+ // Drop result because the validated_sequence is an optimization. If we fail to save it,
+ // then restart, we will drop some deletion lists, creating work for scrubber.
+ // The save() function logs a warning on error.
+ if let Err(e) = header.save(self.conf).await {
+ warn!("Failed to write deletion queue header: {e:#}");
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ }
+ }
+ }
+
+ // Transfer the validated lists to the validated queue, for eventual execution
+ self.validated_lists.append(&mut self.pending_lists);
+
+ Ok(())
+ }
+
+ async fn cleanup_lists(&mut self, list_paths: Vec) {
+ for list_path in list_paths {
+ debug!("Removing deletion list {}", list_path.display());
+
+ if let Err(e) = tokio::fs::remove_file(&list_path).await {
+ // Unexpected: we should have permissions and nothing else should
+ // be touching these files. We will leave the file behind. Subsequent
+ // pageservers will try and load it again: hopefully whatever storage
+ // issue (probably permissions) has been fixed by then.
+ tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ break;
+ }
+ }
+ }
+
+ async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+ tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
+
+ // Issue any required generation validation calls to the control plane
+ self.validate().await?;
+
+ // After successful validation, nothing is pending: any lists that
+ // made it through validation will be in validated_lists.
+ assert!(self.pending_lists.is_empty());
+ self.pending_key_count = 0;
+
+ tracing::debug!(
+ "Validation complete, have {} validated lists",
+ self.validated_lists.len()
+ );
+
+ // Return quickly if we have no validated lists to execute. This avoids flushing the
+ // executor when an idle backend hits its autoflush interval
+ if self.validated_lists.is_empty() {
+ return Ok(());
+ }
+
+ // Drain `validated_lists` into the executor
+ let mut executing_lists = Vec::new();
+ for list in self.validated_lists.drain(..) {
+ let list_path = self.conf.deletion_list_path(list.sequence);
+ let objects = list.into_remote_paths();
+ self.tx
+ .send(DeleterMessage::Delete(objects))
+ .await
+ .map_err(|_| DeletionQueueError::ShuttingDown)?;
+ executing_lists.push(list_path);
+ }
+
+ self.flush_executor().await?;
+
+ // Erase the deletion lists whose keys have all be deleted from remote storage
+ self.cleanup_lists(executing_lists).await;
+
+ Ok(())
+ }
+
+ async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
+ // Flush the executor, so that all the keys referenced by these deletion lists
+ // are actually removed from remote storage. This is a precondition to deleting
+ // the deletion lists themselves.
+ let (flush_op, rx) = FlushOp::new();
+ self.tx
+ .send(DeleterMessage::Flush(flush_op))
+ .await
+ .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+ rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+ }
+
+ pub(super) async fn background(&mut self) {
+ tracing::info!("Started deletion backend worker");
+
+ while !self.cancel.is_cancelled() {
+ let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+ Ok(Some(m)) => m,
+ Ok(None) => {
+ // All queue senders closed
+ info!("Shutting down");
+ break;
+ }
+ Err(_) => {
+ // Timeout, we hit deadline to execute whatever we have in hand. These functions will
+ // return immediately if no work is pending.
+ match self.flush().await {
+ Ok(()) => {}
+ Err(DeletionQueueError::ShuttingDown) => {
+ // If we are shutting down, then auto-flush can safely be skipped
+ }
+ }
+
+ continue;
+ }
+ };
+
+ match msg {
+ ValidatorQueueMessage::Delete(list) => {
+ if list.validated {
+ // A pre-validated list may only be seen during recovery, if we are recovering
+ // a DeletionList whose on-disk state has validated=true
+ self.validated_lists.push(list)
+ } else {
+ self.pending_key_count += list.len();
+ self.pending_lists.push(list);
+ }
+
+ if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
+ match self.flush().await {
+ Ok(()) => {}
+ Err(DeletionQueueError::ShuttingDown) => {
+ // If we are shutting down, then auto-flush can safely be skipped
+ }
+ }
+ }
+ }
+ ValidatorQueueMessage::Flush(op) => {
+ match self.flush().await {
+ Ok(()) => {
+ op.notify();
+ }
+ Err(DeletionQueueError::ShuttingDown) => {
+ // If we fail due to shutting down, we will just drop `op` to propagate that status.
+ }
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 4988641d6a..f5c1224f01 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1093,6 +1093,9 @@ components:
remote_consistent_lsn:
type: string
format: hex
+ remote_consistent_lsn_visible:
+ type: string
+ format: hex
ancestor_timeline_id:
type: string
format: hex
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a8e914ba08..e61a9dcf3f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,6 +5,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
+use futures::TryFutureExt;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
@@ -24,6 +25,7 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -34,7 +36,7 @@ use crate::tenant::mgr::{
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
@@ -61,6 +63,7 @@ pub struct State {
remote_storage: Option,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc,
+ deletion_queue_client: DeletionQueueClient,
}
impl State {
@@ -70,6 +73,7 @@ impl State {
remote_storage: Option,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc,
+ deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
@@ -82,8 +86,17 @@ impl State {
remote_storage,
broker_client,
disk_usage_eviction_state,
+ deletion_queue_client,
})
}
+
+ fn tenant_resources(&self) -> TenantSharedResources {
+ TenantSharedResources {
+ broker_client: self.broker_client.clone(),
+ remote_storage: self.remote_storage.clone(),
+ deletion_queue_client: self.deletion_queue_client.clone(),
+ }
+ }
}
#[inline(always)]
@@ -283,7 +296,12 @@ async fn build_timeline_info_common(
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
- let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+ let remote_consistent_lsn_projected = timeline
+ .get_remote_consistent_lsn_projected()
+ .unwrap_or(Lsn(0));
+ let remote_consistent_lsn_visible = timeline
+ .get_remote_consistent_lsn_visible()
+ .unwrap_or(Lsn(0));
let walreceiver_status = timeline.walreceiver_status();
@@ -293,7 +311,8 @@ async fn build_timeline_info_common(
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
- remote_consistent_lsn,
+ remote_consistent_lsn: remote_consistent_lsn_projected,
+ remote_consistent_lsn_visible,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -492,24 +511,23 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
- if let Some(remote_storage) = &state.remote_storage {
- mgr::attach_tenant(
- state.conf,
- tenant_id,
- generation,
- tenant_conf,
- state.broker_client.clone(),
- remote_storage.clone(),
- &ctx,
- )
- .instrument(info_span!("tenant_attach", %tenant_id))
- .await?;
- } else {
+ if state.remote_storage.is_none() {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
+ mgr::attach_tenant(
+ state.conf,
+ tenant_id,
+ generation,
+ tenant_conf,
+ state.tenant_resources(),
+ &ctx,
+ )
+ .instrument(info_span!("tenant_attach", %tenant_id))
+ .await?;
+
json_response(StatusCode::ACCEPTED, ())
}
@@ -570,6 +588,7 @@ async fn tenant_load_handler(
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
+ state.deletion_queue_client.clone(),
&ctx,
)
.instrument(info_span!("load", %tenant_id))
@@ -911,8 +930,7 @@ async fn tenant_create_handler(
tenant_conf,
target_tenant_id,
generation,
- state.broker_client.clone(),
- state.remote_storage.clone(),
+ state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get(
json_response(StatusCode::OK, info)
}
+async fn deletion_queue_flush(
+ r: Request,
+ cancel: CancellationToken,
+) -> Result, ApiError> {
+ let state = get_state(&r);
+
+ if state.remote_storage.is_none() {
+ // Nothing to do if remote storage is disabled.
+ return json_response(StatusCode::OK, ());
+ }
+
+ let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
+
+ let flush = async {
+ if execute {
+ state.deletion_queue_client.flush_execute().await
+ } else {
+ state.deletion_queue_client.flush().await
+ }
+ }
+ // DeletionQueueError's only case is shutting down.
+ .map_err(|_| ApiError::ShuttingDown);
+
+ tokio::select! {
+ res = flush => {
+ res.map(|()| json_response(StatusCode::OK, ()))?
+ }
+ _ = cancel.cancelled() => {
+ Err(ApiError::ShuttingDown)
+ }
+ }
+}
+
async fn active_timeline_of_active_tenant(
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -1463,6 +1514,9 @@ pub fn make_router(
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
+ .put("/v1/deletion_queue/flush", |r| {
+ api_handler(r, deletion_queue_flush)
+ })
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 3049ad6a4e..e370e063ba 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,7 +3,8 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
-mod control_plane_client;
+pub mod control_plane_client;
+pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
@@ -27,6 +28,7 @@ pub mod failpoint_support;
use std::path::Path;
use crate::task_mgr::TaskKind;
+use deletion_queue::DeletionQueue;
use tracing::info;
/// Current storage format version
@@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
-#[tracing::instrument]
-pub async fn shutdown_pageserver(exit_code: i32) {
+#[tracing::instrument(skip_all, fields(%exit_code))]
+pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) {
)
.await;
+ // Best effort to persist any outstanding deletions, to avoid leaking objects
+ if let Some(mut deletion_queue) = deletion_queue {
+ deletion_queue.shutdown(Duration::from_secs(5)).await;
+ }
+
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 98dee095a3..f85f525630 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -264,6 +264,46 @@ pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheS
},
});
+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::new(|| {
+ register_histogram!(
+ "pageserver_page_cache_acquire_pinned_slot_seconds",
+ "Time spent acquiring a pinned slot in the page cache",
+ CRITICAL_OP_BUCKETS.into(),
+ )
+ .expect("failed to define a metric")
+});
+
+pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy = Lazy::new(|| {
+ register_int_counter!(
+ "pageserver_page_cache_find_victim_iters_total",
+ "Counter for the number of iterations in the find_victim loop",
+ )
+ .expect("failed to define a metric")
+});
+
+static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| {
+ register_int_counter_vec!(
+ "page_cache_errors_total",
+ "Number of timeouts while acquiring a pinned slot in the page cache",
+ &["error_kind"]
+ )
+ .expect("failed to define a metric")
+});
+
+#[derive(IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum PageCacheErrorKind {
+ AcquirePinnedSlotTimeout,
+ EvictIterLimit,
+}
+
+pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
+ PAGE_CACHE_ERRORS
+ .get_metric_with_label_values(&[error_kind.into()])
+ .unwrap()
+ .inc();
+}
+
pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| {
register_histogram!(
"pageserver_wait_lsn_seconds",
@@ -291,6 +331,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| {
.expect("failed to define a metric")
});
+pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| {
+ register_uint_gauge!(
+ "pageserver_resident_physical_size_global",
+ "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
+ )
+ .expect("failed to define a metric")
+});
+
static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
@@ -301,6 +349,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| {
.expect("failed to define a metric")
});
+static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| {
+ register_uint_gauge!(
+ "pageserver_remote_physical_size_global",
+ "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
+ )
+ .expect("failed to define a metric")
+});
+
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
@@ -887,6 +943,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy
.expect("failed to define a metric")
});
+pub(crate) struct DeletionQueueMetrics {
+ pub(crate) keys_submitted: IntCounter,
+ pub(crate) keys_dropped: IntCounter,
+ pub(crate) keys_executed: IntCounter,
+ pub(crate) dropped_lsn_updates: IntCounter,
+ pub(crate) unexpected_errors: IntCounter,
+ pub(crate) remote_errors: IntCounterVec,
+}
+pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| {
+ DeletionQueueMetrics{
+
+ keys_submitted: register_int_counter!(
+ "pageserver_deletion_queue_submitted_total",
+ "Number of objects submitted for deletion"
+ )
+ .expect("failed to define a metric"),
+
+ keys_dropped: register_int_counter!(
+ "pageserver_deletion_queue_dropped_total",
+ "Number of object deletions dropped due to stale generation."
+ )
+ .expect("failed to define a metric"),
+
+ keys_executed: register_int_counter!(
+ "pageserver_deletion_queue_executed_total",
+ "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+ )
+ .expect("failed to define a metric"),
+
+ dropped_lsn_updates: register_int_counter!(
+ "pageserver_deletion_queue_dropped_lsn_updates_total",
+ "Updates to remote_consistent_lsn dropped due to stale generation number."
+ )
+ .expect("failed to define a metric"),
+ unexpected_errors: register_int_counter!(
+ "pageserver_deletion_queue_unexpected_errors_total",
+ "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
+ )
+ .expect("failed to define a metric"),
+ remote_errors: register_int_counter_vec!(
+ "pageserver_deletion_queue_remote_errors_total",
+ "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
+ &["op_kind"],
+ )
+ .expect("failed to define a metric")
+}
+});
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1161,7 +1265,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
- pub resident_physical_size_gauge: UIntGauge,
+ resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -1239,10 +1343,29 @@ impl TimelineMetrics {
}
pub fn record_new_file_metrics(&self, sz: u64) {
- self.resident_physical_size_gauge.add(sz);
+ self.resident_physical_size_add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
+
+ pub fn resident_physical_size_sub(&self, sz: u64) {
+ self.resident_physical_size_gauge.sub(sz);
+ crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
+ }
+
+ pub fn resident_physical_size_add(&self, sz: u64) {
+ self.resident_physical_size_gauge.add(sz);
+ crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
+ }
+
+ pub fn resident_physical_size_set(&self, sz: u64) {
+ self.resident_physical_size_gauge.set(sz);
+ crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+ }
+
+ pub fn resident_physical_size_get(&self) -> u64 {
+ self.resident_physical_size_gauge.get()
+ }
}
impl Drop for TimelineMetrics {
@@ -1250,7 +1373,10 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
- let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+ {
+ RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
+ let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+ }
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1304,10 +1430,43 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
+/// Maintain a per timeline gauge in addition to the global gauge.
+struct PerTimelineRemotePhysicalSizeGauge {
+ last_set: u64,
+ gauge: UIntGauge,
+}
+
+impl PerTimelineRemotePhysicalSizeGauge {
+ fn new(per_timeline_gauge: UIntGauge) -> Self {
+ Self {
+ last_set: per_timeline_gauge.get(),
+ gauge: per_timeline_gauge,
+ }
+ }
+ fn set(&mut self, sz: u64) {
+ self.gauge.set(sz);
+ if sz < self.last_set {
+ REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+ } else {
+ REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+ };
+ self.last_set = sz;
+ }
+ fn get(&self) -> u64 {
+ self.gauge.get()
+ }
+}
+
+impl Drop for PerTimelineRemotePhysicalSizeGauge {
+ fn drop(&mut self) {
+ REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+ }
+}
+
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
- remote_physical_size_gauge: Mutex>,
+ remote_physical_size_gauge: Mutex >,
calls_unfinished_gauge: Mutex>,
bytes_started_counter: Mutex>,
bytes_finished_counter: Mutex>,
@@ -1325,18 +1484,24 @@ impl RemoteTimelineClientMetrics {
}
}
- pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+ pub(crate) fn remote_physical_size_set(&self, sz: u64) {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
- guard
- .get_or_insert_with(|| {
+ let gauge = guard.get_or_insert_with(|| {
+ PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
- .unwrap()
- })
- .clone()
+ .unwrap(),
+ )
+ });
+ gauge.set(sz);
+ }
+
+ pub(crate) fn remote_physical_size_get(&self) -> u64 {
+ let guard = self.remote_physical_size_gauge.lock().unwrap();
+ guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
}
pub fn remote_operation_time(
@@ -1675,6 +1840,9 @@ pub fn preinitialize_metrics() {
Lazy::force(c);
});
+ // Deletion queue stats
+ Lazy::force(&DELETION_QUEUE);
+
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 38b169ea85..97ca2bfea7 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,7 +75,11 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
- sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+ sync::{
+ atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+ Arc, Weak,
+ },
+ time::Duration,
};
use anyhow::Context;
@@ -165,6 +169,8 @@ struct Slot {
struct SlotInner {
key: Option,
+ // for `coalesce_readers_permit`
+ permit: std::sync::Mutex>,
buf: &'static mut [u8; PAGE_SZ],
}
@@ -207,6 +213,22 @@ impl Slot {
}
}
+impl SlotInner {
+ /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
+ fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc {
+ let mut guard = self.permit.lock().unwrap();
+ if let Some(existing_permit) = guard.upgrade() {
+ drop(guard);
+ drop(permit);
+ existing_permit
+ } else {
+ let permit = Arc::new(permit);
+ *guard = Arc::downgrade(&permit);
+ permit
+ }
+ }
+}
+
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
@@ -224,6 +246,8 @@ pub struct PageCache {
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
+ pinned_slots: Arc,
+
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
@@ -231,23 +255,28 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
+struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i> {
+ _permit: Arc,
+ slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
+}
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
- self.0.buf
+ self.slot_guard.buf
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
- self.0.buf
+ self.slot_guard.buf
}
}
@@ -264,6 +293,8 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
pub struct PageWriteGuard<'i> {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+ _permit: PinnedSlotsPermit,
+
// Are the page contents currently valid?
// Used to mark pages as invalid that are assigned but not yet filled with data.
valid: bool,
@@ -348,6 +379,10 @@ impl PageCache {
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
+ let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+ return None;
+ };
+
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
@@ -362,7 +397,10 @@ impl PageCache {
lsn,
};
- if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+ if let Some(guard) = self
+ .try_lock_for_read(&mut cache_key, &mut Some(permit))
+ .await
+ {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
@@ -445,6 +483,29 @@ impl PageCache {
// "mappings" after this section. But the routines in this section should
// not require changes.
+ async fn try_get_pinned_slot_permit(&self) -> anyhow::Result {
+ let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
+ match tokio::time::timeout(
+ // Choose small timeout, neon_smgr does its own retries.
+ // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
+ Duration::from_secs(10),
+ Arc::clone(&self.pinned_slots).acquire_owned(),
+ )
+ .await
+ {
+ Ok(res) => Ok(PinnedSlotsPermit(
+ res.expect("this semaphore is never closed"),
+ )),
+ Err(_timeout) => {
+ timer.stop_and_discard();
+ crate::metrics::page_cache_errors_inc(
+ crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
+ );
+ anyhow::bail!("timeout: there were page guards alive for all page cache slots")
+ }
+ }
+ }
+
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the key
@@ -454,7 +515,11 @@ impl PageCache {
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
- async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option {
+ async fn try_lock_for_read(
+ &self,
+ cache_key: &mut CacheKey,
+ permit: &mut Option,
+ ) -> Option {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
@@ -464,7 +529,10 @@ impl PageCache {
let inner = slot.inner.read().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
- return Some(PageReadGuard(inner));
+ return Some(PageReadGuard {
+ _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
+ slot_guard: inner,
+ });
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
@@ -507,6 +575,8 @@ impl PageCache {
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result {
+ let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
@@ -523,17 +593,21 @@ impl PageCache {
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
- if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+ if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
+ debug_assert!(permit.is_none());
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
+ debug_assert!(permit.is_some());
is_first_iteration = false;
// Not found. Find a victim buffer
- let (slot_idx, mut inner) =
- self.find_victim().context("Failed to find evict victim")?;
+ let (slot_idx, mut inner) = self
+ .find_victim(permit.as_ref().unwrap())
+ .await
+ .context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -555,7 +629,16 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
+ debug_assert!(
+ {
+ let guard = inner.permit.lock().unwrap();
+ guard.upgrade().is_none()
+ },
+ "we hold a write lock, so, no one else should have a permit"
+ );
+
return Ok(ReadBufResult::NotFound(PageWriteGuard {
+ _permit: permit.take().unwrap(),
inner,
valid: false,
}));
@@ -566,7 +649,11 @@ impl PageCache {
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
- async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option {
+ async fn try_lock_for_write(
+ &self,
+ cache_key: &CacheKey,
+ permit: &mut Option,
+ ) -> Option {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
@@ -575,7 +662,18 @@ impl PageCache {
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
- return Some(PageWriteGuard { inner, valid: true });
+ debug_assert!(
+ {
+ let guard = inner.permit.lock().unwrap();
+ guard.upgrade().is_none()
+ },
+ "we hold a write lock, so, no one else should have a permit"
+ );
+ return Some(PageWriteGuard {
+ _permit: permit.take().unwrap(),
+ inner,
+ valid: true,
+ });
}
}
None
@@ -586,15 +684,20 @@ impl PageCache {
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result {
+ let mut permit = Some(self.try_get_pinned_slot_permit().await?);
loop {
// First check if the key already exists in the cache.
- if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
+ if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
+ debug_assert!(permit.is_none());
return Ok(WriteBufResult::Found(write_guard));
}
+ debug_assert!(permit.is_some());
// Not found. Find a victim buffer
- let (slot_idx, mut inner) =
- self.find_victim().context("Failed to find evict victim")?;
+ let (slot_idx, mut inner) = self
+ .find_victim(permit.as_ref().unwrap())
+ .await
+ .context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -616,7 +719,16 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
+ debug_assert!(
+ {
+ let guard = inner.permit.lock().unwrap();
+ guard.upgrade().is_none()
+ },
+ "we hold a write lock, so, no one else should have a permit"
+ );
+
return Ok(WriteBufResult::NotFound(PageWriteGuard {
+ _permit: permit.take().unwrap(),
inner,
valid: false,
}));
@@ -769,7 +881,10 @@ impl PageCache {
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
- fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard)> {
+ async fn find_victim(
+ &self,
+ _permit_witness: &PinnedSlotsPermit,
+ ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
@@ -782,13 +897,40 @@ impl PageCache {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(_err) => {
- // If we have looped through the whole buffer pool 10 times
- // and still haven't found a victim buffer, something's wrong.
- // Maybe all the buffers were in locked. That could happen in
- // theory, if you have more threads holding buffers locked than
- // there are buffers in the pool. In practice, with a reasonably
- // large buffer pool it really shouldn't happen.
if iters > iter_limit {
+ // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+ // any particular number of iterations: other threads might race ahead and acquire and
+ // release pins just as we're scanning the array.
+ //
+ // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+ // slots. There are two threads running concurrently, A and B. A has just
+ // acquired the permit from the semaphore.
+ //
+ // A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+ // B: Acquire permit.
+ // B: Look at slot 2, decrement its usage_count to zero and continue the search
+ // B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+ // B: Release pin and permit again
+ // B: Acquire permit.
+ // B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+ // B: Release pin and permit again
+ //
+ // Now we're back in the starting situation that both slots have
+ // usage_count 1, but A has now been through one iteration of the
+ // find_victim() loop. This can repeat indefinitely and on each
+ // iteration, A's iteration count increases by one.
+ //
+ // So, even though the semaphore for the permits is fair, the victim search
+ // itself happens in parallel and is not fair.
+ // Hence even with a permit, a task can theoretically be starved.
+ // To avoid this, we'd need tokio to give priority to tasks that are holding
+ // permits for longer.
+ // Note that just yielding to tokio during iteration without such
+ // priority boosting is likely counter-productive. We'd just give more opportunities
+ // for B to bump usage count, further starving A.
+ crate::metrics::page_cache_errors_inc(
+ crate::metrics::PageCacheErrorKind::EvictIterLimit,
+ );
anyhow::bail!("exceeded evict iter limit");
}
continue;
@@ -799,6 +941,7 @@ impl PageCache {
self.remove_mapping(old_key);
inner.key = None;
}
+ crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}
@@ -826,7 +969,11 @@ impl PageCache {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
- inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+ inner: tokio::sync::RwLock::new(SlotInner {
+ key: None,
+ buf,
+ permit: std::sync::Mutex::new(Weak::new()),
+ }),
usage_count: AtomicU8::new(0),
}
})
@@ -838,6 +985,7 @@ impl PageCache {
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
+ pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
}
}
}
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 047fa761c3..7a94c3449d 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,7 @@ impl Key {
| self.field6 as i128
}
- pub fn from_i128(x: i128) -> Self {
+ pub const fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 650bc119b6..017322ffb2 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -456,7 +456,7 @@ async fn task_finish(
}
if shutdown_process {
- shutdown_pageserver(1).await;
+ shutdown_pageserver(None, 1).await;
}
}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1c92c618fa..47bfd4a8ef 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
@@ -117,7 +118,7 @@ mod span;
pub mod metadata;
mod par_fsync;
-mod remote_timeline_client;
+pub mod remote_timeline_client;
pub mod storage_layer;
pub mod config;
@@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option,
+ pub deletion_queue_client: DeletionQueueClient,
}
///
@@ -197,6 +199,9 @@ pub struct Tenant {
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option,
+ // Access to global deletion queue for when this tenant wants to schedule a deletion
+ deletion_queue_client: DeletionQueueClient,
+
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex>,
cached_synthetic_tenant_size: Arc,
@@ -523,15 +528,20 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
- broker_client: storage_broker::BrokerClientChannel,
+ resources: TenantSharedResources,
tenants: &'static tokio::sync::RwLock,
- remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> anyhow::Result> {
// TODO dedup with spawn_load
let tenant_conf =
Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
+ let TenantSharedResources {
+ broker_client,
+ remote_storage,
+ deletion_queue_client,
+ } = resources;
+
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
@@ -540,7 +550,8 @@ impl Tenant {
wal_redo_manager,
tenant_id,
generation,
- Some(remote_storage.clone()),
+ remote_storage.clone(),
+ deletion_queue_client,
));
// Do all the hard work in the background
@@ -571,7 +582,7 @@ impl Tenant {
let pending_deletion = {
match DeleteTenantFlow::should_resume_deletion(
conf,
- Some(&remote_storage),
+ remote_storage.as_ref(),
&tenant_clone,
)
.await
@@ -660,6 +671,7 @@ impl Tenant {
for timeline_id in remote_timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
+ self.deletion_queue_client.clone(),
self.conf,
self.tenant_id,
timeline_id,
@@ -726,6 +738,7 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
+ deletion_queue_client: self.deletion_queue_client.clone(),
},
ctx,
)
@@ -750,6 +763,7 @@ impl Tenant {
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
+ self.deletion_queue_client.clone(),
None,
)
.await
@@ -851,6 +865,7 @@ impl Tenant {
tenant_id,
Generation::broken(),
None,
+ DeletionQueueClient::broken(),
))
}
@@ -895,6 +910,7 @@ impl Tenant {
tenant_id,
generation,
remote_storage.clone(),
+ resources.deletion_queue_client.clone(),
);
let tenant = Arc::new(tenant);
@@ -1302,6 +1318,7 @@ impl Tenant {
timeline_id,
&local_metadata,
Some(remote_client),
+ self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -1351,6 +1368,7 @@ impl Tenant {
timeline_id,
&local_metadata,
None,
+ self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -2242,6 +2260,9 @@ impl Tenant {
Ok(timeline)
}
+ // Allow too_many_arguments because a constructor's argument list naturally grows with the
+ // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
+ #[allow(clippy::too_many_arguments)]
fn new(
state: TenantState,
conf: &'static PageServerConf,
@@ -2250,6 +2271,7 @@ impl Tenant {
tenant_id: TenantId,
generation: Generation,
remote_storage: Option,
+ deletion_queue_client: DeletionQueueClient,
) -> Tenant {
let (state, mut rx) = watch::channel(state);
@@ -2317,6 +2339,7 @@ impl Tenant {
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
+ deletion_queue_client,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2856,6 +2879,7 @@ impl Tenant {
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
+ self.deletion_queue_client.clone(),
self.conf,
self.tenant_id,
timeline_id,
@@ -2866,7 +2890,10 @@ impl Tenant {
None
};
- TimelineResources { remote_client }
+ TimelineResources {
+ remote_client,
+ deletion_queue_client: self.deletion_queue_client.clone(),
+ }
}
/// Creates intermediate timeline structure and its files.
@@ -3322,6 +3349,7 @@ pub mod harness {
use utils::logging;
use utils::lsn::Lsn;
+ use crate::deletion_queue::mock::MockDeletionQueue;
use crate::{
config::PageServerConf,
repository::Key,
@@ -3383,6 +3411,7 @@ pub mod harness {
pub generation: Generation,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: PathBuf,
+ pub deletion_queue: MockDeletionQueue,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3431,6 +3460,7 @@ pub mod harness {
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
+ let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
Ok(Self {
conf,
@@ -3439,6 +3469,7 @@ pub mod harness {
generation: Generation::new(0xdeadbeef),
remote_storage,
remote_fs_dir,
+ deletion_queue,
})
}
@@ -3463,6 +3494,7 @@ pub mod harness {
self.tenant_id,
self.generation,
Some(self.remote_storage.clone()),
+ self.deletion_queue.new_client(),
));
tenant
.load(None, ctx)
@@ -4193,7 +4225,8 @@ mod tests {
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
- let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
+ let harness = TenantHarness::create("test_bulk_insert")?;
+ let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4240,7 +4273,8 @@ mod tests {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
- let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
+ let harness = TenantHarness::create("test_random_updates")?;
+ let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 74faee1115..17bcc9eb5f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,7 +20,10 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::ControlPlaneClient;
+use crate::control_plane_client::{
+ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
+};
+use crate::deletion_queue::DeletionQueueClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
@@ -116,7 +119,28 @@ pub async fn init_tenant_mgr(
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
- Some(client.re_attach().await?)
+ let result = match client.re_attach().await {
+ Ok(tenants) => tenants,
+ Err(RetryForeverError::ShuttingDown) => {
+ anyhow::bail!("Shut down while waiting for control plane re-attach response")
+ }
+ };
+
+ // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+ // deletion list entries may still be valid. We provide that by pushing a recovery operation into
+ // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
+ // are processed, even though we don't block on recovery completing here.
+ //
+ // Must only do this if remote storage is enabled, otherwise deletion queue
+ // is not running and channel push will fail.
+ if resources.remote_storage.is_some() {
+ resources
+ .deletion_queue_client
+ .recover(result.clone())
+ .await?;
+ }
+
+ Some(result)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
@@ -285,29 +309,21 @@ pub(crate) fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
- if let Some(remote_storage) = resources.remote_storage {
- match Tenant::spawn_attach(
- conf,
- tenant_id,
- generation,
- resources.broker_client,
- tenants,
- remote_storage,
- ctx,
- ) {
- Ok(tenant) => tenant,
- Err(e) => {
- error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
- Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
- }
- }
- } else {
+ if resources.remote_storage.is_none() {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
+ } else {
+ match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
+ Ok(tenant) => tenant,
+ Err(e) => {
+ error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+ Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+ }
+ }
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -438,8 +454,7 @@ pub async fn create_tenant(
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
generation: Generation,
- broker_client: storage_broker::BrokerClientChannel,
- remote_storage: Option,
+ resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result, TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -450,13 +465,9 @@ pub async fn create_tenant(
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
- let tenant_resources = TenantSharedResources {
- broker_client,
- remote_storage,
- };
let created_tenant =
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
- generation, tenant_resources, None, &TENANTS, ctx)?;
+ generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -622,6 +633,7 @@ pub async fn load_tenant(
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option,
+ deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -635,6 +647,7 @@ pub async fn load_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage,
+ deletion_queue_client
};
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
.with_context(|| {
@@ -702,8 +715,7 @@ pub async fn attach_tenant(
tenant_id: TenantId,
generation: Generation,
tenant_conf: TenantConfOpt,
- broker_client: storage_broker::BrokerClientChannel,
- remote_storage: GenericRemoteStorage,
+ resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -718,10 +730,7 @@ pub async fn attach_tenant(
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
- let resources = TenantSharedResources {
- broker_client,
- remote_storage: Some(remote_storage),
- };
+
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 6f42b54ac2..ee99151ef2 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -116,8 +116,12 @@
//! # Completion
//!
//! Once an operation has completed, we update
-//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
-//! to safekeepers that they can delete the WAL up to that LSN.
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
+//! and submit a request through the DeletionQueue to update
+//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
+//! validated that our generation is not stale. It is this visible value
+//! that is advertized to safekeepers as a signal that that they can
+//! delete the WAL up to that LSN.
//!
//! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
//! for all pending operations to complete. It does not prevent more
@@ -200,7 +204,6 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
-mod delete;
mod download;
pub mod index;
mod upload;
@@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
+use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -324,6 +328,8 @@ pub struct RemoteTimelineClient {
metrics: Arc,
storage_impl: GenericRemoteStorage,
+
+ deletion_queue_client: DeletionQueueClient,
}
impl RemoteTimelineClient {
@@ -335,6 +341,7 @@ impl RemoteTimelineClient {
///
pub fn new(
remote_storage: GenericRemoteStorage,
+ deletion_queue_client: DeletionQueueClient,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -352,6 +359,7 @@ impl RemoteTimelineClient {
timeline_id,
generation,
storage_impl: remote_storage,
+ deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
@@ -413,13 +421,24 @@ impl RemoteTimelineClient {
Ok(())
}
- pub fn last_uploaded_consistent_lsn(&self) -> Option {
- match &*self.upload_queue.lock().unwrap() {
+ pub fn remote_consistent_lsn_projected(&self) -> Option {
+ match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
- UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
- UploadQueue::Stopped(q) => {
- Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
- }
+ UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
+ UploadQueue::Stopped(q) => q
+ .upload_queue_for_deletion
+ .get_last_remote_consistent_lsn_projected(),
+ }
+ }
+
+ pub fn remote_consistent_lsn_visible(&self) -> Option {
+ match &mut *self.upload_queue.lock().unwrap() {
+ UploadQueue::Uninitialized => None,
+ UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
+ UploadQueue::Stopped(q) => Some(
+ q.upload_queue_for_deletion
+ .get_last_remote_consistent_lsn_visible(),
+ ),
}
}
@@ -434,11 +453,11 @@ impl RemoteTimelineClient {
} else {
0
};
- self.metrics.remote_physical_size_gauge().set(size);
+ self.metrics.remote_physical_size_set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
- self.metrics.remote_physical_size_gauge().get()
+ self.metrics.remote_physical_size_get()
}
//
@@ -643,7 +662,7 @@ impl RemoteTimelineClient {
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc,
- names: &[LayerFileName],
+ names: Vec,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -663,10 +682,10 @@ impl RemoteTimelineClient {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
- .iter()
+ .into_iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
- let meta = upload_queue.latest_files.remove(name);
+ let meta = upload_queue.latest_files.remove(&name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -688,19 +707,17 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
- // schedule the actual deletions
- for (name, generation) in with_generations {
- let op = UploadOp::Delete(Delete {
- file_kind: RemoteOpFileKind::Layer,
- layer_file_name: name.clone(),
- scheduled_from_timeline_delete: false,
- generation,
- });
- self.calls_unfinished_metric_begin(&op);
- upload_queue.queued_operations.push_back(op);
- info!("scheduled layer file deletion {name}");
+ for (name, gen) in &with_generations {
+ info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
+ // schedule the actual deletions
+ let op = UploadOp::Delete(Delete {
+ layers: with_generations,
+ });
+ self.calls_unfinished_metric_begin(&op);
+ upload_queue.queued_operations.push_back(op);
+
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
};
@@ -833,9 +850,7 @@ impl RemoteTimelineClient {
pub(crate) async fn delete_all(self: &Arc) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
- let (mut receiver, deletions_queued) = {
- let mut deletions_queued = 0;
-
+ let layers: Vec = {
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
@@ -847,42 +862,30 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
- .queued_operations
- .reserve(stopped.upload_queue_for_deletion.latest_files.len());
-
- // schedule the actual deletions
- for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
- let op = UploadOp::Delete(Delete {
- file_kind: RemoteOpFileKind::Layer,
- layer_file_name: name.clone(),
- scheduled_from_timeline_delete: true,
- generation: meta.generation,
- });
-
- self.calls_unfinished_metric_begin(&op);
- stopped
- .upload_queue_for_deletion
- .queued_operations
- .push_back(op);
-
- info!("scheduled layer file deletion {name}");
- deletions_queued += 1;
- }
-
- self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
-
- (
- self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
- deletions_queued,
- )
+ .latest_files
+ .drain()
+ .map(|(file_name, meta)| {
+ remote_layer_path(
+ &self.tenant_id,
+ &self.timeline_id,
+ &file_name,
+ meta.generation,
+ )
+ })
+ .collect()
};
- receiver.changed().await.context("upload queue shut down")?;
+ let layer_deletion_count = layers.len();
+ self.deletion_queue_client.push_immediate(layers).await?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
+ // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+ // taking the burden of listing all the layers that we already know we should delete.
+ self.deletion_queue_client.flush_immediate().await?;
+
let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -910,17 +913,9 @@ impl RemoteTimelineClient {
})
.collect();
+ let not_referenced_count = remaining.len();
if !remaining.is_empty() {
- backoff::retry(
- || async { self.storage_impl.delete_objects(&remaining).await },
- |_e| false,
- FAILED_UPLOAD_WARN_THRESHOLD,
- FAILED_REMOTE_OP_RETRIES,
- "delete_objects",
- backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
- )
- .await
- .context("delete_objects")?;
+ self.deletion_queue_client.push_immediate(remaining).await?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -931,18 +926,14 @@ impl RemoteTimelineClient {
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
- debug!("deleting index part");
+ debug!("enqueuing index part deletion");
+ self.deletion_queue_client
+ .push_immediate([index_file_path].to_vec())
+ .await?;
- backoff::retry(
- || async { self.storage_impl.delete(&index_file_path).await },
- |_e| false,
- FAILED_UPLOAD_WARN_THRESHOLD,
- FAILED_REMOTE_OP_RETRIES,
- "delete_index",
- backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
- )
- .await
- .context("delete_index")?;
+ // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
+ // for a flush to a persistent deletion list so that we may be sure deletion will occur.
+ self.deletion_queue_client.flush_immediate().await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -950,7 +941,7 @@ impl RemoteTimelineClient {
))?
});
- info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+ info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
Ok(())
}
@@ -1140,21 +1131,16 @@ impl RemoteTimelineClient {
}
res
}
- UploadOp::Delete(delete) => {
- let path = &self
- .conf
- .timeline_path(&self.tenant_id, &self.timeline_id)
- .join(delete.layer_file_name.file_name());
- delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
- .measure_remote_op(
- self.tenant_id,
- self.timeline_id,
- delete.file_kind,
- RemoteOpKind::Delete,
- Arc::clone(&self.metrics),
- )
- .await
- }
+ UploadOp::Delete(delete) => self
+ .deletion_queue_client
+ .push_layers(
+ self.tenant_id,
+ self.timeline_id,
+ self.generation,
+ delete.layers.clone(),
+ )
+ .await
+ .map_err(|e| anyhow::anyhow!(e)),
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1210,18 +1196,12 @@ impl RemoteTimelineClient {
}
// The task has completed successfully. Remove it from the in-progress list.
- {
+ let lsn_update = {
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
- UploadQueue::Stopped(stopped) => {
- // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
- // then stop() took care of it so we just return.
- // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
- match &task.op {
- UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
- _ => None
- }
+ UploadQueue::Stopped(_stopped) => {
+ None
},
UploadQueue::Initialized(qi) => { Some(qi) }
};
@@ -1236,23 +1216,51 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
- match task.op {
+ let lsn_update = match task.op {
UploadOp::UploadLayer(_, _) => {
upload_queue.num_inprogress_layer_uploads -= 1;
+ None
}
UploadOp::UploadMetadata(_, lsn) => {
upload_queue.num_inprogress_metadata_uploads -= 1;
- upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
+ // XXX monotonicity check?
+
+ upload_queue.projected_remote_consistent_lsn = Some(lsn);
+ if self.generation.is_none() {
+ // Legacy mode: skip validating generation
+ upload_queue.visible_remote_consistent_lsn.store(lsn);
+ None
+ } else {
+ Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+ }
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
+ None
}
UploadOp::Barrier(_) => unreachable!(),
};
// Launch any queued tasks that were unblocked by this one.
self.launch_queued_tasks(upload_queue);
+ lsn_update
+ };
+
+ if let Some((lsn, slot)) = lsn_update {
+ // Updates to the remote_consistent_lsn we advertise to pageservers
+ // are all routed through the DeletionQueue, to enforce important
+ // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
+ self.deletion_queue_client
+ .update_remote_consistent_lsn(
+ self.tenant_id,
+ self.timeline_id,
+ self.generation,
+ lsn,
+ slot,
+ )
+ .await;
}
+
self.calls_unfinished_metric_end(&task.op);
}
@@ -1278,8 +1286,8 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
- UploadOp::Delete(delete) => (
- delete.file_kind,
+ UploadOp::Delete(_delete) => (
+ RemoteOpFileKind::Layer,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
@@ -1341,7 +1349,10 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
- last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+ projected_remote_consistent_lsn: None,
+ visible_remote_consistent_lsn: initialized
+ .visible_remote_consistent_lsn
+ .clone(),
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
@@ -1405,13 +1416,13 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
- layer_meta: &LayerFileMetadata,
+ generation: Generation,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
- layer_meta.generation.get_suffix()
+ generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1554,7 +1565,6 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result {
- // Use a current-thread runtime in the test
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
@@ -1580,6 +1590,7 @@ mod tests {
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
+ deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_id,
@@ -1749,7 +1760,7 @@ mod tests {
)
.unwrap();
client
- .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+ .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -1775,6 +1786,7 @@ mod tests {
// Finish them
client.wait_completion().await.unwrap();
+ harness.deletion_queue.pump().await;
assert_remote_files(
&[
diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs
deleted file mode 100644
index 7324559223..0000000000
--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-//! Helper functions to delete files from remote storage with a RemoteStorage
-use anyhow::Context;
-use std::path::Path;
-use tracing::debug;
-
-use remote_storage::GenericRemoteStorage;
-
-use crate::{
- config::PageServerConf,
- tenant::{remote_timeline_client::remote_path, Generation},
-};
-
-pub(super) async fn delete_layer<'a>(
- conf: &'static PageServerConf,
- storage: &'a GenericRemoteStorage,
- local_layer_path: &'a Path,
- generation: Generation,
-) -> anyhow::Result<()> {
- fail::fail_point!("before-delete-layer", |_| {
- anyhow::bail!("failpoint before-delete-layer")
- });
- debug!("Deleting layer from remote storage: {local_layer_path:?}",);
-
- let path_to_delete = remote_path(conf, local_layer_path, generation)?;
-
- // We don't want to print an error if the delete failed if the file has
- // already been deleted. Thankfully, in this situation S3 already
- // does not yield an error. While OS-provided local file system APIs do yield
- // errors, we avoid them in the `LocalFs` wrapper.
- storage
- .delete(&path_to_delete)
- .await
- .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
-}
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 9863215529..5c173c613f 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>(
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
- let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
+ let remote_path = remote_layer_path(
+ &tenant_id,
+ &timeline_id,
+ layer_file_name,
+ layer_metadata.generation,
+ );
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 78ac1338db..9b62ba1c50 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
+use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
@@ -143,6 +144,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option,
+ pub deletion_queue_client: DeletionQueueClient,
}
pub struct Timeline {
@@ -521,9 +523,23 @@ impl Timeline {
self.disk_consistent_lsn.load()
}
- pub fn get_remote_consistent_lsn(&self) -> Option {
+ /// remote_consistent_lsn from the perspective of the tenant's current generation,
+ /// not validated with control plane yet.
+ /// See [`Self::get_remote_consistent_lsn_visible`].
+ pub fn get_remote_consistent_lsn_projected(&self) -> Option {
if let Some(remote_client) = &self.remote_client {
- remote_client.last_uploaded_consistent_lsn()
+ remote_client.remote_consistent_lsn_projected()
+ } else {
+ None
+ }
+ }
+
+ /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
+ /// i.e. a value of remote_consistent_lsn_projected which has undergone
+ /// generation validation in the deletion queue.
+ pub fn get_remote_consistent_lsn_visible(&self) -> Option {
+ if let Some(remote_client) = &self.remote_client {
+ remote_client.remote_consistent_lsn_visible()
} else {
None
}
@@ -543,7 +559,7 @@ impl Timeline {
}
pub fn resident_physical_size(&self) -> u64 {
- self.metrics.resident_physical_size_gauge.get()
+ self.metrics.resident_physical_size_get()
}
///
@@ -1293,10 +1309,7 @@ impl Timeline {
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
- self.metrics
- .resident_physical_size_gauge
- .sub(layer_file_size);
-
+ self.metrics.resident_physical_size_sub(layer_file_size);
self.metrics.evictions.inc();
if let Some(delta) = local_layer_residence_duration {
@@ -1820,7 +1833,7 @@ impl Timeline {
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
- rtc.schedule_layer_file_deletion(&needs_cleanup)?;
+ rtc.schedule_layer_file_deletion(needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -1830,9 +1843,7 @@ impl Timeline {
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
);
- self.metrics
- .resident_physical_size_gauge
- .set(total_physical_size);
+ self.metrics.resident_physical_size_set(total_physical_size);
timer.stop_and_record();
Ok(())
@@ -3875,7 +3886,7 @@ impl Timeline {
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
- remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+ remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
}
Ok(())
@@ -4210,7 +4221,7 @@ impl Timeline {
}
if let Some(remote_client) = &self.remote_client {
- remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+ remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
}
apply.flush();
@@ -4382,7 +4393,7 @@ impl Timeline {
// XXX the temp file is still around in Err() case
// and consumes space until we clean up upon pageserver restart.
- self_clone.metrics.resident_physical_size_gauge.add(*size);
+ self_clone.metrics.resident_physical_size_add(*size);
// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 18588cf0fd..7d55388f44 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,6 +14,7 @@ use utils::{
use crate::{
config::PageServerConf,
+ deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -407,6 +408,7 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option,
+ deletion_queue_client: DeletionQueueClient,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -416,7 +418,10 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
- TimelineResources { remote_client },
+ TimelineResources {
+ remote_client,
+ deletion_queue_client,
+ },
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 3c88d31f24..0a387bd779 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -263,7 +263,7 @@ impl LayerManager {
let desc = layer.layer_desc();
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
- metrics.resident_physical_size_gauge.sub(desc.file_size);
+ metrics.resident_physical_size_sub(desc.file_size);
}
// TODO Removing from the bottom of the layer map is expensive.
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 7d1e9b4a39..0831b9ceda 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection(
})?;
if let Some(last_lsn) = status_update {
- let timeline_remote_consistent_lsn =
- timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+ let timeline_remote_consistent_lsn = timeline
+ .get_remote_consistent_lsn_visible()
+ .unwrap_or(Lsn(0));
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let last_received_lsn = last_lsn;
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 28822335b0..08b1cb8866 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,5 +1,3 @@
-use crate::metrics::RemoteOpFileKind;
-
use super::storage_layer::LayerFileName;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
@@ -11,6 +9,7 @@ use std::fmt::Debug;
use chrono::NaiveDateTime;
use std::sync::Arc;
use tracing::info;
+use utils::lsn::AtomicLsn;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
@@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized {
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
- pub(crate) last_uploaded_consistent_lsn: Lsn,
+ ///
+ /// visible_remote_consistent_lsn is only updated after our generation has been validated with
+ /// the control plane (unlesss a timeline's generation is None, in which case
+ /// we skip validation)
+ pub(crate) projected_remote_consistent_lsn: Option,
+ pub(crate) visible_remote_consistent_lsn: Arc,
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
@@ -81,6 +85,14 @@ impl UploadQueueInitialized {
pub(super) fn no_pending_work(&self) -> bool {
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
}
+
+ pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
+ self.visible_remote_consistent_lsn.load()
+ }
+
+ pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option {
+ self.projected_remote_consistent_lsn
+ }
}
#[derive(Clone, Copy)]
@@ -114,9 +126,8 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
- // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
- // safekeepers from garbage-collecting anything.
- last_uploaded_consistent_lsn: Lsn(0),
+ projected_remote_consistent_lsn: None,
+ visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -158,7 +169,10 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
- last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
+ projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
+ visible_remote_consistent_lsn: Arc::new(
+ index_part.metadata.disk_consistent_lsn().into(),
+ ),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -201,12 +215,11 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
+/// A deletion of some layers within the lifetime of a timeline. This is not used
+/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
- pub(crate) file_kind: RemoteOpFileKind,
- pub(crate) layer_file_name: LayerFileName,
- pub(crate) scheduled_from_timeline_delete: bool,
- pub(crate) generation: Generation,
+ pub(crate) layers: Vec<(LayerFileName, Generation)>,
}
#[derive(Debug)]
@@ -217,7 +230,7 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
- /// Delete a layer file
+ /// Delete layer files
Delete(Delete),
/// Barrier. When the barrier operation is reached,
@@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp {
UploadOp::UploadMetadata(_, lsn) => {
write!(f, "UploadMetadata(lsn: {})", lsn)
}
- UploadOp::Delete(delete) => write!(
- f,
- "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
- delete.layer_file_name.file_name(),
- delete.scheduled_from_timeline_delete,
- delete.generation
- ),
+ UploadOp::Delete(delete) => {
+ write!(f, "Delete({} layers)", delete.layers.len(),)
+ }
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index debbbce117..8b0035b8e8 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,6 +741,13 @@ NeonProcessUtility(
break;
case T_DropdbStmt:
HandleDropDb(castNode(DropdbStmt, parseTree));
+ /*
+ * We do this here to hack around the fact that Postgres performs the drop
+ * INSIDE of standard_ProcessUtility, which means that if we try to
+ * abort the drop normally it'll be too late. DROP DATABASE can't be inside
+ * of a transaction block anyway, so this should be fine to do.
+ */
+ NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
break;
case T_CreateRoleStmt:
HandleCreateRole(castNode(CreateRoleStmt, parseTree));
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 4be75e1dad..22f20a4c0b 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -14,7 +14,6 @@
*/
#include
-#include
#include
#include
@@ -38,9 +37,6 @@
#include "storage/fd.h"
#include "storage/pg_shmem.h"
#include "storage/buf_internals.h"
-#include "storage/procsignal.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
/*
* Local file cache is used to temporary store relations pages in local file system.
@@ -66,9 +62,6 @@
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
-#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
-#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
-
typedef struct FileCacheEntry
{
BufferTag key;
@@ -91,14 +84,12 @@ static int lfc_desc = 0;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
-static int lfc_free_space_watermark;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
-static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
void FileCacheMonitorMain(Datum main_arg);
@@ -254,80 +245,6 @@ lfc_change_limit_hook(int newval, void *extra)
LWLockRelease(lfc_lock);
}
-/*
- * Local file system state monitor check available free space.
- * If it is lower than lfc_free_space_watermark then we shrink size of local cache
- * but throwing away least recently accessed chunks.
- * First time low space watermark is reached cache size is divided by two,
- * second time by four,... Finally we remove all chunks from local cache.
- *
- * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
- * We only throw away cached chunks but do not prevent from filling cache by new chunks.
- *
- * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
- * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
- * Calling statvfs each second should not add any noticeable overhead.
- */
-void
-FileCacheMonitorMain(Datum main_arg)
-{
- /*
- * Choose file system state monitor interval so that space can not be exosted
- * during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
- */
- uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
-
- /* Establish signal handlers. */
- pqsignal(SIGUSR1, procsignal_sigusr1_handler);
- pqsignal(SIGHUP, SignalHandlerForConfigReload);
- pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
- BackgroundWorkerUnblockSignals();
-
- /* Periodically dump buffers until terminated. */
- while (!ShutdownRequestPending)
- {
- if (lfc_size_limit != 0)
- {
- struct statvfs sfs;
- if (statvfs(lfc_path, &sfs) < 0)
- {
- elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
- }
- else
- {
- if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
- {
- if (lfc_shrinking_factor < 31) {
- lfc_shrinking_factor += 1;
- }
- lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
- }
- else
- lfc_shrinking_factor = 0; /* reset to initial value */
- }
- }
- pg_usleep(monitor_interval);
- }
-}
-
-static void
-lfc_register_free_space_monitor(void)
-{
- BackgroundWorker bgw;
- memset(&bgw, 0, sizeof(bgw));
- bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
- bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
- snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
- snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
- snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
- snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
- bgw.bgw_restart_time = 5;
- bgw.bgw_notify_pid = 0;
- bgw.bgw_main_arg = (Datum) 0;
-
- RegisterBackgroundWorker(&bgw);
-}
-
void
lfc_init(void)
{
@@ -364,19 +281,6 @@ lfc_init(void)
lfc_change_limit_hook,
NULL);
- DefineCustomIntVariable("neon.free_space_watermark",
- "Minimal free space in local file system after reaching which local file cache will be truncated",
- NULL,
- &lfc_free_space_watermark,
- 1024, /* 1GB */
- 0,
- INT_MAX,
- PGC_SIGHUP,
- GUC_UNIT_MB,
- NULL,
- NULL,
- NULL);
-
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,
@@ -391,9 +295,6 @@ lfc_init(void)
if (lfc_max_size == 0)
return;
- if (lfc_free_space_watermark != 0)
- lfc_register_free_space_monitor();
-
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = lfc_shmem_startup;
#if PG_VERSION_NUM>=150000
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index cbab0c6f07..92498d3ecd 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -42,6 +42,7 @@ reqwest-middleware.workspace = true
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
routerify.workspace = true
+rustc-hash.workspace = true
rustls-pemfile.workspace = true
rustls.workspace = true
scopeguard.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index ff73f2b625..03c9029862 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -160,6 +160,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
Test(_) => Some("test".to_owned()),
}
}
+
+ /// Get username from the credentials.
+ pub fn get_user(&self) -> &str {
+ use BackendType::*;
+
+ match self {
+ Console(_, creds) => creds.user,
+ Postgres(_, creds) => creds.user,
+ Link(_) => "link",
+ Test(_) => "test",
+ }
+ }
+
/// Authenticate the client via the requested backend, possibly using credentials.
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs
index e771e5d7ed..a7ef15d342 100644
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -17,11 +17,12 @@ use std::{
use tokio::time;
use tokio_postgres::AsyncMessage;
-use crate::{auth, console};
+use crate::{
+ auth, console,
+ metrics::{Ids, MetricCounter, USAGE_METRICS},
+};
use crate::{compute, config};
-use super::sql_over_http::MAX_RESPONSE_SIZE;
-
use crate::proxy::ConnectMechanism;
use tracing::{error, warn};
@@ -400,7 +401,6 @@ async fn connect_to_compute_once(
.user(&conn_info.username)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
- .max_backend_message_size(MAX_RESPONSE_SIZE)
.connect_timeout(timeout)
.connect(tokio_postgres::NoTls)
.await?;
@@ -412,6 +412,10 @@ async fn connect_to_compute_once(
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
});
+ let ids = Ids {
+ endpoint_id: node_info.aux.endpoint_id.to_string(),
+ branch_id: node_info.aux.branch_id.to_string(),
+ };
tokio::spawn(
poll_fn(move |cx| {
@@ -450,10 +454,18 @@ async fn connect_to_compute_once(
Ok(Client {
inner: client,
session: tx,
+ ids,
})
}
pub struct Client {
pub inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender,
+ ids: Ids,
+}
+
+impl Client {
+ pub fn metrics(&self) -> Arc {
+ USAGE_METRICS.register(self.ids.clone())
+ }
}
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
index fe57096105..b74b3e9646 100644
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -3,10 +3,12 @@ use std::sync::Arc;
use anyhow::bail;
use futures::pin_mut;
use futures::StreamExt;
-use hashbrown::HashMap;
use hyper::body::HttpBody;
+use hyper::header;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
+use hyper::Response;
+use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Map;
@@ -16,7 +18,11 @@ use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::Row;
+use tracing::error;
+use tracing::instrument;
use url::Url;
+use utils::http::error::ApiError;
+use utils::http::json::json_response;
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;
@@ -39,7 +45,6 @@ enum Payload {
Batch(BatchQueryData),
}
-pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -182,7 +187,45 @@ pub async fn handle(
sni_hostname: Option,
conn_pool: Arc,
session_id: uuid::Uuid,
-) -> anyhow::Result<(Value, HashMap)> {
+) -> Result, ApiError> {
+ let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
+
+ let mut response = match result {
+ Ok(r) => r,
+ Err(e) => {
+ let message = format!("{:?}", e);
+ let code = match e.downcast_ref::() {
+ Some(e) => match e.code() {
+ Some(e) => serde_json::to_value(e.code()).unwrap(),
+ None => Value::Null,
+ },
+ None => Value::Null,
+ };
+ error!(
+ ?code,
+ "sql-over-http per-client task finished with an error: {e:#}"
+ );
+ // TODO: this shouldn't always be bad request.
+ json_response(
+ StatusCode::BAD_REQUEST,
+ json!({ "message": message, "code": code }),
+ )?
+ }
+ };
+ response.headers_mut().insert(
+ "Access-Control-Allow-Origin",
+ hyper::http::HeaderValue::from_static("*"),
+ );
+ Ok(response)
+}
+
+#[instrument(name = "sql-over-http", skip_all)]
+async fn handle_inner(
+ request: Request,
+ sni_hostname: Option,
+ conn_pool: Arc,
+ session_id: uuid::Uuid,
+) -> anyhow::Result> {
//
// Determine the destination and connection params
//
@@ -233,13 +276,18 @@ pub async fn handle(
let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
+ let mut response = Response::builder()
+ .status(StatusCode::OK)
+ .header(header::CONTENT_TYPE, "application/json");
+
//
// Now execute the query and return the result
//
+ let mut size = 0;
let result = match payload {
- Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
- .await
- .map(|x| (x, HashMap::default())),
+ Payload::Single(query) => {
+ query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
+ }
Payload::Batch(batch_query) => {
let mut results = Vec::new();
let mut builder = client.inner.build_transaction();
@@ -254,7 +302,8 @@ pub async fn handle(
}
let transaction = builder.start().await?;
for query in batch_query.queries {
- let result = query_to_json(&transaction, query, raw_output, array_mode).await;
+ let result =
+ query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
match result {
Ok(r) => results.push(r),
Err(e) => {
@@ -264,26 +313,27 @@ pub async fn handle(
}
}
transaction.commit().await?;
- let mut headers = HashMap::default();
if txn_read_only {
- headers.insert(
+ response = response.header(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
}
if txn_deferrable {
- headers.insert(
+ response = response.header(
TXN_DEFERRABLE.clone(),
HeaderValue::try_from(txn_deferrable.to_string())?,
);
}
if let Some(txn_isolation_level) = txn_isolation_level_raw {
- headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+ response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
}
- Ok((json!({ "results": results }), headers))
+ Ok(json!({ "results": results }))
}
};
+ let metrics = client.metrics();
+
if allow_pool {
let current_span = tracing::Span::current();
// return connection to the pool
@@ -293,12 +343,30 @@ pub async fn handle(
});
}
- result
+ match result {
+ Ok(value) => {
+ // how could this possibly fail
+ let body = serde_json::to_string(&value).expect("json serialization should not fail");
+ let len = body.len();
+ let response = response
+ .body(Body::from(body))
+ // only fails if invalid status code or invalid header/values are given.
+ // these are not user configurable so it cannot fail dynamically
+ .expect("building response payload should not fail");
+
+ // count the egress bytes - we miss the TLS and header overhead but oh well...
+ // moving this later in the stack is going to be a lot of effort and ehhhh
+ metrics.record_egress(len as u64);
+ Ok(response)
+ }
+ Err(e) => Err(e),
+ }
}
async fn query_to_json(
client: &T,
data: QueryData,
+ current_size: &mut usize,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result {
@@ -312,16 +380,10 @@ async fn query_to_json(
// big.
pin_mut!(row_stream);
let mut rows: Vec = Vec::new();
- let mut current_size = 0;
while let Some(row) = row_stream.next().await {
let row = row?;
- current_size += row.body_len();
+ *current_size += row.body_len();
rows.push(row);
- if current_size > MAX_RESPONSE_SIZE {
- return Err(anyhow::anyhow!(
- "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
- ));
- }
}
// grab the command tag and number of rows affected
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index fa66df0469..994a7de764 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -7,7 +7,6 @@ use crate::{
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
-use hashbrown::HashMap;
use hyper::{
server::{
accept,
@@ -18,7 +17,6 @@ use hyper::{
};
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
-use serde_json::{json, Value};
use std::{
convert::Infallible,
@@ -204,44 +202,7 @@ async fn ws_handler(
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
- let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
- .instrument(info_span!("sql-over-http"))
- .await;
- let status_code = match result {
- Ok(_) => StatusCode::OK,
- Err(_) => StatusCode::BAD_REQUEST,
- };
- let (json, headers) = match result {
- Ok(r) => r,
- Err(e) => {
- let message = format!("{:?}", e);
- let code = match e.downcast_ref::() {
- Some(e) => match e.code() {
- Some(e) => serde_json::to_value(e.code()).unwrap(),
- None => Value::Null,
- },
- None => Value::Null,
- };
- error!(
- ?code,
- "sql-over-http per-client task finished with an error: {e:#}"
- );
- (
- json!({ "message": message, "code": code }),
- HashMap::default(),
- )
- }
- };
- json_response(status_code, json).map(|mut r| {
- r.headers_mut().insert(
- "Access-Control-Allow-Origin",
- hyper::http::HeaderValue::from_static("*"),
- );
- for (k, v) in headers {
- r.headers_mut().insert(k, v);
- }
- r
- })
+ sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST")
@@ -253,7 +214,7 @@ async fn ws_handler(
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
- .map_err(|e| ApiError::BadRequest(e.into()))
+ .map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")
}
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 9279002eb3..cfeec5622b 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -3,9 +3,18 @@
use crate::{config::MetricCollectionConfig, http};
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
-use serde::Serialize;
-use std::{collections::HashMap, convert::Infallible, time::Duration};
-use tracing::{error, info, instrument, trace, warn};
+use dashmap::{mapref::entry::Entry, DashMap};
+use once_cell::sync::Lazy;
+use serde::{Deserialize, Serialize};
+use std::{
+ convert::Infallible,
+ sync::{
+ atomic::{AtomicU64, AtomicUsize, Ordering},
+ Arc,
+ },
+ time::Duration,
+};
+use tracing::{error, info, instrument, trace};
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
@@ -18,12 +27,95 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// Both the proxy and the ingestion endpoint will live in the same region (or cell)
/// so while the project-id is unique across regions the whole pipeline will work correctly
/// because we enrich the event with project_id in the control-plane endpoint.
-#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
+#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
pub struct Ids {
pub endpoint_id: String,
pub branch_id: String,
}
+#[derive(Debug)]
+pub struct MetricCounter {
+ transmitted: AtomicU64,
+ opened_connections: AtomicUsize,
+}
+
+impl MetricCounter {
+ /// Record that some bytes were sent from the proxy to the client
+ pub fn record_egress(&self, bytes: u64) {
+ self.transmitted.fetch_add(bytes, Ordering::AcqRel);
+ }
+
+ /// extract the value that should be reported
+ fn should_report(self: &Arc) -> Option {
+ // heuristic to see if the branch is still open
+ // if a clone happens while we are observing, the heuristic will be incorrect.
+ //
+ // Worst case is that we won't report an event for this endpoint.
+ // However, for the strong count to be 1 it must have occured that at one instant
+ // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
+ let is_open = Arc::strong_count(self) > 1;
+ let opened = self.opened_connections.swap(0, Ordering::AcqRel);
+
+ // update cached metrics eagerly, even if they can't get sent
+ // (to avoid sending the same metrics twice)
+ // see the relevant discussion on why to do so even if the status is not success:
+ // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
+ let value = self.transmitted.swap(0, Ordering::AcqRel);
+
+ // Our only requirement is that we report in every interval if there was an open connection
+ // if there were no opened connections since, then we don't need to report
+ if value == 0 && !is_open && opened == 0 {
+ None
+ } else {
+ Some(value)
+ }
+ }
+
+ /// Determine whether the counter should be cleared from the global map.
+ fn should_clear(self: &mut Arc) -> bool {
+ // we can't clear this entry if it's acquired elsewhere
+ let Some(counter) = Arc::get_mut(self) else {
+ return false;
+ };
+ let opened = *counter.opened_connections.get_mut();
+ let value = *counter.transmitted.get_mut();
+ // clear if there's no data to report
+ value == 0 && opened == 0
+ }
+}
+
+// endpoint and branch IDs are not user generated so we don't run the risk of hash-dos
+type FastHasher = std::hash::BuildHasherDefault;
+
+#[derive(Default)]
+pub struct Metrics {
+ endpoints: DashMap, FastHasher>,
+}
+
+impl Metrics {
+ /// Register a new byte metrics counter for this endpoint
+ pub fn register(&self, ids: Ids) -> Arc {
+ let entry = if let Some(entry) = self.endpoints.get(&ids) {
+ entry.clone()
+ } else {
+ self.endpoints
+ .entry(ids)
+ .or_insert_with(|| {
+ Arc::new(MetricCounter {
+ transmitted: AtomicU64::new(0),
+ opened_connections: AtomicUsize::new(0),
+ })
+ })
+ .clone()
+ };
+
+ entry.opened_connections.fetch_add(1, Ordering::AcqRel);
+ entry
+ }
+}
+
+pub static USAGE_METRICS: Lazy = Lazy::new(Metrics::default);
+
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result {
info!("metrics collector config: {config:?}");
scopeguard::defer! {
@@ -31,145 +123,83 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result)> = HashMap::new();
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
+ let mut prev = Utc::now();
let mut ticker = tokio::time::interval(config.interval);
loop {
ticker.tick().await;
- let res = collect_metrics_iteration(
+ let now = Utc::now();
+ collect_metrics_iteration(
+ &USAGE_METRICS,
&http_client,
- &mut cached_metrics,
&config.endpoint,
&hostname,
+ prev,
+ now,
)
.await;
-
- match res {
- Err(e) => error!("failed to send consumption metrics: {e} "),
- Ok(_) => trace!("periodic metrics collection completed successfully"),
- }
+ prev = now;
}
}
-fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> {
- let mut current_metrics: Vec<(Ids, (u64, DateTime))> = Vec::new();
- let metrics = prometheus::default_registry().gather();
-
- for m in metrics {
- if m.get_name() == "proxy_io_bytes_per_client" {
- for ms in m.get_metric() {
- let direction = ms
- .get_label()
- .iter()
- .find(|l| l.get_name() == "direction")
- .unwrap()
- .get_value();
-
- // Only collect metric for outbound traffic
- if direction == "tx" {
- let endpoint_id = ms
- .get_label()
- .iter()
- .find(|l| l.get_name() == "endpoint_id")
- .unwrap()
- .get_value();
- let branch_id = ms
- .get_label()
- .iter()
- .find(|l| l.get_name() == "branch_id")
- .unwrap()
- .get_value();
-
- let value = ms.get_counter().get_value() as u64;
-
- // Report if the metric value is suspiciously large
- if value > (1u64 << 40) {
- warn!(
- "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
- branch_id, endpoint_id, value
- );
- }
-
- current_metrics.push((
- Ids {
- endpoint_id: endpoint_id.to_string(),
- branch_id: branch_id.to_string(),
- },
- (value, Utc::now()),
- ));
- }
- }
- }
- }
-
- current_metrics
-}
-
#[instrument(skip_all)]
async fn collect_metrics_iteration(
+ metrics: &Metrics,
client: &http::ClientWithMiddleware,
- cached_metrics: &mut HashMap)>,
metric_collection_endpoint: &reqwest::Url,
hostname: &str,
-) -> anyhow::Result<()> {
+ prev: DateTime,
+ now: DateTime,
+) {
info!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
);
- let current_metrics = gather_proxy_io_bytes_per_client();
+ let mut metrics_to_clear = Vec::new();
- let metrics_to_send: Vec> = current_metrics
+ let metrics_to_send: Vec<(Ids, u64)> = metrics
+ .endpoints
.iter()
- .filter_map(|(curr_key, (curr_val, curr_time))| {
- let mut start_time = *curr_time;
- let mut value = *curr_val;
-
- if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
- // Only send metrics updates if the metric has increased
- if curr_val > prev_val {
- value = curr_val - prev_val;
- start_time = *prev_time;
- } else {
- if curr_val < prev_val {
- error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
- prev_val, curr_val, curr_key);
- }
- return None;
- }
+ .filter_map(|counter| {
+ let key = counter.key().clone();
+ let Some(value) = counter.should_report() else {
+ metrics_to_clear.push(key);
+ return None;
};
-
- Some(Event {
- kind: EventType::Incremental {
- start_time,
- stop_time: *curr_time,
- },
- metric: PROXY_IO_BYTES_PER_CLIENT,
- idempotency_key: idempotency_key(hostname),
- value,
- extra: Ids {
- endpoint_id: curr_key.endpoint_id.clone(),
- branch_id: curr_key.branch_id.clone(),
- },
- })
+ Some((key, value))
})
.collect();
if metrics_to_send.is_empty() {
trace!("no new metrics to send");
- return Ok(());
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
+ let events = chunk
+ .iter()
+ .map(|(ids, value)| Event {
+ kind: EventType::Incremental {
+ start_time: prev,
+ stop_time: now,
+ },
+ metric: PROXY_IO_BYTES_PER_CLIENT,
+ idempotency_key: idempotency_key(hostname),
+ value: *value,
+ extra: Ids {
+ endpoint_id: ids.endpoint_id.clone(),
+ branch_id: ids.branch_id.clone(),
+ },
+ })
+ .collect();
+
let res = client
.post(metric_collection_endpoint.clone())
- .json(&EventChunk {
- events: chunk.into(),
- })
+ .json(&EventChunk { events })
.send()
.await;
@@ -183,34 +213,113 @@ async fn collect_metrics_iteration(
if !res.status().is_success() {
error!("metrics endpoint refused the sent metrics: {:?}", res);
- for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
+ for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
// Report if the metric value is suspiciously large
error!("potentially abnormal metric value: {:?}", metric);
}
}
- // update cached metrics after they were sent
- // (to avoid sending the same metrics twice)
- // see the relevant discussion on why to do so even if the status is not success:
- // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
- for send_metric in chunk {
- let stop_time = match send_metric.kind {
- EventType::Incremental { stop_time, .. } => stop_time,
- _ => unreachable!(),
- };
+ }
- cached_metrics
- .entry(Ids {
- endpoint_id: send_metric.extra.endpoint_id.clone(),
- branch_id: send_metric.extra.branch_id.clone(),
- })
- // update cached value (add delta) and time
- .and_modify(|e| {
- e.0 = e.0.saturating_add(send_metric.value);
- e.1 = stop_time
- })
- // cache new metric
- .or_insert((send_metric.value, stop_time));
+ for metric in metrics_to_clear {
+ match metrics.endpoints.entry(metric) {
+ Entry::Occupied(mut counter) => {
+ if counter.get_mut().should_clear() {
+ counter.remove_entry();
+ }
+ }
+ Entry::Vacant(_) => {}
}
}
- Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use std::{
+ net::TcpListener,
+ sync::{Arc, Mutex},
+ };
+
+ use anyhow::Error;
+ use chrono::Utc;
+ use consumption_metrics::{Event, EventChunk};
+ use hyper::{
+ service::{make_service_fn, service_fn},
+ Body, Response,
+ };
+ use url::Url;
+
+ use super::{collect_metrics_iteration, Ids, Metrics};
+ use crate::http;
+
+ #[tokio::test]
+ async fn metrics() {
+ let listener = TcpListener::bind("0.0.0.0:0").unwrap();
+
+ let reports = Arc::new(Mutex::new(vec![]));
+ let reports2 = reports.clone();
+
+ let server = hyper::server::Server::from_tcp(listener)
+ .unwrap()
+ .serve(make_service_fn(move |_| {
+ let reports = reports.clone();
+ async move {
+ Ok::<_, Error>(service_fn(move |req| {
+ let reports = reports.clone();
+ async move {
+ let bytes = hyper::body::to_bytes(req.into_body()).await?;
+ let events: EventChunk<'static, Event> =
+ serde_json::from_slice(&bytes)?;
+ reports.lock().unwrap().push(events);
+ Ok::<_, Error>(Response::new(Body::from(vec![])))
+ }
+ }))
+ }
+ }));
+ let addr = server.local_addr();
+ tokio::spawn(server);
+
+ let metrics = Metrics::default();
+ let client = http::new_client();
+ let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
+ let now = Utc::now();
+
+ // no counters have been registered
+ collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+ let r = std::mem::take(&mut *reports2.lock().unwrap());
+ assert!(r.is_empty());
+
+ // register a new counter
+ let counter = metrics.register(Ids {
+ endpoint_id: "e1".to_string(),
+ branch_id: "b1".to_string(),
+ });
+
+ // the counter should be observed despite 0 egress
+ collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+ let r = std::mem::take(&mut *reports2.lock().unwrap());
+ assert_eq!(r.len(), 1);
+ assert_eq!(r[0].events.len(), 1);
+ assert_eq!(r[0].events[0].value, 0);
+
+ // record egress
+ counter.record_egress(1);
+
+ // egress should be observered
+ collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+ let r = std::mem::take(&mut *reports2.lock().unwrap());
+ assert_eq!(r.len(), 1);
+ assert_eq!(r[0].events.len(), 1);
+ assert_eq!(r[0].events[0].value, 1);
+
+ // release counter
+ drop(counter);
+
+ // we do not observe the counter
+ collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+ let r = std::mem::take(&mut *reports2.lock().unwrap());
+ assert!(r.is_empty());
+
+ // counter is unregistered
+ assert!(metrics.endpoints.is_empty());
+ }
}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index f9da145859..71e00ed58f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -7,6 +7,7 @@ use crate::{
compute::{self, PostgresConnection},
config::{ProxyConfig, TlsConfig},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+ metrics::{Ids, USAGE_METRICS},
protocol2::WithClientIp,
stream::{PqStream, Stream},
};
@@ -602,6 +603,11 @@ pub async fn proxy_pass(
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: &MetricsAuxInfo,
) -> anyhow::Result<()> {
+ let usage = USAGE_METRICS.register(Ids {
+ endpoint_id: aux.endpoint_id.to_string(),
+ branch_id: aux.branch_id.to_string(),
+ });
+
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
let mut client = MeasuredStream::new(
client,
@@ -609,6 +615,7 @@ pub async fn proxy_pass(
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
+ usage.record_egress(cnt as u64);
},
);
@@ -690,7 +697,14 @@ impl Client<'_, S> {
.await
{
Ok(auth_result) => auth_result,
- Err(e) => return stream.throw_error(e).await,
+ Err(e) => {
+ let user = creds.get_user();
+ let db = params.get("database");
+ let app = params.get("application_name");
+ let params_span = tracing::info_span!("", ?user, ?db, ?app);
+
+ return stream.throw_error(e).instrument(params_span).await;
+ }
};
let AuthSuccess {
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 1254c4e779..6fbaa08512 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -105,6 +105,8 @@ class NeonCompare(PgCompare):
self._pg_bin = pg_bin
self.pageserver_http_client = self.env.pageserver.http_client()
+ # note that neon_simple_env now uses LOCAL_FS remote storage
+
# Create tenant
tenant_conf: Dict[str, str] = {}
if False: # TODO add pytest setting for this
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0667403ba3..92e7cd06cd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -460,9 +460,11 @@ class NeonEnvBuilder:
), "Unexpectedly instantiated from outside a test function"
self.test_name = test_name
- def init_configs(self) -> NeonEnv:
+ def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv:
# Cannot create more than one environment from one builder
assert self.env is None, "environment already initialized"
+ if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
+ self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
self.env = NeonEnv(self)
return self.env
@@ -470,8 +472,19 @@ class NeonEnvBuilder:
assert self.env is not None, "environment is not already initialized, call init() first"
self.env.start()
- def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
- env = self.init_configs()
+ def init_start(
+ self,
+ initial_tenant_conf: Optional[Dict[str, str]] = None,
+ default_remote_storage_if_missing: bool = True,
+ ) -> NeonEnv:
+ """
+ Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
+
+ To avoid creating initial_tenant, call init_configs to setup the environment.
+
+ Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
+ """
+ env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
self.start()
# Prepare the default branch to start the postgres on later.
@@ -546,7 +559,7 @@ class NeonEnvBuilder:
user: RemoteStorageUser,
bucket_name: Optional[str] = None,
bucket_region: Optional[str] = None,
- ) -> Optional[RemoteStorage]:
+ ) -> RemoteStorage:
ret = kind.configure(
self.repo_dir,
self.mock_s3_server,
@@ -889,6 +902,8 @@ def _shared_simple_env(
"""
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
is set, this is shared by all tests using `neon_simple_env`.
+
+ This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
"""
if os.environ.get("TEST_SHARED_FIXTURES") is None:
@@ -1481,6 +1496,16 @@ class NeonAttachmentService:
self.running = False
return self
+ def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
+ response = requests.post(
+ f"{self.env.control_plane_api}/attach_hook",
+ json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
+ )
+ response.raise_for_status()
+ gen = response.json()["gen"]
+ assert isinstance(gen, int)
+ return gen
+
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -1689,12 +1714,7 @@ class NeonPageserver(PgProtocol):
to call into the pageserver HTTP client.
"""
if self.env.attachment_service is not None:
- response = requests.post(
- f"{self.env.control_plane_api}/attach_hook",
- json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
- )
- response.raise_for_status()
- generation = response.json()["gen"]
+ generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
else:
generation = None
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 9373073abf..9fdcd22bc2 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -620,3 +620,8 @@ class PageserverHttpClient(requests.Session):
},
)
self.verbose_error(res)
+
+ def deletion_queue_flush(self, execute: bool = False):
+ self.put(
+ f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
+ ).raise_for_status()
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 2e5d75a0fc..70c2a06a07 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional
def list_prefix(
- neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+ neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
) -> ListObjectsV2OutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
@@ -287,7 +287,7 @@ def list_prefix(
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
response = remote.client.list_objects_v2(
- Delimiter="/",
+ Delimiter=delimiter,
Bucket=remote.bucket_name,
Prefix=prefix,
)
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index f7cddbc821..535f8c2fe7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -202,9 +202,6 @@ class RemoteStorageKind(str, enum.Enum):
LOCAL_FS = "local_fs"
MOCK_S3 = "mock_s3"
REAL_S3 = "real_s3"
- # Pass to tests that are generic to remote storage
- # to ensure the test pass with or without the remote storage
- NOOP = "noop"
def configure(
self,
@@ -215,10 +212,7 @@ class RemoteStorageKind(str, enum.Enum):
user: RemoteStorageUser,
bucket_name: Optional[str] = None,
bucket_region: Optional[str] = None,
- ) -> Optional[RemoteStorage]:
- if self == RemoteStorageKind.NOOP:
- return None
-
+ ) -> RemoteStorage:
if self == RemoteStorageKind.LOCAL_FS:
return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user))
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index d0462844f0..c80f2d8360 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -4,7 +4,12 @@ from typing import List, Tuple
import pytest
from fixtures.log_helper import log
-from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder
+from fixtures.neon_fixtures import (
+ Endpoint,
+ NeonEnv,
+ NeonEnvBuilder,
+ wait_for_last_flush_lsn,
+)
from fixtures.types import TenantId, TimelineId
@@ -26,17 +31,18 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []
- for _ in range(4):
+ for _ in range(3):
tenant_id, timeline_id = env.neon_cli.create_tenant()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()
tenant_timelines.append((tenant_id, timeline_id, endpoint))
- # Stop the pageserver
+ # Stop the pageserver -- this has to be not immediate or we need to wait for uploads
env.pageserver.stop()
# Leave the first timeline alone, but corrupt the others in different ways
@@ -45,30 +51,21 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
(tenant1, timeline1, pg1) = tenant_timelines[1]
metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
- f = open(metadata_path, "w")
- f.write("overwritten with garbage!")
- f.close()
+ with open(metadata_path, "w") as f:
+ f.write("overwritten with garbage!")
log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
(tenant2, timeline2, pg2) = tenant_timelines[2]
timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
- for filename in os.listdir(timeline_path):
- if filename.startswith("00000"):
- # Looks like a layer file. Remove it
- os.remove(f"{timeline_path}/{filename}")
- log.info(
- f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
- )
-
- (tenant3, timeline3, pg3) = tenant_timelines[3]
- timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/"
for filename in os.listdir(timeline_path):
if filename.startswith("00000"):
# Looks like a layer file. Corrupt it
- f = open(f"{timeline_path}/{filename}", "w")
- f.write("overwritten with garbage!")
- f.close()
- log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")
+ p = f"{timeline_path}/{filename}"
+ size = os.path.getsize(p)
+ with open(p, "wb") as f:
+ f.truncate(0)
+ f.truncate(size)
+ log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
env.pageserver.start()
@@ -87,22 +84,13 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
)
- # Second timeline has no ancestors, only the metadata file and no layer files locally,
- # and we don't have the remote storage enabled. It is loaded into memory, but getting
- # the basebackup from it will fail.
- with pytest.raises(
- Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
- ) as err:
- pg2.start()
- log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
-
- # Third timeline will also fail during basebackup, because the layer file is corrupt.
+ # Second timeline will fail during basebackup, because the local layer file is corrupt.
# It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
# (We don't check layer file contents on startup, when loading the timeline)
with pytest.raises(Exception, match="Failed to load delta layer") as err:
- pg3.start()
+ pg2.start()
log.info(
- f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
+ f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
)
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 740e489759..d4cf1b4739 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -211,4 +211,12 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
ddl.wait()
ddl.failures(False)
+ cur.execute("CREATE DATABASE failure WITH OWNER=cork")
+ ddl.wait()
+ with pytest.raises(psycopg2.InternalError):
+ ddl.failures(True)
+ cur.execute("DROP DATABASE failure")
+ ddl.wait()
+ ddl.pg.connect(dbname="failure") # Ensure we can connect after a failed drop
+
conn.close()
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index ae62fdf4a4..f3f3a1ddf3 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -74,11 +74,13 @@ class EvictionEnv:
pgbench_init_lsns: Dict[TenantId, Lsn]
def timelines_du(self) -> Tuple[int, int, int]:
- return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
+ return poor_mans_du(
+ self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
+ )
def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
return {
- (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
+ (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
for tid, tlid in self.timelines
}
@@ -89,7 +91,21 @@ class EvictionEnv:
"""
lsn = self.pgbench_init_lsns[tenant_id]
with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
- self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])
+ # instead of using pgbench --select-only which does point selects,
+ # run full table scans for all tables
+ with endpoint.connect() as conn:
+ cur = conn.cursor()
+
+ tables_cols = {
+ "pgbench_accounts": "abalance",
+ "pgbench_tellers": "tbalance",
+ "pgbench_branches": "bbalance",
+ "pgbench_history": "delta",
+ }
+
+ for table, column in tables_cols.items():
+ cur.execute(f"select avg({column}) from {table}")
+ _avg = cur.fetchone()
def pageserver_start_with_disk_usage_eviction(
self, period, max_usage_pct, min_avail_bytes, mock_behavior
@@ -127,6 +143,19 @@ class EvictionEnv:
self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
+def human_bytes(amt: float) -> str:
+ suffixes = ["", "Ki", "Mi", "Gi"]
+
+ last = suffixes[-1]
+
+ for name in suffixes:
+ if amt < 1024 or name == last:
+ return f"{int(round(amt))} {name}B"
+ amt = amt / 1024
+
+ raise RuntimeError("unreachable")
+
+
@pytest.fixture
def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
"""
@@ -215,8 +244,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
healthy_tenant_id, healthy_timeline_id = env.timelines[1]
- broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
- healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+ broken_size_pre, _, _ = poor_mans_du(
+ env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+ )
+ healthy_size_pre, _, _ = poor_mans_du(
+ env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+ )
# try to evict everything, then validate that broken tenant wasn't touched
target = broken_size_pre + healthy_size_pre
@@ -224,8 +257,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
- broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
- healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+ broken_size_post, _, _ = poor_mans_du(
+ env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+ )
+ healthy_size_post, _, _ = poor_mans_du(
+ env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+ )
assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
assert healthy_size_post < healthy_size_pre
@@ -366,18 +403,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
du_by_timeline = env.du_by_timeline()
# pick any tenant
- [our_tenant, other_tenant] = list(du_by_timeline.keys())
- (tenant_id, timeline_id) = our_tenant
+ [warm, cold] = list(du_by_timeline.keys())
+ (tenant_id, timeline_id) = warm
- # make our tenant more recently used than the other one
+ # make picked tenant more recently used than the other one
env.warm_up_tenant(tenant_id)
# Build up enough pressure to require evictions from both tenants,
# but not enough to fall into global LRU.
- # So, set target to all occipied space, except 2*env.layer_size per tenant
- target = (
- du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
- )
+ # So, set target to all occupied space, except 2*env.layer_size per tenant
+ target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
@@ -392,22 +427,33 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
later_tenant_usage < du_by_timeline[tenant]
), "all tenants should have lost some layers"
+ warm_size = later_du_by_timeline[warm]
+
+ # bounds for warmed_size
+ warm_lower = 0.5 * du_by_timeline[warm]
+
+ # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+ # So, check for up to 3 here.
+ warm_upper = warm_lower + 3 * env.layer_size
+
+ cold_size = later_du_by_timeline[cold]
+ cold_upper = 2 * env.layer_size
+
+ log.info(
+ f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
+ )
+ log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+ assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+ assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
assert (
- later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
- ), "our warmed up tenant should be at about half capacity, part 1"
- assert (
- # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
- # So, check for up to 3 here.
- later_du_by_timeline[our_tenant]
- < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
- ), "our warmed up tenant should be at about half capacity, part 2"
- assert (
- later_du_by_timeline[other_tenant] < 2 * env.layer_size
- ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
+ cold_size < cold_upper
+ ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
def poor_mans_du(
- env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
+ env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
) -> Tuple[int, int, int]:
"""
Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -430,9 +476,11 @@ def poor_mans_du(
smallest_layer = min(smallest_layer, size)
else:
smallest_layer = size
- log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
+ if verbose:
+ log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})")
- log.info(f"{tenant_id}/{timeline_id}: sum {total}")
+ if verbose:
+ log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})")
total_on_disk += total
assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
new file mode 100644
index 0000000000..81d38ac934
--- /dev/null
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -0,0 +1,352 @@
+"""
+
+Tests in this module exercise the pageserver's behavior around generation numbers,
+as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require
+of the pageserver are:
+- Do not start a tenant without a generation number if control_plane_api is set
+- Remote objects must be suffixed with generation
+- Deletions may only be executed after validating generation
+- Updates to remote_consistent_lsn may only be made visible after validating generation
+"""
+
+
+import re
+import time
+from typing import Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+ NeonEnv,
+ NeonEnvBuilder,
+ PgBin,
+ last_flush_lsn_upload,
+ wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import list_prefix
+from fixtures.remote_storage import (
+ RemoteStorageKind,
+)
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import print_gc_result, wait_until
+
+# A tenant configuration that is convenient for generating uploads and deletions
+# without a large amount of postgres traffic.
+TENANT_CONF = {
+ # small checkpointing and compaction targets to ensure we generate many upload operations
+ "checkpoint_distance": f"{128 * 1024}",
+ "compaction_threshold": "1",
+ "compaction_target_size": f"{128 * 1024}",
+ # no PITR horizon, we specify the horizon when we request on-demand GC
+ "pitr_interval": "0s",
+ # disable background compaction and GC. We invoke it manually when we want it to happen.
+ "gc_period": "0s",
+ "compaction_period": "0s",
+ # create image layers eagerly, so that GC can remove some layers
+ "image_creation_threshold": "1",
+}
+
+
+def generate_uploads_and_deletions(
+ env: NeonEnv,
+ *,
+ init: bool = True,
+ tenant_id: Optional[TenantId] = None,
+ timeline_id: Optional[TimelineId] = None,
+ data: Optional[str] = None,
+):
+ """
+ Using the environment's default tenant + timeline, generate a load pattern
+ that results in some uploads and some deletions to remote storage.
+ """
+
+ if tenant_id is None:
+ tenant_id = env.initial_tenant
+ assert tenant_id is not None
+
+ if timeline_id is None:
+ timeline_id = env.initial_timeline
+ assert timeline_id is not None
+
+ ps_http = env.pageserver.http_client()
+
+ with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+ if init:
+ endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+ last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+ def churn(data):
+ endpoint.safe_psql_many(
+ [
+ f"""
+ INSERT INTO foo (id, val)
+ SELECT g, '{data}'
+ FROM generate_series(1, 20000) g
+ ON CONFLICT (id) DO UPDATE
+ SET val = EXCLUDED.val
+ """,
+ # to ensure that GC can actually remove some layers
+ "VACUUM foo",
+ ]
+ )
+ assert tenant_id is not None
+ assert timeline_id is not None
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+ ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+ # Compaction should generate some GC-elegible layers
+ for i in range(0, 2):
+ churn(f"{i if data is None else data}")
+
+ gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
+ print_gc_result(gc_result)
+ assert gc_result["layers_removed"] > 0
+
+
+def get_metric_or_0(ps_http, metric: str) -> int:
+ v = ps_http.get_metric_value(metric)
+ return 0 if v is None else int(v)
+
+
+def get_deletion_queue_executed(ps_http) -> int:
+ return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
+
+
+def get_deletion_queue_submitted(ps_http) -> int:
+ return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
+
+
+def get_deletion_queue_dropped(ps_http) -> int:
+ return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
+
+
+def get_deletion_queue_unexpected_errors(ps_http) -> int:
+ return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
+
+
+def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
+ return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
+
+
+def get_deletion_queue_depth(ps_http) -> int:
+ """
+ Queue depth if at least one deletion has been submitted, else None
+ """
+ submitted = get_deletion_queue_submitted(ps_http)
+ executed = get_deletion_queue_executed(ps_http)
+ dropped = get_deletion_queue_dropped(ps_http)
+ depth = submitted - executed - dropped
+ log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
+
+ assert depth >= 0
+ return int(depth)
+
+
+def assert_deletion_queue(ps_http, size_fn) -> None:
+ v = get_deletion_queue_depth(ps_http)
+ assert v is not None
+ assert size_fn(v) is True
+
+
+def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
+ """
+ Validate behavior when a pageserver is run without generation support enabled,
+ then started again after activating it:
+ - Before upgrade, no objects should have generation suffixes
+ - After upgrade, the bucket should contain a mixture.
+ - In both cases, postgres I/O should work.
+ """
+ neon_env_builder.enable_generations = True
+ neon_env_builder.enable_pageserver_remote_storage(
+ RemoteStorageKind.MOCK_S3,
+ )
+
+ env = neon_env_builder.init_configs()
+ env.broker.try_start()
+ for sk in env.safekeepers:
+ sk.start()
+ assert env.attachment_service is not None
+ env.attachment_service.start()
+
+ env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
+
+ env.neon_cli.create_tenant(
+ tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
+ )
+ generate_uploads_and_deletions(env)
+
+ def parse_generation_suffix(key):
+ m = re.match(".+-([0-9a-zA-Z]{8})$", key)
+ if m is None:
+ return None
+ else:
+ log.info(f"match: {m}")
+ log.info(f"group: {m.group(1)}")
+ return int(m.group(1), 16)
+
+ pre_upgrade_keys = list(
+ [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
+ )
+ for key in pre_upgrade_keys:
+ assert parse_generation_suffix(key) is None
+
+ env.pageserver.stop()
+
+ # Starting without the override that disabled control_plane_api
+ env.pageserver.start()
+
+ generate_uploads_and_deletions(env, init=False)
+
+ legacy_objects: list[str] = []
+ suffixed_objects = []
+ post_upgrade_keys = list(
+ [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
+ )
+ for key in post_upgrade_keys:
+ log.info(f"post-upgrade key: {key}")
+ if parse_generation_suffix(key) is not None:
+ suffixed_objects.append(key)
+ else:
+ legacy_objects.append(key)
+
+ # Bucket now contains a mixture of suffixed and non-suffixed objects
+ assert len(suffixed_objects) > 0
+ assert len(legacy_objects) > 0
+
+ assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
+
+
+def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
+ neon_env_builder.enable_generations = True
+ neon_env_builder.enable_pageserver_remote_storage(
+ RemoteStorageKind.MOCK_S3,
+ )
+ env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+ assert env.attachment_service is not None
+
+ some_other_pageserver = 1234
+ ps_http = env.pageserver.http_client()
+
+ generate_uploads_and_deletions(env)
+
+ # Flush: pending deletions should all complete
+ assert_deletion_queue(ps_http, lambda n: n > 0)
+ ps_http.deletion_queue_flush(execute=True)
+ assert_deletion_queue(ps_http, lambda n: n == 0)
+ assert get_deletion_queue_dropped(ps_http) == 0
+
+ # Our visible remote_consistent_lsn should match projected
+ timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+ assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
+ assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+ env.pageserver.allowed_errors.extend(
+ [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+ )
+
+ # Now advance the generation in the control plane: subsequent validations
+ # from the running pageserver will fail. No more deletions should happen.
+ env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
+ generate_uploads_and_deletions(env, init=False)
+
+ assert_deletion_queue(ps_http, lambda n: n > 0)
+ queue_depth_before = get_deletion_queue_depth(ps_http)
+ executed_before = get_deletion_queue_executed(ps_http)
+ ps_http.deletion_queue_flush(execute=True)
+
+ # Queue drains to zero because we dropped deletions
+ assert_deletion_queue(ps_http, lambda n: n == 0)
+ # The executed counter has not incremented
+ assert get_deletion_queue_executed(ps_http) == executed_before
+ # The dropped counter has incremented to consume all of the deletions that were previously enqueued
+ assert get_deletion_queue_dropped(ps_http) == queue_depth_before
+
+ # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
+ # because generation validation fails.
+ timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+ assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
+ assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
+
+ # TODO: list bucket and confirm all objects have a generation suffix.
+
+ assert get_deletion_queue_unexpected_errors(ps_http) == 0
+
+
+@pytest.mark.parametrize("keep_attachment", [True, False])
+def test_deletion_queue_recovery(
+ neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
+):
+ """
+ :param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
+ node took the attachment while we were restarting.
+ """
+ neon_env_builder.enable_generations = True
+ neon_env_builder.enable_pageserver_remote_storage(
+ RemoteStorageKind.MOCK_S3,
+ )
+ env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+ ps_http = env.pageserver.http_client()
+
+ # Prevent deletion lists from being executed, to build up some backlog of deletions
+ ps_http.configure_failpoints(
+ [
+ ("deletion-queue-before-execute", "return"),
+ ]
+ )
+
+ generate_uploads_and_deletions(env)
+
+ # There should be entries in the deletion queue
+ assert_deletion_queue(ps_http, lambda n: n > 0)
+ ps_http.deletion_queue_flush()
+ before_restart_depth = get_deletion_queue_depth(ps_http)
+
+ assert get_deletion_queue_unexpected_errors(ps_http) == 0
+ assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+ log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
+ env.pageserver.stop(immediate=True)
+
+ if not keep_attachment:
+ some_other_pageserver = 101010
+ assert env.attachment_service is not None
+ env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
+
+ env.pageserver.start()
+
+ def assert_deletions_submitted(n: int):
+ assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
+
+ # After restart, issue a flush to kick the deletion frontend to do recovery.
+ # It should recover all the operations we submitted before the restart.
+ ps_http.deletion_queue_flush(execute=False)
+ wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
+
+ # The queue should drain through completely if we flush it
+ ps_http.deletion_queue_flush(execute=True)
+ wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
+
+ if keep_attachment:
+ # If we kept the attachment, then our pre-restart deletions should have executed
+ # successfully
+ assert get_deletion_queue_executed(ps_http) == before_restart_depth
+ else:
+ # If we lost the attachment, we should have dropped our pre-restart deletions.
+ assert get_deletion_queue_dropped(ps_http) == before_restart_depth
+ env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
+
+ assert get_deletion_queue_unexpected_errors(ps_http) == 0
+ assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+ # Restart again
+ env.pageserver.stop(immediate=True)
+ env.pageserver.start()
+
+ # No deletion lists should be recovered: this demonstrates that deletion lists
+ # were cleaned up after being executed or dropped in the previous process lifetime.
+ time.sleep(1)
+ assert_deletion_queue(ps_http, lambda n: n == 0)
+
+ assert get_deletion_queue_unexpected_errors(ps_http) == 0
+ assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index dae39d2752..b76dbbee03 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -5,7 +5,6 @@ from pathlib import Path
from queue import SimpleQueue
from typing import Any, Dict, Set
-import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -17,15 +16,13 @@ from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
+# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
+
-@pytest.mark.parametrize(
- "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
-)
def test_metric_collection(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
- remote_storage_kind: RemoteStorageKind,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -55,7 +52,7 @@ def test_metric_collection(
synthetic_size_calculation_interval="3s"
"""
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
@@ -68,6 +65,14 @@ def test_metric_collection(
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+ # we have a fast rate of calculation, these can happen at shutdown
+ env.pageserver.allowed_errors.append(
+ ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
+ )
+ env.pageserver.allowed_errors.append(
+ ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+ )
+
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -98,17 +103,14 @@ def test_metric_collection(
total += sample[2]
return int(total)
- remote_uploaded = 0
-
# upload some data to remote storage
- if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
- wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
- pageserver_http = env.pageserver.http_client()
- pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
- pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+ pageserver_http = env.pageserver.http_client()
+ pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+ pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
- remote_uploaded = get_num_remote_ops("index", "upload")
- assert remote_uploaded > 0
+ remote_uploaded = get_num_remote_ops("index", "upload")
+ assert remote_uploaded > 0
# we expect uploads at 1Hz, on busy runners this could be too optimistic,
# so give 5s we only want to get the following upload after "ready" value.
@@ -211,6 +213,14 @@ def test_metric_collection_cleans_up_tempfile(
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+ # we have a fast rate of calculation, these can happen at shutdown
+ env.pageserver.allowed_errors.append(
+ ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
+ )
+ env.pageserver.allowed_errors.append(
+ ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+ )
+
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c6ddb54ee6..3a56ca51a6 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -30,9 +30,7 @@ from fixtures.types import TenantId
from fixtures.utils import run_pg_bench_small
-@pytest.mark.parametrize(
- "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
-)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_delete_smoke(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
@@ -43,6 +41,12 @@ def test_tenant_delete_smoke(
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
+ env.pageserver.allowed_errors.extend(
+ [
+ # The deletion queue will complain when it encounters simulated S3 errors
+ ".*deletion executor: DeleteObjects request failed.*",
+ ]
+ )
# lucky race with stopping from flushing a layer we fail to schedule any uploads
env.pageserver.allowed_errors.append(
@@ -138,18 +142,12 @@ FAILPOINTS_BEFORE_BACKGROUND = [
def combinations():
result = []
- remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+ remotes = [RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in FAILPOINTS:
- if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
- "timeline-delete-before-index-delete",
- ):
- # the above failpoint are not relevant for config without remote storage
- continue
-
# Simulate failures for only one type of remote storage
# to avoid log pollution and make tests run faster
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -195,27 +193,32 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
]
)
+ if simulate_failures:
+ env.pageserver.allowed_errors.extend(
+ [
+ # The deletion queue will complain when it encounters simulated S3 errors
+ ".*deletion executor: DeleteObjects request failed.*",
+ ]
+ )
+
ps_http = env.pageserver.http_client()
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
# generate enough layers
run_pg_bench_small(pg_bin, endpoint.connstr())
- if remote_storage_kind is RemoteStorageKind.NOOP:
- wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
- else:
- last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+ last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
- if remote_storage_kind in available_s3_storages():
- assert_prefix_not_empty(
- neon_env_builder,
- prefix="/".join(
- (
- "tenants",
- str(tenant_id),
- )
- ),
- )
+ if remote_storage_kind in available_s3_storages():
+ assert_prefix_not_empty(
+ neon_env_builder,
+ prefix="/".join(
+ (
+ "tenants",
+ str(tenant_id),
+ )
+ ),
+ )
ps_http.configure_failpoints((failpoint, "return"))
@@ -246,12 +249,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
env.pageserver.stop()
env.pageserver.start()
- if (
- remote_storage_kind is RemoteStorageKind.NOOP
- and failpoint == "tenant-delete-before-create-local-mark"
- ):
- tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
- elif failpoint in (
+ if failpoint in (
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
):
@@ -383,6 +381,7 @@ def test_tenant_delete_is_resumed_on_attach(
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
+ ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8ccbcf551d..a20523b1f3 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -519,11 +519,8 @@ def test_detach_while_attaching(
# * restart the pageserver and verify that ignored tenant is still not loaded
# * `load` the same tenant
# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
-def test_ignored_tenant_reattach(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 49a6ca5a53..7cea301a9c 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_until_tenant_active,
)
-from fixtures.pg_version import PgVersion, xfail_on_postgres
+from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TimelineId
@@ -532,7 +532,24 @@ def test_single_branch_get_tenant_size_grows(
assert size_after == prev, "size after restarting pageserver should not have changed"
-@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
+def assert_size_approx_equal(size_a, size_b):
+ """
+ Tests that evaluate sizes are checking the pageserver space consumption
+ that sits many layers below the user input. The exact space needed
+ varies slightly depending on postgres behavior.
+
+ Rather than expecting postgres to be determinstic and occasionally
+ failing the test, we permit sizes for the same data to vary by a few pages.
+ """
+
+ # Determined empirically from examples of equality failures: they differ
+ # by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid
+ # failing on outliers from that observed range.
+ threshold = 4 * 8272
+
+ assert size_a == pytest.approx(size_b, abs=threshold)
+
+
def test_get_tenant_size_with_multiple_branches(
neon_env_builder: NeonEnvBuilder, test_output_dir: Path
):
@@ -573,7 +590,7 @@ def test_get_tenant_size_with_multiple_branches(
)
size_after_first_branch = http_client.tenant_size(tenant_id)
- assert size_after_first_branch == size_at_branch
+ assert_size_approx_equal(size_after_first_branch, size_at_branch)
first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
@@ -599,7 +616,7 @@ def test_get_tenant_size_with_multiple_branches(
"second-branch", main_branch_name, tenant_id
)
size_after_second_branch = http_client.tenant_size(tenant_id)
- assert size_after_second_branch == size_after_continuing_on_main
+ assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
@@ -635,7 +652,7 @@ def test_get_tenant_size_with_multiple_branches(
# tenant_size but so far this has been reliable, even though at least gc
# and tenant_size race for the same locks
size_after = http_client.tenant_size(tenant_id)
- assert size_after == size_after_thinning_branch
+ assert_size_approx_equal(size_after, size_after_thinning_branch)
size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
size_debug = http_client.tenant_size_debug(tenant_id)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 677c0d18e8..40dff194aa 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -12,7 +12,6 @@ from fixtures.log_helper import log
from fixtures.metrics import (
PAGESERVER_GLOBAL_METRICS,
PAGESERVER_PER_TENANT_METRICS,
- PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
parse_metrics,
)
from fixtures.neon_fixtures import (
@@ -232,17 +231,10 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
assert value
-@pytest.mark.parametrize(
- "remote_storage_kind",
- # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
- [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
-)
-def test_pageserver_metrics_removed_after_detach(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
+def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
"""Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
neon_env_builder.num_safekeepers = 3
@@ -282,9 +274,6 @@ def test_pageserver_metrics_removed_after_detach(
for tenant in [tenant_1, tenant_2]:
pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
expected = set(PAGESERVER_PER_TENANT_METRICS)
- if remote_storage_kind == RemoteStorageKind.NOOP:
- # if there's no remote storage configured, we don't expose the remote timeline client metrics
- expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
assert pre_detach_samples == expected
env.pageserver.http_client().tenant_detach(tenant)
@@ -294,9 +283,7 @@ def test_pageserver_metrics_removed_after_detach(
# Check that empty tenants work with or without the remote storage
-@pytest.mark.parametrize(
- "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP]
-)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_pageserver_with_empty_tenants(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 0e4df21d83..3af144c31c 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
last_flush_lsn_upload,
- wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
@@ -145,19 +144,12 @@ DELETE_FAILPOINTS = [
def combinations():
result = []
- remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+ remotes = [RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in DELETE_FAILPOINTS:
- if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
- "timeline-delete-before-index-delete",
- "timeline-delete-after-index-delete",
- ):
- # the above failpoints are not relevant for config without remote storage
- continue
-
result.append((remote_storage_kind, delete_failpoint))
return result
@@ -205,23 +197,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
with env.endpoints.create_start("delete") as endpoint:
# generate enough layers
run_pg_bench_small(pg_bin, endpoint.connstr())
- if remote_storage_kind is RemoteStorageKind.NOOP:
- wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
- else:
- last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
- if remote_storage_kind in available_s3_storages():
- assert_prefix_not_empty(
- neon_env_builder,
- prefix="/".join(
- (
- "tenants",
- str(env.initial_tenant),
- "timelines",
- str(timeline_id),
- )
- ),
- )
+ last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
+
+ if remote_storage_kind in available_s3_storages():
+ assert_prefix_not_empty(
+ neon_env_builder,
+ prefix="/".join(
+ (
+ "tenants",
+ str(env.initial_tenant),
+ "timelines",
+ str(timeline_id),
+ )
+ ),
+ )
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
# It appears when we stopped flush loop during deletion and then pageserver is stopped
@@ -807,6 +797,8 @@ def test_delete_orphaned_objects(
reason = timeline_info["state"]["Broken"]["reason"]
assert reason.endswith(f"failpoint: {failpoint}"), reason
+ ps_http.deletion_queue_flush(execute=True)
+
for orphan in orphans:
assert not orphan.exists()
assert env.pageserver.log_contains(
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index f856b26c6e..c2e93c48c7 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -301,12 +301,8 @@ def test_timeline_initial_logical_size_calculation_cancellation(
# message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_init(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
- if remote_storage_kind is not None:
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
@@ -337,17 +333,12 @@ def test_timeline_physical_size_init(
)
assert_physical_size_invariants(
- get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
- remote_storage_kind,
+ get_physical_size_values(env, env.initial_tenant, new_timeline_id),
)
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_checkpoint(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
- if remote_storage_kind is not None:
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder):
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
@@ -369,19 +360,14 @@ def test_timeline_physical_size_post_checkpoint(
def check():
assert_physical_size_invariants(
- get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
- remote_storage_kind,
+ get_physical_size_values(env, env.initial_tenant, new_timeline_id),
)
wait_until(10, 1, check)
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_compaction(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
- if remote_storage_kind is not None:
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
@@ -420,21 +406,15 @@ def test_timeline_physical_size_post_compaction(
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
- if remote_storage_kind is not None:
- wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+ wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
- get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
- remote_storage_kind,
+ get_physical_size_values(env, env.initial_tenant, new_timeline_id),
)
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_gc(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
- if remote_storage_kind is not None:
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
@@ -471,12 +451,10 @@ def test_timeline_physical_size_post_gc(
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
- if remote_storage_kind is not None:
- wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+ wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
- get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
- remote_storage_kind,
+ get_physical_size_values(env, env.initial_tenant, new_timeline_id),
)
@@ -560,14 +538,10 @@ def test_timeline_size_metrics(
assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_tenant_physical_size(
- neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
+def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
random.seed(100)
- if remote_storage_kind is not None:
- neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
@@ -575,12 +549,10 @@ def test_tenant_physical_size(
client = env.pageserver.http_client()
tenant, timeline = env.neon_cli.create_tenant()
- if remote_storage_kind is not None:
- wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
def get_timeline_resident_physical_size(timeline: TimelineId):
- sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
- assert_physical_size_invariants(sizes, remote_storage_kind)
+ sizes = get_physical_size_values(env, tenant, timeline)
+ assert_physical_size_invariants(sizes)
return sizes.prometheus_resident_physical
timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -600,8 +572,7 @@ def test_tenant_physical_size(
wait_for_last_flush_lsn(env, endpoint, tenant, timeline)
pageserver_http.timeline_checkpoint(tenant, timeline)
- if remote_storage_kind is not None:
- wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
+ wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
@@ -630,7 +601,6 @@ def get_physical_size_values(
env: NeonEnv,
tenant_id: TenantId,
timeline_id: TimelineId,
- remote_storage_kind: Optional[RemoteStorageKind],
) -> TimelinePhysicalSizeValues:
res = TimelinePhysicalSizeValues()
@@ -646,12 +616,9 @@ def get_physical_size_values(
res.prometheus_resident_physical = metrics.query_one(
"pageserver_resident_physical_size", metrics_filter
).value
- if remote_storage_kind is not None:
- res.prometheus_remote_physical = metrics.query_one(
- "pageserver_remote_physical_size", metrics_filter
- ).value
- else:
- res.prometheus_remote_physical = None
+ res.prometheus_remote_physical = metrics.query_one(
+ "pageserver_remote_physical_size", metrics_filter
+ ).value
detail = client.timeline_detail(
tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
@@ -664,20 +631,15 @@ def get_physical_size_values(
return res
-def assert_physical_size_invariants(
- sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
-):
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
# resident phyiscal size is defined as
assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum
# we don't do layer eviction, so, all layers are resident
assert sizes.api_current_physical == sizes.prometheus_resident_physical
- if remote_storage_kind is not None:
- assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
- # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
- else:
- assert sizes.prometheus_remote_physical is None
+ assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
+ # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
# Timeline logical size initialization is an asynchronous background task that runs once,