Compare commits

..

12 Commits

Author SHA1 Message Date
John Spray
2ce2574aa4 docs: pageserver controller rfc 2023-09-29 18:24:24 +01:00
John Spray
dc5f107170 docs: sharding phase 1 RFC 2023-09-29 18:20:13 +01:00
John Spray
1569446396 clarifications 2023-09-27 10:38:52 +01:00
John Spray
a8143a3bed Align cutover downloads with heatmap 2023-09-27 10:38:52 +01:00
John Spray
689b6f14b7 Apply suggestions from code review
Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-09-27 10:38:41 +01:00
John Spray
9c1c06ad17 Bump number of migration RFC 2023-09-26 10:57:11 +01:00
John Spray
40d2a73a0c Merge remote-tracking branch 'upstream/main' into jcsp/rfc-migration 2023-09-26 10:56:57 +01:00
John Spray
89ddefb428 Note safety requirement for AttachedMulti & out of scope item 2023-09-12 14:15:46 +01:00
John Spray
cad0799521 Mention disabling consumption metrics in AttachedStale 2023-09-12 14:04:01 +01:00
John Spray
1143e2e9ce Clarifications 2023-09-12 12:13:29 +01:00
Christian Schwarz
ef3e75abc3 for #5029 (rfc tenant migrations): editorial fixes (#5185) 2023-09-01 18:10:44 +01:00
John Spray
cfb285139c docs/rfcs: add RFC for fast tenant migration/failover 2023-08-31 10:55:17 +01:00
77 changed files with 1591 additions and 4824 deletions

View File

@@ -834,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.12
VM_BUILDER_VERSION: v0.17.11
steps:
- name: Checkout

View File

@@ -5,7 +5,7 @@
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/storage
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers

52
Cargo.lock generated
View File

@@ -158,17 +158,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "async-channel"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
dependencies = [
"concurrent-queue",
"event-listener",
"futures-core",
]
[[package]]
name = "async-compression"
version = "0.4.0"
@@ -1026,15 +1015,6 @@ dependencies = [
"zstd",
]
[[package]]
name = "concurrent-queue"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "const_format"
version = "0.2.30"
@@ -1455,12 +1435,6 @@ dependencies = [
"libc",
]
[[package]]
name = "event-listener"
version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "fail"
version = "0.5.1"
@@ -1806,9 +1780,18 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.3.3"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "hex"
@@ -2070,7 +2053,7 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.1",
"libc",
"windows-sys 0.48.0",
]
@@ -2087,7 +2070,7 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.1",
"io-lifetimes",
"rustix 0.37.19",
"windows-sys 0.48.0",
@@ -2461,11 +2444,11 @@ dependencies = [
[[package]]
name = "num_cpus"
version = "1.16.0"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
dependencies = [
"hermit-abi",
"hermit-abi 0.2.6",
"libc",
]
@@ -2682,7 +2665,6 @@ name = "pageserver"
version = "0.1.0"
dependencies = [
"anyhow",
"async-channel",
"async-compression",
"async-stream",
"async-trait",
@@ -3264,7 +3246,6 @@ dependencies = [
"reqwest-tracing",
"routerify",
"rstest",
"rustc-hash",
"rustls",
"rustls-pemfile",
"scopeguard",
@@ -3436,7 +3417,6 @@ dependencies = [
"metrics",
"once_cell",
"pin-project-lite",
"rand",
"scopeguard",
"serde",
"serde_json",

View File

@@ -107,7 +107,6 @@ reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.21"
rustls-pemfile = "1"
rustls-split = "0.3"

View File

@@ -614,11 +614,15 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
#########################################################################################
#
# Layer "rust extensions" for older extension which hasn't been updated to `pgrx` yet
# Layer "rust extensions"
# This layer is used to build `pgx` deps
#
# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
# dependency on all the rust extension that depend on it, too.
#
#########################################################################################
FROM build-deps AS rust-extensions-build-pgx
FROM build-deps AS rust-extensions-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
@@ -650,34 +654,6 @@ RUN case "${PG_VERSION}" in \
USER root
#########################################################################################
#
# Layer "rust extensions"
# This layer is used to build `pgrx` deps
#
#########################################################################################
FROM build-deps AS rust-extensions-build-pgrx
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y curl libclang-dev cmake && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --locked --version 0.10.2 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
#########################################################################################
#
# Layer "pg-jsonschema-pg-build"
@@ -685,7 +661,7 @@ USER root
#
#########################################################################################
FROM rust-extensions-build-pgx AS pg-jsonschema-pg-build
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
@@ -714,7 +690,7 @@ RUN case "${PG_VERSION}" in \
#
#########################################################################################
FROM rust-extensions-build-pgx AS pg-graphql-pg-build
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
@@ -748,14 +724,24 @@ RUN case "${PG_VERSION}" in \
#
#########################################################################################
FROM rust-extensions-build-pgrx AS pg-tiktoken-pg-build
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
cargo pgrx install --release && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
@@ -765,18 +751,24 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
#
#########################################################################################
FROM rust-extensions-build-pgrx AS pg-pgx-ulid-build
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
echo "********************************************************************************************************" && \
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################

View File

@@ -223,7 +223,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1;
}
tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?;

View File

@@ -23,7 +23,7 @@ vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
ignore = ["RUSTSEC-2023-0052"]
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:

View File

@@ -0,0 +1,244 @@
# Sharding Phase 1: Static Key-space Sharding
## Summary
To enable databases with sizes approaching the capacity of a pageserver's disk,
it is necessary to break up the storage for the database, or _shard_ it.
Sharding in general is a complex area. This RFC aims to define a modest initial
capability that will permit creating large-capacity databases using a static configuration
defined at time of Tenant creation.
## Motivation
Currently, all data for a Tenant, including all its timelines, is stored on a single
pageserver. The local storage required may be several times larger than the actual
database size, due to LSM write inflation.
If a database is larger than what one pageserver can hold, then it becomes impossible
for the pageserver to hold it in local storage, as it must do to provide service to
clients.
### Prior art
Numerous: sharding is a long-discussed feature for the pageserver.
Prior art in other distributed systems is too broad to capture here: pretty much
any scale out storage system does something like this.
## Requirements
- Enable creating a large (for example, 16TiB) database without requiring dedicated
pageserver nodes.
- Share read/write bandwidth costs for large databases across pageservers, as well
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
that disrupt service to other tenants.
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
does not write out a single contiguous ranges of page numbers.
*Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
that a user might create on a current-gen enterprise SSD should also work well on
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
pageserver backend is not the limiting factor in the database size*.
## Non Goals
- Independently distributing timelines within the same tenant. If a tenant has many
timelines, then sharding may be a less efficient mechanism for distributing load than
sharing out timelines between pageservers.
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
based on the idea that separate mechanisms will make sense for each dimension.
## Impacted Components
pageserver, control plane, safekeeper (optional)
## Terminology
**Key**: a postgres page number. In the sense that the pageserver is a versioned key-value store,
the page number is the key in that store.
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
of keys and LSNs as a two dimensional space.
## Implementation
### Key sharding vs. LSN sharding
When we think of sharding across the two dimensional key/lsn space, this is an
opportunity to think about how the two dimensions differ:
- Sharding the key space distributes the _write_ workload of ingesting data
and compacting. This work must be carefully managed so that exactly one
node owns a given key.
- Sharding the LSN space distributes the _historical read_ workload. This work
can be done by anyone without any special coordination, as long as they can
see the remote index and layers.
The key sharding is the harder part, and also the more urgent one, to support larger
capacity databases. Because distributing historical LSN read work is a relatively
simpler problem that most users don't have, we defer it to future work. It is anticipated
that some quite simple P2P offload model will enable distributing work for historical
reads: a node which is low on space can call out to peer to ask it to download and
serve reads from a historical layer.
### Key mapping scheme
Having decided to focus on key sharding, we must next decide how we will map
keys to shards.
It is proposed to use a "wide striping" approach, to obtain a good compromise
between data locality and avoiding entire large relations mapping to the same shard.
The mapping is quite simple:
- Define a stripe size, such as 256MiB. Map this to a key count, such that a contiguous
range of 256MiB keys would all fall into this stripe, i.e. divide by 8kiB to get 32k.
- Map a key to a stripe by integer division.
- Map a stripe to a shard by taking the shard index modulo the shard count.
This scheme will achieve a good balance as long as there is no aliasing of the keys
to the stripe width. In the example above, if someone had 4 shards and wrote
keys that were all 4*32k apart, they would all map to the same shard. However, we do
not have to worry about this, since end users do not control page numbers: as long as
we do not pick stripe sizes that map to any problematic postgres behaviors, we'll be fine.
### Important Types
#### `ShardMap`
Provides all the information needed to route a request for a particular
key to the correct pageserver:
- Stripe size
- Shard count
- Address of the pageserver hosting each shard
This structure's size is linear with the number of shards.
#### `ShardIdentity`
Provides the information needed to know whether a particular key belongs
to a particular shard:
- Stripe size
- Shard count
- Shard index
This structure's size is constant.
### Pageserver changes
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
TenantShards, which are just a `Tenant` plus a `ShardIdentity` telling it which part
of the keyspace it owns.
When the pageserver subscribes to a safekeeper for WAL updates, it must provide
its `ShardIdentity` to receive the relevant subset of the WAL.
When the pageserver writes layers and index_part.json to remote storage, it must
include the shard index & count in the name, to avoid collisions (the count is
necessary for future-proofing: the count will vary in time). These keys
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
exactly the same for TenantShards as it does for Tenants today: each shard will have
its own generation number.
The pageserver doesn't have to do anything special during ingestion, compaction
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
This will result in sparse layer files, containing keys only in the stripes that this
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
the key range, these should be updated to ignore gaps that are due to sharding, to
avoid spuriously splitting up layers ito stripe-sized pieces.
### Pageserver Controller changes
The pageserver controller is a new component, which is responsible for abstracting
away the business of managing individual tenant placement on pagservers. It will
also act as the abstraction on top of sharding, so that the control plane continue
to see a Tenant as a single object, even though the reality is that it is many
TenantShards.
For the rest of this RFC, think of the Pageserver Controller as a component of
the control plane. The actual implementation is beyond the scope of this RFC
and will be described in more detail elsewhere.
### Safekeeper changes
The safekeeper's API for subscribing to a WAL will be extended to enable callers
to provide a `ShardIdentity`. In this mode it will only send WAL entries that
fall within the keyspace belonging to the shard, and WAL entries that are to
be mirrored to all shards.
Metadata updates describing databases+relations are mirrored to
all shards, and other WAL messages are only provided to the shard
that owns the key being updated. For any operation that updates multiple
keys, it will be provided to all the shards whose key ranges intersect with
one or more of the keys referenced in the WAL message.
### Pageserver Controller
### Endpoints
Compute endpoints will need to:
- Accept a ShardMap as part of their configuration from the control plane
- Route pageserver requests according to that ShardMap
### Control Plane
#### Publishing ShardMap updates
The control plane will provide an API for the pageserver controller to publish updates
to the ShardMap for a tenant. When such an update is provided, it will be used to
update the configuration of any endpoints currently active for the tenant.
The ShardMap will be opaque to the Control Plane: it doesn't need to do anything with it
other than storing and passing on to endpoints.
#### Attaching via the Pageserver Controller
The Control Plane will issue attach/create API calls to the pageserver controller
instead of directly to pageservers. This will relieve the control plane of the need
to know about sharding.
#### Enabling sharding for large tenants
When a Tenant is created, it is up to the control plane to provide a hint to
the pageserver about how large it will be. This may be implemented as a service tier,
where users creating very large databases would be onboarded to the tier, and then
the Tenants they create would be created with a larger number of shards. For the
general population of users we should continue to use 1 shard by default.
## Next Steps
Clearly, the mechanism described in this RFC has substantial limitations:
- A) the number of shards in a tenant is defined at creation time.
- B) data is not distributed across the LSN dimension
To address `A`, a _splitting_ feature will later be added. One shard can split its
data into a number of children by doing a special compaction operation to generate
image layers broken up child-shard-wise, and then writing out an index_part.json for
each child. This will then require coordination with the pageserver controller to
safely attach these new child shards and then move them around to distribute work.
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
once a Tenant has been sharded, there is little value in merging it again.
To address `B`, it is envisaged to have some gossip mechanism for pageservers to communicate
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
ask another to go read the necessary layers from remote storage to serve the read. This
requires relativly little coordination because it is read-only: any node can service any
read. All reads to a particular shard would still flow through one node, but the
disk capactity & I/O impact of servicing the read would be distributed.
## FAQ/Alternatives
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
When a database is growing under a write workload, writes may predominantly hit the
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
is intensively re-writing a particular relation, if that relation lived in a particular
shard then it would not achieve our goal of distributing the write work across shards.
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
Two reasons:
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
database would still cause a load hotspot on the pageserver routing its read requests.
2. Implementing a proxy model as a stop-gap would not be a cheap option, because
it requires making pageservers aware of their peers, and adding synchronisation to
keep pageservers aware of their peers as they come and go.

View File

@@ -0,0 +1,119 @@
# Pageserver Controller Phase 1: Generations
## Summary
In the [generation numbers RFC](025-generation-numbers.md), it was proposed that
the console/control plane would act as the central coordinator for issuing generation
numbers.
That approach has not proven practical, so this RFC proposes an alternative implementation
where generation numbers are managed in a different service.
Calls to generation-aware pageserver APIs like create/attach will call out to this
new _pageserver controller_ to acquire generation numbers. This service will also
form the basis for satisfying future pageserver management requirements, such as
coordinating sharding, doing automatic capacity balancing, and many more.
## Motivation
This is a dependency for delivering high availability.
### Prior art
None
## Requirements
- Provide a hook for the pageserver to use when it receives an attach/create/load API
call, which will yield a generation that is safe for the pageserver to use.
- Implement the /re-attach and /validate APIs required for the generation numbers feature
to work.
## Non Goals
- This is not intended to interact with any components other than the pageserver, or
to integrate with the broader control plane in any way.
## Impacted Components
pageserver, pageserver controller (new)
## Implementation
We may start from the minimal `attachment_service` used in automated tests.
### Data store
For generation numbers, we need a persistent, linearizable data store. Postgres is sufficient for
this: we already have postgres instances used for other control plane work.
The storage for the Pageserver Controller will be independent of other components:
it might use the same physical database server but would use an independent database.
### Deployment
There will be one instance per region. In future we would aim to define the concept
of a pageserver cluster and have one controller per cluster, but in the short term
one per region will be functionally okay for current scale.
The pageserver controller will be deployed within kubernetes, in the same way as
the storage broker (which is currently via a [helm chart](https://github.com/neondatabase/helm-charts/tree/main/charts/neon-storage-broker)).
### Security
The pageserver controller's API will do authentication with JWT, the same as
the pageserver's existing API.
### Correctness
It is essential that pageservers call into the controller at the _very start_ of
handling attach/create/load API requests. They should not do any work at all until
they have acquired that generation number.
If the call fails, they must retry: it is not safe to proceed without a generation number.
## Future
Having a call chain that goes `Control plane -> Pageserver -> Pageserver controller`
is clearly a little strange: we are only doing this to avoid needing to make changes
to the control plane.
In future, we will change the control plane to call directly into the pageserver
controller, which would then call onwards into the pageserver. This would be a fairly
small change to the controller, since all the logic around storing and updating
generation numbers would stay the same: just the behavior of the API frontend
would be different.
The work to enable pageservers to communicate with the controller is not wasted,
because they still communicate in that direction when invoking `/re-attach`
and `/validate`
## Alternatives considered
### Run in the console/control plane codebase
The control plane is a large Go codebase that uses extensive code generation, and
has to be quite generic to manage many different types of component.
### Direct DB access
We could have pageservers call directly into a shared database to acquire and update
generation numbers (with carefully crafted transactions to protect against concurrent
attaches getting the same generation, etc).
Pros:
- No extra service required, simpler deployment
Cons:
- No future path to a cleaner architecture: the pageserver controller can be implemented
as an extensible place for implement more functionality in future, whereas a mechanism
to do generation numbers via SQL queries from the pageserver would be specialized
and the code would probably be disposed of in the relatively near future.
- Puts onus entirely on SQL query correctness to mediate concurrent access.
The pageserver controller also has to be correct in this respect in case there
is more than one instance running, but it is much less likely to hit this path,
so the overall risk of issues is lower when using a central service.
The main downside to that approach is that it doesn't provide the future path that
the pageserver controller does

View File

@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
#[derive(serde::Serialize, serde::Deserialize)]
#[derive(serde::Serialize)]
pub struct EventChunk<'a, T: Clone> {
pub events: std::borrow::Cow<'a, [T]>,
}

View File

@@ -363,15 +363,8 @@ pub struct TimelineInfo {
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
/// The LSN that we are advertizing to safekeepers
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.

View File

@@ -29,4 +29,3 @@ workspace_hack.workspace = true
[dev-dependencies]
tempfile.workspace = true
test-context.workspace = true
rand.workspace = true

View File

@@ -20,7 +20,6 @@ use std::{
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use tokio::io;
use toml_edit::Item;
use tracing::info;
@@ -43,9 +42,6 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
/// As defined in S3 docs
pub const MAX_KEYS_PER_DELETE: usize = 1000;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
/// Path on the remote storage, relative to some inner prefix.
@@ -54,25 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
impl Serialize for RemotePath {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for RemotePath {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
Ok(Self(PathBuf::from(&str)))
}
}
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
@@ -111,10 +88,6 @@ impl RemotePath {
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}
}
/// Storage (potentially remote) API to manage its state.

View File

@@ -33,10 +33,11 @@ use tracing::debug;
use super::StorageMetadata;
use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics;
use self::metrics::{AttemptOutcome, RequestKind};
@@ -499,7 +500,7 @@ impl RemoteStorage for S3Bucket {
delete_objects.push(obj_id);
}
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
let started_at = start_measuring_requests(kind);
let resp = self

View File

@@ -378,30 +378,21 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
// due to how time works, we've had test runners use the same nanos as bucket prefixes.
// millis is just a debugging aid for easier finding the prefix later.
let millis = std::time::SystemTime::now()
let random_prefix_part = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
.as_millis();
// because nanos can be the same for two threads so can millis, add randomness
let random = rand::thread_rng().gen::<u32>();
.as_nanos();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,
prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,

View File

@@ -89,22 +89,6 @@ impl Generation {
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
pub fn next(&self) -> Generation {
match self {
Self::Valid(n) => Self::Valid(*n + 1),
Self::None => Self::Valid(1),
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
pub fn into(self) -> Option<u32> {
if let Self::Valid(v) = self {
Some(v)
} else {
None
}
}
}
impl Serialize for Generation {

View File

@@ -24,9 +24,6 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>),
#[error("Shutting down")]
ShuttingDown,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -55,10 +52,6 @@ impl ApiError {
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -216,24 +216,6 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
}
}
/// When you will store a secret but want to make sure it won't
/// be accidentally logged, wrap it in a SecretString, whose Debug
/// implementation does not expose the contents.
#[derive(Clone, Eq, PartialEq)]
pub struct SecretString(String);
impl SecretString {
pub fn get_contents(&self) -> &str {
self.0.as_str()
}
}
impl std::fmt::Debug for SecretString {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[SECRET]")
}
}
#[cfg(test)]
mod tests {
use metrics::{core::Opts, IntCounterVec};

View File

@@ -431,14 +431,14 @@ impl CgroupWatcher {
.context("failed to request upscale")?;
let memory_high =
self.get_memory_high_bytes().context("failed to get memory.high")?;
self.get_high_bytes().context("failed to get memory.high")?;
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
info!(
current_high_bytes = memory_high,
new_high_bytes = new_high,
"updating memory.high"
);
self.set_memory_high_bytes(new_high)
self.set_high_bytes(new_high)
.context("failed to set memory.high")?;
last_memory_high_increase_at = Some(Instant::now());
continue;
@@ -556,6 +556,14 @@ impl CgroupWatcher {
}
}
/// Represents a set of limits we apply to a cgroup to control memory usage.
///
/// Setting these values also affects the thresholds for receiving usage alerts.
#[derive(Debug)]
pub struct MemoryLimits {
pub high: u64,
}
// Methods for manipulating the actual cgroup
impl CgroupWatcher {
/// Get a handle on the freezer subsystem.
@@ -616,29 +624,50 @@ impl CgroupWatcher {
}
/// Set cgroup memory.high threshold.
pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
}
/// Set the cgroup's memory.high to 'max', disabling it.
pub fn unset_memory_high(&self) -> anyhow::Result<()> {
self.set_memory_high_internal(MaxValue::Max)
}
fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
self.memory()
.context("failed to get memory subsystem")?
.set_mem(cgroups_rs::memory::SetMemory {
low: None,
high: Some(value),
high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
min: None,
max: None,
})
.map_err(anyhow::Error::from)
.context("failed to set memory.high")
}
/// Set cgroup memory.high and memory.max.
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
info!(limits.high, path = self.path(), "writing new memory limits",);
self.memory()
.context("failed to get memory subsystem while setting memory limits")?
.set_mem(cgroups_rs::memory::SetMemory {
min: None,
low: None,
high: Some(MaxValue::Value(
u64::min(limits.high, i64::MAX as u64) as i64
)),
max: None,
})
.context("failed to set memory limits")
}
/// Given some amount of available memory, set the desired cgroup memory limits
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
let new_high = self.config.calculate_memory_high_value(available_memory);
let limits = MemoryLimits { high: new_high };
info!(
path = self.path(),
memory = ?limits,
"setting cgroup memory",
);
self.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
Ok(())
}
/// Get memory.high threshold.
pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
let high = self
.memory()
.context("failed to get memory subsystem while getting memory statistics")?

View File

@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use crate::cgroup::{CgroupWatcher, Sequenced};
use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
use crate::dispatcher::Dispatcher;
use crate::filecache::{FileCacheConfig, FileCacheState};
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,51 +106,6 @@ impl Runner {
kill,
};
// If we have both the cgroup and file cache integrations enabled, it's possible for
// temporary failures to result in cgroup throttling (from memory.high), that in turn makes
// it near-impossible to connect to the file cache (because it times out). Unfortunately,
// we *do* still want to determine the file cache size before setting the cgroup's
// memory.high, so it's not as simple as just swapping the order.
//
// Instead, the resolution here is that on vm-monitor startup (note: happens on each
// connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
// temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
// of a hacky solution, but helps with reliability.
if let Some(name) = &args.cgroup {
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
// now, and then set limits later.
info!("initializing cgroup");
let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
.context("failed to create cgroup manager")?;
info!("temporarily unsetting memory.high");
// Temporarily un-set cgroup memory.high; see above.
cgroup
.unset_memory_high()
.context("failed to unset memory.high")?;
let cgroup = Arc::new(cgroup);
let cgroup_clone = Arc::clone(&cgroup);
spawn_with_cancel(
token.clone(),
|_| error!("cgroup watcher terminated"),
async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
);
state.cgroup = Some(cgroup);
} else {
// *NOTE*: We need to forget the sender so that its drop impl does not get ran.
// This allows us to poll it in `Monitor::run` regardless of whether we
// are managing a cgroup or not. If we don't forget it, all receives will
// immediately return an error because the sender is droped and it will
// claim all select! statements, effectively turning `Monitor::run` into
// `loop { fail to receive }`.
mem::forget(requesting_send);
}
let mut file_cache_reserved_bytes = 0;
let mem = get_total_system_memory();
@@ -164,7 +119,7 @@ impl Runner {
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token)
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await
.context("failed to create file cache")?;
@@ -197,15 +152,35 @@ impl Runner {
state.filecache = Some(file_cache);
}
if let Some(cgroup) = &state.cgroup {
let available = mem - file_cache_reserved_bytes;
let value = cgroup.config.calculate_memory_high_value(available);
if let Some(name) = &args.cgroup {
let (mut cgroup, cgroup_event_stream) =
CgroupWatcher::new(name.clone(), requesting_send)
.context("failed to create cgroup manager")?;
info!(value, "setting memory.high");
let available = mem - file_cache_reserved_bytes;
cgroup
.set_memory_high_bytes(value)
.context("failed to set cgroup memory.high")?;
.set_memory_limits(available)
.context("failed to set cgroup memory limits")?;
let cgroup = Arc::new(cgroup);
// Some might call this . . . cgroup v2
let cgroup_clone = Arc::clone(&cgroup);
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
cgroup_clone.watch(notified_recv, cgroup_event_stream).await
});
state.cgroup = Some(cgroup);
} else {
// *NOTE*: We need to forget the sender so that its drop impl does not get ran.
// This allows us to poll it in `Monitor::run` regardless of whether we
// are managing a cgroup or not. If we don't forget it, all receives will
// immediately return an error because the sender is droped and it will
// claim all select! statements, effectively turning `Monitor::run` into
// `loop { fail to receive }`.
mem::forget(requesting_send);
}
Ok(state)
@@ -282,11 +257,14 @@ impl Runner {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
// since it is properly initialized in the previous cgroup if let block
let limits = MemoryLimits {
// new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
// since it is properly initialized in the previous cgroup if let block
high: new_cgroup_mem_high,
};
cgroup
.set_memory_high_bytes(new_cgroup_mem_high)
.context("failed to set cgroup memory.high")?;
.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
let message = format!(
"set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -349,9 +327,12 @@ impl Runner {
name = cgroup.path(),
"updating cgroup memory.high",
);
let limits = MemoryLimits {
high: new_cgroup_mem_high,
};
cgroup
.set_memory_high_bytes(new_cgroup_mem_high)
.context("failed to set cgroup memory.high")?;
.set_limits(&limits)
.context("failed to set file cache size")?;
}
Ok(())

View File

@@ -81,7 +81,6 @@ enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true
tempfile.workspace = true
async-channel = "1.9.0"
[dev-dependencies]
criterion.workspace = true

View File

@@ -8,7 +8,6 @@ use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -21,7 +20,6 @@ use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -348,22 +346,9 @@ fn start_pageserver(
}
};
// Top-level cancellation token for the process
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
remote_storage.clone(),
ControlPlaneClient::new(conf, &shutdown_pageserver),
conf,
);
if let Some(deletion_workers) = deletion_workers {
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
}
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
@@ -394,13 +379,13 @@ fn start_pageserver(
};
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client,
},
order,
shutdown_pageserver.clone(),
@@ -496,10 +481,9 @@ fn start_pageserver(
http::routes::State::new(
conf,
http_auth.clone(),
remote_storage.clone(),
remote_storage,
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
)
.context("Failed to initialize router state")?,
);
@@ -605,31 +589,6 @@ fn start_pageserver(
);
}
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::BackgroundRuntimeTurnaroundMeasure,
None,
None,
"background runtime turnaround measure",
true,
async move {
let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
let server = server
.serve(hyper::service::make_service_fn(|_| async move {
Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
move |_: hyper::Request<hyper::Body>| async move {
Ok::<_, std::convert::Infallible>(hyper::Response::new(
hyper::Body::from(format!("alive")),
))
},
))
}))
.with_graceful_shutdown(task_mgr::shutdown_watcher());
server.await?;
Ok(())
},
);
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
@@ -652,12 +611,7 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}
})
@@ -669,7 +623,7 @@ fn create_remote_storage_client(
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
tracing::warn!("no remote storage configured, this is a deprecated configuration");
// No remote storage configured.
return Ok(None);
};

View File

@@ -11,7 +11,6 @@ use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
use reqwest::Url;
@@ -208,9 +207,6 @@ pub struct PageServerConf {
pub background_task_maximum_delay: Duration,
pub control_plane_api: Option<Url>,
/// JWT token for use with the control plane API.
pub control_plane_api_token: Option<SecretString>,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -287,7 +283,6 @@ struct PageServerConfigBuilder {
background_task_maximum_delay: BuilderValue<Duration>,
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
}
impl Default for PageServerConfigBuilder {
@@ -352,7 +347,6 @@ impl Default for PageServerConfigBuilder {
.unwrap()),
control_plane_api: Set(None),
control_plane_api_token: Set(None),
}
}
}
@@ -481,8 +475,8 @@ impl PageServerConfigBuilder {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
pub fn control_plane_api(&mut self, api: Option<Url>) {
self.control_plane_api = BuilderValue::Set(api)
pub fn control_plane_api(&mut self, api: Url) {
self.control_plane_api = BuilderValue::Set(Some(api))
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -573,9 +567,6 @@ impl PageServerConfigBuilder {
control_plane_api: self
.control_plane_api
.ok_or(anyhow!("missing control_plane_api"))?,
control_plane_api_token: self
.control_plane_api_token
.ok_or(anyhow!("missing control_plane_api_token"))?,
})
}
}
@@ -589,27 +580,6 @@ impl PageServerConf {
self.workdir.join(TENANTS_SEGMENT_NAME)
}
pub fn deletion_prefix(&self) -> PathBuf {
self.workdir.join("deletion")
}
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix()
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
}
pub fn deletion_header_path(&self) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
self.tenants_path().join(tenant_id.to_string())
}
@@ -777,14 +747,7 @@ impl PageServerConf {
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
"control_plane_api" => {
let parsed = parse_toml_string(key, item)?;
if parsed.is_empty() {
builder.control_plane_api(None)
} else {
builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
}
},
"control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -954,7 +917,6 @@ impl PageServerConf {
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::ZERO,
control_plane_api: None,
control_plane_api_token: None,
}
}
}
@@ -1178,8 +1140,7 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: humantime::parse_duration(
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
control_plane_api: None,
control_plane_api_token: None
control_plane_api: None
},
"Correct defaults should be used when no config values are provided"
);
@@ -1235,8 +1196,7 @@ background_task_maximum_delay = '334 s'
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None
control_plane_api: None
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -1,9 +1,7 @@
use std::collections::HashMap;
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
};
use serde::{de::DeserializeOwned, Serialize};
use hyper::StatusCode;
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{
@@ -14,34 +12,25 @@ use utils::{
use crate::config::PageServerConf;
// Backoffs when control plane requests do not succeed: compromise between reducing load
// on control plane, and retrying frequently when we are blocked on a control plane
// response to make progress.
const BACKOFF_INCREMENT: f64 = 0.1;
const BACKOFF_MAX: f64 = 10.0;
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
pub struct ControlPlaneClient {
pub(crate) struct ControlPlaneClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
cancel: CancellationToken,
}
/// Represent operations which internally retry on all errors other than
/// cancellation token firing: the only way they can fail is ShuttingDown.
pub enum RetryForeverError {
ShuttingDown,
}
#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
async fn validate(
&self,
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
}
impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
@@ -53,78 +42,39 @@ impl ControlPlaneClient {
segs.pop_if_empty().push("");
}
let mut client = reqwest::ClientBuilder::new();
if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = hyper::HeaderMap::new();
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
client = client.default_headers(headers);
}
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
Some(Self {
http_client: client.build().expect("Failed to construct HTTP client"),
http_client: client,
base_url: url,
node_id: conf.id,
cancel: cancel.clone(),
})
}
async fn retry_http_forever<R, T>(
async fn try_re_attach(
&self,
url: &url::Url,
request: R,
) -> Result<T, RetryForeverError>
where
R: Serialize,
T: DeserializeOwned,
{
#[derive(thiserror::Error, Debug)]
enum RemoteAttemptError {
#[error("shutdown")]
Shutdown,
#[error("remote: {0}")]
Remote(reqwest::Error),
}
match backoff::retry(
|| async {
let response = self
.http_client
.post(url.clone())
.json(&request)
.send()
.await
.map_err(RemoteAttemptError::Remote)?;
response
.error_for_status_ref()
.map_err(RemoteAttemptError::Remote)?;
response
.json::<T>()
.await
.map_err(RemoteAttemptError::Remote)
},
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
)
.await
{
Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
Err(RemoteAttemptError::Remote(_)) => {
panic!("We retry forever, this should never be reached");
url: Url,
request: &ReAttachRequest,
) -> anyhow::Result<ReAttachResponse> {
match self.http_client.post(url).json(request).send().await {
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(anyhow::Error::from)
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
Ok(r) => Ok(r),
}
}
}
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
/// Block until we get a successful response
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -133,47 +83,37 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
node_id: self.node_id,
};
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
tracing::info!(
"Received re-attach response with {} tenants",
response.tenants.len()
);
let mut attempt = 0;
loop {
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
match result {
Ok(res) => {
tracing::info!(
"Received re-attach response with {} tenants",
res.tenants.len()
);
Ok(response
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>())
}
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")
.expect("Failed to build validate path");
let request = ValidateRequest {
tenants: tenants
.into_iter()
.map(|(id, gen)| ValidateRequestTenant {
id,
gen: gen
.into()
.expect("Generation should always be valid for a Tenant doing deletions"),
})
.collect(),
};
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
Ok(response
.tenants
.into_iter()
.map(|rt| (rt.id, rt.valid))
.collect())
return Ok(res
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>());
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
backoff::exponential_backoff(
attempt,
BACKOFF_INCREMENT,
BACKOFF_MAX,
&self.cancel,
)
.await;
if self.cancel.is_cancelled() {
return Err(anyhow::anyhow!("Shutting down"));
}
attempt += 1;
}
}
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,156 +0,0 @@
//! The deleter is the final stage in the deletion queue. It accumulates remote
//! paths to delete, and periodically executes them in batches of up to 1000
//! using the DeleteObjects request.
//!
//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
//! number of full-sized DeleteObjects requests, rather than a larger number of
//! smaller requests.
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
use tracing::warn;
use crate::metrics;
use super::DeletionQueueError;
use super::FlushOp;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
pub(super) enum DeleterMessage {
Delete(Vec<RemotePath>),
Flush(FlushOp),
}
/// Non-persistent deletion queue, for coalescing multiple object deletes into
/// larger DeleteObjects requests.
pub(super) struct Deleter {
// Accumulate up to 1000 keys for the next deletion operation
accumulator: Vec<RemotePath>,
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
remote_storage: GenericRemoteStorage,
}
impl Deleter {
pub(super) fn new(
remote_storage: GenericRemoteStorage,
rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
) -> Self {
Self {
remote_storage,
rx,
cancel,
accumulator: Vec::new(),
}
}
/// Wrap the remote `delete_objects` with a failpoint
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
self.remote_storage.delete_objects(&self.accumulator).await
}
/// Block until everything in accumulator has been executed
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
match self.remote_delete().await {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some
// or all of the deleted objects were already gone.
metrics::DELETION_QUEUE
.keys_executed
.inc_by(self.accumulator.len() as u64);
info!(
"Executed deletion batch {}..{}",
self.accumulator
.first()
.expect("accumulator should be non-empty"),
self.accumulator
.last()
.expect("accumulator should be non-empty"),
);
self.accumulator.clear();
}
Err(e) => {
warn!("DeleteObjects request failed: {e:#}, will retry");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["execute"])
.inc();
}
};
}
if self.cancel.is_cancelled() {
// Expose an error because we may not have actually flushed everything
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
loop {
if self.cancel.is_cancelled() {
return Err(DeletionQueueError::ShuttingDown);
}
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
return Err(DeletionQueueError::ShuttingDown);
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending
self.flush().await?;
continue;
}
};
match msg {
DeleterMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
self.flush().await?;
// If we have received this number of keys, proceed with attempting to execute
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);
}
}
}
DeleterMessage::Flush(flush_op) => {
// If flush() errors, we drop the flush_op and the caller will get
// an error recv()'ing their oneshot channel.
self.flush().await?;
flush_op.notify();
}
}
}
}
}

View File

@@ -1,487 +0,0 @@
//! The list writer is the first stage in the deletion queue. It accumulates
//! layers to delete, and periodically writes out these layers into a persistent
//! DeletionList.
//!
//! The purpose of writing DeletionLists is to decouple the decision to
//! delete an object from the validation required to execute it: even if
//! validation is not possible, e.g. due to a control plane outage, we can
//! still persist our intent to delete an object, in a way that would
//! survive a restart.
//!
//! DeletionLists are passed onwards to the Validator.
use super::DeletionHeader;
use super::DeletionList;
use super::FlushOp;
use super::ValidatorQueueMessage;
use std::collections::HashMap;
use std::fs::create_dir_all;
use std::time::Duration;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
// of magnitude 1MB when we are under heavy delete load.
const DELETION_LIST_TARGET_SIZE: usize = 16384;
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
// which we might leak objects from not flushing a DeletionList after
// the objects are already unlinked from timeline metadata.
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
// more objects before doing the flush.
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) struct RecoverOp {
pub(super) attached_tenants: HashMap<TenantId, Generation>,
}
#[derive(Debug)]
pub(super) enum ListWriterQueueMessage {
Delete(DeletionOp),
// Wait until all prior deletions make it into a persistent DeletionList
Flush(FlushOp),
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
FlushExecute(FlushOp),
// Call once after re-attaching to control plane, to notify the deletion queue about
// latest attached generations & load any saved deletion lists from disk.
Recover(RecoverOp),
}
pub(super) struct ListWriter {
conf: &'static PageServerConf,
// Incoming frontend requests to delete some keys
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
// Outbound requests to the backend to execute deletion lists we have composed.
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
// The list we are currently building, contains a buffer of keys to delete
// and our next sequence number
pending: DeletionList,
// These FlushOps should notify the next time we flush
pending_flushes: Vec<FlushOp>,
// Worker loop is torn down when this fires.
cancel: CancellationToken,
// Safety guard to do recovery exactly once
recovered: bool,
}
impl ListWriter {
// Initially DeletionHeader.validated_sequence is zero. The place we start our
// sequence numbers must be higher than that.
const BASE_SEQUENCE: u64 = 1;
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
cancel: CancellationToken,
) -> Self {
Self {
pending: DeletionList::new(Self::BASE_SEQUENCE),
conf,
rx,
tx,
pending_flushes: Vec::new(),
cancel,
recovered: false,
}
}
/// Try to flush `list` to persistent storage
///
/// This does not return errors, because on failure to flush we do not lose
/// any state: flushing will be retried implicitly on the next deadline
async fn flush(&mut self) {
if self.pending.is_empty() {
for f in self.pending_flushes.drain(..) {
f.notify();
}
return;
}
match self.pending.save(self.conf).await {
Ok(_) => {
info!(sequence = self.pending.sequence, "Stored deletion list");
for f in self.pending_flushes.drain(..) {
f.notify();
}
// Take the list we've accumulated, replace it with a fresh list for the next sequence
let next_list = DeletionList::new(self.pending.sequence + 1);
let list = std::mem::replace(&mut self.pending, next_list);
if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
// This is allowed to fail: it will only happen if the backend worker is shut down,
// so we can just drop this on the floor.
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
}
}
Err(e) => {
metrics::DELETION_QUEUE.unexpected_errors.inc();
warn!(
sequence = self.pending.sequence,
"Failed to write deletion list, will retry later ({e:#})"
);
}
}
}
/// Load the header, to learn the sequence number up to which deletions
/// have been validated. We will apply validated=true to DeletionLists
/// <= this sequence when loading them.
///
/// It is not an error for the header to not exist: we return None, and
/// the caller should act as if validated_sequence is 0
async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
let header_path = self.conf.deletion_header_path();
match tokio::fs::read(&header_path).await {
Ok(header_bytes) => {
match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
Ok(h) => Ok(Some(h.validated_sequence)),
Err(e) => {
warn!(
"Failed to deserialize deletion header, ignoring {}: {e:#}",
header_path.display()
);
// This should never happen unless we make a mistake with our serialization.
// Ignoring a deletion header is not consequential for correctnes because all deletions
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
metrics::DELETION_QUEUE.unexpected_errors.inc();
Ok(None)
}
}
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
debug!(
"Deletion header {} not found, first start?",
header_path.display()
);
Ok(None)
} else {
Err(anyhow::anyhow!(e))
}
}
}
}
async fn recover(
&mut self,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), anyhow::Error> {
debug!(
"recovering with {} attached tenants",
attached_tenants.len()
);
// Load the header
let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix();
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
Ok(d) => d,
Err(e) => {
warn!(
"Failed to open deletion list directory {}: {e:#}",
deletion_directory.display(),
);
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy();
if Some(file_name.as_os_str()) == header_path.file_name() {
// Don't try and parse the header's name like a list
continue;
}
if dentry_str.ends_with(TEMP_SUFFIX) {
info!("Cleaning up temporary file {dentry_str}");
let absolute_path = deletion_directory.join(dentry.file_name());
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
// Non-fatal error: we will just leave the file behind but not
// try and load it.
warn!(
"Failed to clean up temporary file {}: {e:#}",
absolute_path.display()
);
}
continue;
}
let file_name = dentry.file_name().to_owned();
let basename = file_name.to_string_lossy();
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
m.name("sequence")
.expect("Non optional group should be present")
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
};
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
Ok(s) => s,
Err(e) => {
warn!("Malformed key '{basename}': {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
seqs.push(seq);
}
seqs.sort();
// Start our next deletion list from after the last location validated by
// previous process lifetime, or after the last location found (it is updated
// below after enumerating the deletion lists)
self.pending.sequence = validated_sequence + 1;
if let Some(max_list_seq) = seqs.last() {
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
}
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,
Err(e) => {
// Drop the list on the floor: any objects it referenced will be left behind
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
}
};
if deletion_list.sequence <= validated_sequence {
// If the deletion list falls below valid_seq, we may assume that it was
// already validated the last time this pageserver ran. Otherwise, we still
// load it, as it may still contain content valid in this generation.
deletion_list.validated = true;
} else {
// Special case optimization: if a tenant is still attached, and no other
// generation was issued to another node in the interval while we restarted,
// then we may treat deletion lists from the previous generation as if they
// belong to our currently attached generation, and proceed to validate & execute.
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
if attached_gen.previous() == tenant_list.generation {
tenant_list.generation = *attached_gen;
}
}
}
}
info!(
validated = deletion_list.validated,
sequence = deletion_list.sequence,
"Recovered deletion list"
);
// We will drop out of recovery if this fails: it indicates that we are shutting down
// or the backend has panicked
metrics::DELETION_QUEUE
.keys_submitted
.inc_by(deletion_list.len() as u64);
self.tx
.send(ValidatorQueueMessage::Delete(deletion_list))
.await?;
}
info!(next_sequence = self.pending.sequence, "Replay complete");
Ok(())
}
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
/// and write them out, for later validation by the backend and execution by the executor.
pub(super) async fn background(&mut self) {
info!("Started deletion frontend worker");
// Synchronous, but we only do it once per process lifetime so it's tolerable
if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
tracing::error!(
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
self.conf.deletion_prefix().display()
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
}
while !self.cancel.is_cancelled() {
let timeout = if self.pending_flushes.is_empty() {
FRONTEND_DEFAULT_TIMEOUT
} else {
FRONTEND_FLUSHING_TIMEOUT
};
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
Ok(Some(msg)) => msg,
Ok(None) => {
// Queue sender destroyed, shutting down
break;
}
Err(_) => {
// Hit deadline, flush.
self.flush().await;
continue;
}
};
match msg {
ListWriterQueueMessage::Delete(op) => {
assert!(
self.recovered,
"Cannot process deletions before recovery. This is a bug."
);
debug!(
"Delete: ingesting {} layers, {} other objects",
op.layers.len(),
op.objects.len()
);
let mut layer_paths = Vec::new();
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
layer_paths.extend(op.objects);
if !self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) {
self.flush().await;
let retry_succeeded = self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
);
if !retry_succeeded {
// Unexpected: after we flush, we should have
// drained self.pending, so a conflict on
// generation numbers should be impossible.
tracing::error!(
"Failed to enqueue deletions, leaking objects. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
ListWriterQueueMessage::Flush(op) => {
if self.pending.is_empty() {
// Execute immediately
debug!("Flush: No pending objects, flushing immediately");
op.notify()
} else {
// Execute next time we flush
debug!("Flush: adding to pending flush list for next deadline flush");
self.pending_flushes.push(op);
}
}
ListWriterQueueMessage::FlushExecute(op) => {
debug!("FlushExecute: passing through to backend");
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
info!("Can't flush, shutting down ({e})");
// Caller will get error when their oneshot sender was dropped.
}
}
ListWriterQueueMessage::Recover(op) => {
if self.recovered {
tracing::error!(
"Deletion queue recovery called more than once. This is a bug."
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
continue;
}
if let Err(e) = self.recover(op.attached_tenants).await {
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
// queue receiver has been dropped, or something is critically broken with
// the local filesystem holding deletion lists.
info!(
"Deletion queue recover aborted, deletion queue will not proceed ({e})"
);
metrics::DELETION_QUEUE.unexpected_errors.inc();
return;
} else {
self.recovered = true;
}
}
}
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
self.flush().await;
}
}
info!("Deletion queue shut down.");
}
}

View File

@@ -1,414 +0,0 @@
//! The validator is responsible for validating DeletionLists for execution,
//! based on whethe the generation in the DeletionList is still the latest
//! generation for a tenant.
//!
//! The purpose of validation is to ensure split-brain safety in the cluster
//! of pageservers: a deletion may only be executed if the tenant generation
//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
//! The purpose of accumulating lists before validating them is to reduce load
//! on the control plane API by issuing fewer, larger requests.
//!
//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
//! to decide when old
//!
//! Deletions are passed onward to the Deleter.
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use super::deleter::DeleterMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::DeletionQueueError;
use super::FlushOp;
use super::VisibleLsnUpdates;
// After this length of time, do any validation work that is pending,
// even if we haven't accumulated many keys to delete.
//
// This also causes updates to remote_consistent_lsn to be validated, even
// if there were no deletions enqueued.
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
// If we have received this number of keys, proceed with attempting to execute
const AUTOFLUSH_KEY_COUNT: usize = 16384;
#[derive(Debug)]
pub(super) enum ValidatorQueueMessage {
Delete(DeletionList),
Flush(FlushOp),
}
pub(super) struct Validator<C>
where
C: ControlPlaneGenerationsApi,
{
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
// Client for calling into control plane API for validation of deletes
control_plane_client: Option<C>,
// DeletionLists which are waiting generation validation. Not safe to
// execute until [`validate`] has processed them.
pending_lists: Vec<DeletionList>,
// DeletionLists which have passed validation and are ready to execute.
validated_lists: Vec<DeletionList>,
// Sum of all the lengths of lists in pending_lists
pending_key_count: usize,
// Lsn validation state: we read projected LSNs and write back visible LSNs
// after validation. This is the LSN equivalent of `pending_validation_lists`:
// it is drained in [`validate`]
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
// If we failed to rewrite a deletion list due to local filesystem I/O failure,
// we must remember that and refuse to advance our persistent validated sequence
// number past the failure.
list_write_failed: Option<u64>,
cancel: CancellationToken,
}
impl<C> Validator<C>
where
C: ControlPlaneGenerationsApi,
{
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
control_plane_client: Option<C>,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
cancel: CancellationToken,
) -> Self {
Self {
conf,
rx,
tx,
control_plane_client,
lsn_table,
pending_lists: Vec::new(),
validated_lists: Vec::new(),
pending_key_count: 0,
list_write_failed: None,
cancel,
}
}
/// Process any outstanding validations of generations of pending LSN updates or pending
/// DeletionLists.
///
/// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
/// go into the queue of ready-to-execute lists.
async fn validate(&mut self) -> Result<(), DeletionQueueError> {
let mut tenant_generations = HashMap::new();
for list in &self.pending_lists {
for (tenant_id, tenant_list) in &list.tenants {
// Note: DeletionLists are in logical time order, so generation always
// goes up. By doing a simple insert() we will always end up with
// the latest generation seen for a tenant.
tenant_generations.insert(*tenant_id, tenant_list.generation);
}
}
let pending_lsn_updates = {
let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
std::mem::take(&mut *lsn_table)
};
for (tenant_id, update) in &pending_lsn_updates.tenants {
let entry = tenant_generations
.entry(*tenant_id)
.or_insert(update.generation);
if update.generation > *entry {
*entry = update.generation;
}
}
if tenant_generations.is_empty() {
// No work to do
return Ok(());
}
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
match control_plane_client
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
.await
{
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
// The only way a validation call returns an error is when the cancellation token fires
return Err(DeletionQueueError::ShuttingDown);
}
}
} else {
// Control plane API disabled. In legacy mode we consider everything valid.
tenant_generations.keys().map(|k| (*k, true)).collect()
};
let mut validated_sequence: Option<u64> = None;
// Apply the validation results to the pending LSN updates
for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
let validated_generation = tenant_generations
.get(&tenant_id)
.expect("Map was built from the same keys we're reading");
let valid = tenants_valid
.get(&tenant_id)
.copied()
// If the tenant was missing from the validation response, it has been deleted.
// The Timeline that requested the LSN update is probably already torn down,
// or will be torn down soon. In this case, drop the update by setting valid=false.
.unwrap_or(false);
if valid && *validated_generation == tenant_lsn_state.generation {
for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
pending_lsn.result_slot.store(pending_lsn.projected);
}
} else {
// If we failed validation, then do not apply any of the projected updates
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
// Apply the validation results to the pending deletion lists
for list in &mut self.pending_lists {
// Filter the list based on whether the server responded valid: true.
// If a tenant is omitted in the response, it has been deleted, and we should
// proceed with deletion.
let mut mutated = false;
list.tenants.retain(|tenant_id, tenant| {
let validated_generation = tenant_generations
.get(tenant_id)
.expect("Map was built from the same keys we're reading");
// If the tenant was missing from the validation response, it has been deleted.
// This means that a deletion is valid, but also redundant since the tenant's
// objects should have already been deleted. Treat it as invalid to drop the
// redundant deletion.
let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
// A list is valid if it comes from the current _or previous_ generation.
// - The previous generation case is permitted due to how we store deletion lists locally:
// if we see the immediately previous generation in a locally stored deletion list,
// it proves that this node's disk was used for both current & previous generations,
// and therefore no other node was involved in between: the two generations may be
// logically treated as the same.
// - In that previous generation case, we rewrote it to the current generation
// in recover(), so the comparison here is simply an equality.
let this_list_valid = valid
&& (tenant.generation == *validated_generation);
if !this_list_valid {
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
}
this_list_valid
});
list.validated = true;
if mutated {
// Save the deletion list if we had to make changes due to stale generations. The
// saved list is valid for execution.
if let Err(e) = list.save(self.conf).await {
// Highly unexpected. Could happen if e.g. disk full.
// If we didn't save the trimmed list, it is _not_ valid to execute.
warn!("Failed to save modified deletion list {list}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
// Rather than have a complex retry process, just drop it and leak the objects,
// scrubber will clean up eventually.
list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
// We must remember this failure, to prevent later writing out a header that
// would imply the unwritable list was valid on disk.
if self.list_write_failed.is_none() {
self.list_write_failed = Some(list.sequence);
}
}
}
validated_sequence = Some(list.sequence);
}
if let Some(validated_sequence) = validated_sequence {
if let Some(list_write_failed) = self.list_write_failed {
// Rare error case: we failed to write out a deletion list to excise invalid
// entries, so we cannot advance the header's valid sequence number past that point.
//
// In this state we will continue to validate, execute and delete deletion lists,
// we just cannot update the header. It should be noticed and fixed by a human due to
// the nonzero value of our unexpected_errors metric.
warn!(
sequence_number = list_write_failed,
"Cannot write header because writing a deletion list failed earlier",
);
} else {
// Write the queue header to record how far validation progressed. This avoids having
// to rewrite each DeletionList to set validated=true in it.
let header = DeletionHeader::new(validated_sequence);
// Drop result because the validated_sequence is an optimization. If we fail to save it,
// then restart, we will drop some deletion lists, creating work for scrubber.
// The save() function logs a warning on error.
if let Err(e) = header.save(self.conf).await {
warn!("Failed to write deletion queue header: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
}
}
}
// Transfer the validated lists to the validated queue, for eventual execution
self.validated_lists.append(&mut self.pending_lists);
Ok(())
}
async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
for list_path in list_paths {
debug!("Removing deletion list {}", list_path.display());
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {}: {e:#}", list_path.display());
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
}
}
async fn flush(&mut self) -> Result<(), DeletionQueueError> {
tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
// Issue any required generation validation calls to the control plane
self.validate().await?;
// After successful validation, nothing is pending: any lists that
// made it through validation will be in validated_lists.
assert!(self.pending_lists.is_empty());
self.pending_key_count = 0;
tracing::debug!(
"Validation complete, have {} validated lists",
self.validated_lists.len()
);
// Return quickly if we have no validated lists to execute. This avoids flushing the
// executor when an idle backend hits its autoflush interval
if self.validated_lists.is_empty() {
return Ok(());
}
// Drain `validated_lists` into the executor
let mut executing_lists = Vec::new();
for list in self.validated_lists.drain(..) {
let list_path = self.conf.deletion_list_path(list.sequence);
let objects = list.into_remote_paths();
self.tx
.send(DeleterMessage::Delete(objects))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
executing_lists.push(list_path);
}
self.flush_executor().await?;
// Erase the deletion lists whose keys have all be deleted from remote storage
self.cleanup_lists(executing_lists).await;
Ok(())
}
async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
// Flush the executor, so that all the keys referenced by these deletion lists
// are actually removed from remote storage. This is a precondition to deleting
// the deletion lists themselves.
let (flush_op, rx) = FlushOp::new();
self.tx
.send(DeleterMessage::Flush(flush_op))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
}
pub(super) async fn background(&mut self) {
tracing::info!("Started deletion backend worker");
while !self.cancel.is_cancelled() {
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
break;
}
Err(_) => {
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
// return immediately if no work is pending.
match self.flush().await {
Ok(()) => {}
Err(DeletionQueueError::ShuttingDown) => {
// If we are shutting down, then auto-flush can safely be skipped
}
}
continue;
}
};
match msg {
ValidatorQueueMessage::Delete(list) => {
if list.validated {
// A pre-validated list may only be seen during recovery, if we are recovering
// a DeletionList whose on-disk state has validated=true
self.validated_lists.push(list)
} else {
self.pending_key_count += list.len();
self.pending_lists.push(list);
}
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
match self.flush().await {
Ok(()) => {}
Err(DeletionQueueError::ShuttingDown) => {
// If we are shutting down, then auto-flush can safely be skipped
}
}
}
}
ValidatorQueueMessage::Flush(op) => {
match self.flush().await {
Ok(()) => {
op.notify();
}
Err(DeletionQueueError::ShuttingDown) => {
// If we fail due to shutting down, we will just drop `op` to propagate that status.
}
}
}
}
}
}
}

View File

@@ -1093,9 +1093,6 @@ components:
remote_consistent_lsn:
type: string
format: hex
remote_consistent_lsn_visible:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex

View File

@@ -5,7 +5,6 @@ use std::collections::HashMap;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
@@ -25,7 +24,6 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -36,7 +34,7 @@ use crate::tenant::mgr::{
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::Timeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use utils::{
@@ -63,7 +61,6 @@ pub struct State {
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
}
impl State {
@@ -73,7 +70,6 @@ impl State {
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
@@ -86,17 +82,8 @@ impl State {
remote_storage,
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
})
}
fn tenant_resources(&self) -> TenantSharedResources {
TenantSharedResources {
broker_client: self.broker_client.clone(),
remote_storage: self.remote_storage.clone(),
deletion_queue_client: self.deletion_queue_client.clone(),
}
}
}
#[inline(always)]
@@ -296,12 +283,7 @@ async fn build_timeline_info_common(
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
let remote_consistent_lsn_visible = timeline
.get_remote_consistent_lsn_visible()
.unwrap_or(Lsn(0));
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
let walreceiver_status = timeline.walreceiver_status();
@@ -311,8 +293,7 @@ async fn build_timeline_info_common(
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
remote_consistent_lsn: remote_consistent_lsn_projected,
remote_consistent_lsn_visible,
remote_consistent_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -511,23 +492,24 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if state.remote_storage.is_none() {
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.broker_client.clone(),
remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
mgr::attach_tenant(
state.conf,
tenant_id,
generation,
tenant_conf,
state.tenant_resources(),
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -588,7 +570,6 @@ async fn tenant_load_handler(
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
state.deletion_queue_client.clone(),
&ctx,
)
.instrument(info_span!("load", %tenant_id))
@@ -930,7 +911,8 @@ async fn tenant_create_handler(
tenant_conf,
target_tenant_id,
generation,
state.tenant_resources(),
state.broker_client.clone(),
state.remote_storage.clone(),
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1147,39 +1129,6 @@ async fn timeline_download_remote_layers_handler_get(
json_response(StatusCode::OK, info)
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let flush = async {
if execute {
state.deletion_queue_client.flush_execute().await
} else {
state.deletion_queue_client.flush().await
}
}
// DeletionQueueError's only case is shutting down.
.map_err(|_| ApiError::ShuttingDown);
tokio::select! {
res = flush => {
res.map(|()| json_response(StatusCode::OK, ()))?
}
_ = cancel.cancelled() => {
Err(ApiError::ShuttingDown)
}
}
}
async fn active_timeline_of_active_tenant(
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -1514,9 +1463,6 @@ pub fn make_router(
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})

View File

@@ -3,8 +3,7 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
pub mod control_plane_client;
pub mod deletion_queue;
mod control_plane_client;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
@@ -28,7 +27,6 @@ pub mod failpoint_support;
use std::path::Path;
use crate::task_mgr::TaskKind;
use deletion_queue::DeletionQueue;
use tracing::info;
/// Current storage format version
@@ -50,8 +48,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
#[tracing::instrument]
pub async fn shutdown_pageserver(exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -79,11 +77,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.

View File

@@ -264,46 +264,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
"Time spent acquiring a pinned slot in the page cache",
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
"Number of timeouts while acquiring a pinned slot in the page cache",
&["error_kind"]
)
.expect("failed to define a metric")
});
#[derive(IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
AcquirePinnedSlotTimeout,
EvictIterLimit,
}
pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
PAGE_CACHE_ERRORS
.get_metric_with_label_values(&[error_kind.into()])
.unwrap()
.inc();
}
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wait_lsn_seconds",
@@ -331,14 +291,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_resident_physical_size_global",
"Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
)
.expect("failed to define a metric")
});
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
@@ -349,14 +301,6 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_remote_physical_size_global",
"Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
)
.expect("failed to define a metric")
});
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
@@ -943,54 +887,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
.expect("failed to define a metric")
});
pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter,
pub(crate) keys_executed: IntCounter,
pub(crate) dropped_lsn_updates: IntCounter,
pub(crate) unexpected_errors: IntCounter,
pub(crate) remote_errors: IntCounterVec,
}
pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
DeletionQueueMetrics{
keys_submitted: register_int_counter!(
"pageserver_deletion_queue_submitted_total",
"Number of objects submitted for deletion"
)
.expect("failed to define a metric"),
keys_dropped: register_int_counter!(
"pageserver_deletion_queue_dropped_total",
"Number of object deletions dropped due to stale generation."
)
.expect("failed to define a metric"),
keys_executed: register_int_counter!(
"pageserver_deletion_queue_executed_total",
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
)
.expect("failed to define a metric"),
dropped_lsn_updates: register_int_counter!(
"pageserver_deletion_queue_dropped_lsn_updates_total",
"Updates to remote_consistent_lsn dropped due to stale generation number."
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_deletion_queue_unexpected_errors_total",
"Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
)
.expect("failed to define a metric"),
remote_errors: register_int_counter_vec!(
"pageserver_deletion_queue_remote_errors_total",
"Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
&["op_kind"],
)
.expect("failed to define a metric")
}
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1265,7 +1161,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
resident_physical_size_gauge: UIntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -1343,29 +1239,10 @@ impl TimelineMetrics {
}
pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_add(sz);
self.resident_physical_size_gauge.add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
pub fn resident_physical_size_sub(&self, sz: u64) {
self.resident_physical_size_gauge.sub(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
}
pub fn resident_physical_size_add(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
}
pub fn resident_physical_size_set(&self, sz: u64) {
self.resident_physical_size_gauge.set(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
}
pub fn resident_physical_size_get(&self) -> u64 {
self.resident_physical_size_gauge.get()
}
}
impl Drop for TimelineMetrics {
@@ -1373,10 +1250,7 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1430,43 +1304,10 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
/// Maintain a per timeline gauge in addition to the global gauge.
struct PerTimelineRemotePhysicalSizeGauge {
last_set: u64,
gauge: UIntGauge,
}
impl PerTimelineRemotePhysicalSizeGauge {
fn new(per_timeline_gauge: UIntGauge) -> Self {
Self {
last_set: per_timeline_gauge.get(),
gauge: per_timeline_gauge,
}
}
fn set(&mut self, sz: u64) {
self.gauge.set(sz);
if sz < self.last_set {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
} else {
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
};
self.last_set = sz;
}
fn get(&self) -> u64 {
self.gauge.get()
}
}
impl Drop for PerTimelineRemotePhysicalSizeGauge {
fn drop(&mut self) {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
}
}
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1484,24 +1325,18 @@ impl RemoteTimelineClientMetrics {
}
}
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
let gauge = guard.get_or_insert_with(|| {
PerTimelineRemotePhysicalSizeGauge::new(
guard
.get_or_insert_with(|| {
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
.unwrap(),
)
});
gauge.set(sz);
}
pub(crate) fn remote_physical_size_get(&self) -> u64 {
let guard = self.remote_physical_size_gauge.lock().unwrap();
guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
.unwrap()
})
.clone()
}
pub fn remote_operation_time(
@@ -1840,9 +1675,6 @@ pub fn preinitialize_metrics() {
Lazy::force(c);
});
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()

View File

@@ -75,12 +75,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
},
task::Poll,
time::Duration,
sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
};
use anyhow::Context;
@@ -170,8 +165,6 @@ struct Slot {
struct SlotInner {
key: Option<CacheKey>,
// for `coalesce_readers_permit`
permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
buf: &'static mut [u8; PAGE_SZ],
}
@@ -214,22 +207,6 @@ impl Slot {
}
}
impl SlotInner {
/// If there is aready a reader, drop our permit and share its permit, just like we share read access.
fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
let mut guard = self.permit.lock().unwrap();
if let Some(existing_permit) = guard.upgrade() {
drop(guard);
drop(permit);
existing_permit
} else {
let permit = Arc::new(permit);
*guard = Arc::downgrade(&permit);
permit
}
}
}
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
@@ -247,42 +224,30 @@ pub struct PageCache {
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
pinned_slots: Arc<tokio::sync::Semaphore>,
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
find_victim_sender:
async_channel::Sender<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
find_victim_waiters:
async_channel::Receiver<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
pub struct PageReadGuard<'i> {
_permit: Arc<PinnedSlotsPermit>,
slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
}
pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.slot_guard.buf
self.0.buf
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
self.slot_guard.buf
self.0.buf
}
}
@@ -297,23 +262,16 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
/// to initialize.
///
pub struct PageWriteGuard<'i> {
state: PageWriteGuardState<'i>,
}
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
enum PageWriteGuardState<'i> {
Invalid {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
_permit: PinnedSlotsPermit,
},
Downgraded,
// Are the page contents currently valid?
// Used to mark pages as invalid that are assigned but not yet filled with data.
valid: bool,
}
impl std::ops::DerefMut for PageWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => &mut inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
self.inner.buf
}
}
@@ -321,37 +279,25 @@ impl std::ops::Deref for PageWriteGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
match &self.state {
PageWriteGuardState::Invalid { inner, _permit } => &inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
self.inner.buf
}
}
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => &mut inner.buf,
PageWriteGuardState::Downgraded => todo!(),
}
self.inner.buf
}
}
impl<'a> PageWriteGuard<'a> {
impl PageWriteGuard<'_> {
/// Mark that the buffer contents are now valid.
#[must_use]
pub fn mark_valid(mut self) -> PageReadGuard<'a> {
let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
match prev {
PageWriteGuardState::Invalid { inner, _permit } => {
assert!(inner.key.is_some());
PageReadGuard {
_permit: Arc::new(_permit),
slot_guard: inner.downgrade(),
}
}
PageWriteGuardState::Downgraded => unreachable!(),
}
pub fn mark_valid(&mut self) {
assert!(self.inner.key.is_some());
assert!(
!self.valid,
"mark_valid called on a buffer that was already valid"
);
self.valid = true;
}
}
@@ -362,13 +308,11 @@ impl Drop for PageWriteGuard<'_> {
/// initializing it, remove the mapping from the page cache.
///
fn drop(&mut self) {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => {
let self_key = inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
inner.key = None;
}
PageWriteGuardState::Downgraded => {}
assert!(self.inner.key.is_some());
if !self.valid {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
}
}
}
@@ -381,7 +325,7 @@ pub enum ReadBufResult<'a> {
/// lock_for_write() return value
pub enum WriteBufResult<'a> {
Found(PageReadGuard<'a>),
Found(PageWriteGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
@@ -404,10 +348,6 @@ impl PageCache {
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
let Ok(permit) = self.try_get_pinned_slot_permit().await else {
return None;
};
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
@@ -422,10 +362,7 @@ impl PageCache {
lsn,
};
if let Some(guard) = self
.try_lock_for_read(&mut cache_key, &mut Some(permit))
.await
{
if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
@@ -455,7 +392,7 @@ impl PageCache {
/// Store an image of the given page in the cache.
///
pub async fn memorize_materialized_page(
&'static self,
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
@@ -472,15 +409,15 @@ impl PageCache {
};
match self.lock_for_write(&cache_key).await? {
WriteBufResult::Found(read_guard) => {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(*read_guard == img);
assert!(*write_guard == img);
}
WriteBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(img);
let _ = write_guard.mark_valid();
write_guard.mark_valid();
}
}
@@ -490,7 +427,7 @@ impl PageCache {
// Section 1.2: Public interface functions for working with immutable file pages.
pub async fn read_immutable_buf(
&'static self,
&self,
file_id: FileId,
blkno: u32,
ctx: &RequestContext,
@@ -508,16 +445,6 @@ impl PageCache {
// "mappings" after this section. But the routines in this section should
// not require changes.
async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
let _timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
Ok(PinnedSlotsPermit(
Arc::clone(&self.pinned_slots)
.acquire_owned()
.await
.unwrap(),
))
}
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the key
@@ -527,11 +454,7 @@ impl PageCache {
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
async fn try_lock_for_read(
&self,
cache_key: &mut CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageReadGuard> {
async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
@@ -541,10 +464,7 @@ impl PageCache {
let inner = slot.inner.read().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard {
_permit: inner.coalesce_readers_permit(permit.take().unwrap()),
slot_guard: inner,
});
return Some(PageReadGuard(inner));
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
@@ -583,12 +503,10 @@ impl PageCache {
/// ```
///
async fn lock_for_read(
&'static self,
&self,
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
@@ -605,21 +523,17 @@ impl PageCache {
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
debug_assert!(permit.is_some());
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -641,41 +555,27 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
},
inner,
valid: false,
}));
}
}
// FIXME: the name is wrong.
async fn try_lock_for_write(
&self,
cache_key: &CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageReadGuard> {
/// Look up a page in the cache and lock it in write mode. If it's not
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.read().await;
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard {
_permit: inner.coalesce_readers_permit(permit.take().unwrap()),
slot_guard: inner,
});
return Some(PageWriteGuard { inner, valid: true });
}
}
None
@@ -685,21 +585,16 @@ impl PageCache {
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
async fn lock_for_write(&'static self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
return Ok(WriteBufResult::Found(write_guard));
}
debug_assert!(permit.is_some());
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -721,19 +616,9 @@ impl PageCache {
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
},
inner,
valid: false,
}));
}
}
@@ -884,21 +769,8 @@ impl PageCache {
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
async fn find_victim(
&'static self,
_permit_witness: &PinnedSlotsPermit,
) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
// Get in line.
let mut receiver = self.find_victim_waiters.recv();
// If we get cancelled at the receiver.await below, the victim slot
// remains in the channel. Consume these first before going into
// the loop below.
match futures::poll!(&mut receiver) {
Poll::Ready(Ok(res)) => return Ok(res),
Poll::Ready(Err(_closed)) => unreachable!("we never close the channel"),
Poll::Pending => {} // the regular case where we aren't cancelled below
};
fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
iters += 1;
@@ -910,8 +782,14 @@ impl PageCache {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(_err) => {
if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
unreachable!("find_victim_waiters prevents starvation");
// If we have looped through the whole buffer pool 10 times
// and still haven't found a victim buffer, something's wrong.
// Maybe all the buffers were in locked. That could happen in
// theory, if you have more threads holding buffers locked than
// there are buffers in the pool. In practice, with a reasonably
// large buffer pool it really shouldn't happen.
if iters > iter_limit {
anyhow::bail!("exceeded evict iter limit");
}
continue;
}
@@ -921,11 +799,7 @@ impl PageCache {
self.remove_mapping(old_key);
inner.key = None;
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
self.find_victim_sender
.try_send((slot_idx, inner))
.expect("we always get in line first");
return Ok(receiver.await.unwrap());
return Ok((slot_idx, inner));
}
}
}
@@ -952,26 +826,18 @@ impl PageCache {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: tokio::sync::RwLock::new(SlotInner {
key: None,
buf,
permit: std::sync::Mutex::new(Weak::new()),
}),
inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
usage_count: AtomicU8::new(0),
}
})
.collect();
let (find_victim_sender, find_victim_waiters) = async_channel::bounded(num_pages);
Self {
materialized_page_map: Default::default(),
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
find_victim_sender,
find_victim_waiters,
}
}
}

View File

@@ -37,7 +37,7 @@ impl Key {
| self.field6 as i128
}
pub const fn from_i128(x: i128) -> Self {
pub fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,

View File

@@ -293,8 +293,6 @@ pub enum TaskKind {
DebugTool,
BackgroundRuntimeTurnaroundMeasure,
#[cfg(test)]
UnitTest,
}
@@ -458,7 +456,7 @@ async fn task_finish(
}
if shutdown_process {
shutdown_pageserver(None, 1).await;
shutdown_pageserver(1).await;
}
}

View File

@@ -57,7 +57,6 @@ use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
@@ -118,7 +117,7 @@ mod span;
pub mod metadata;
mod par_fsync;
pub mod remote_timeline_client;
mod remote_timeline_client;
pub mod storage_layer;
pub mod config;
@@ -158,7 +157,6 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option<GenericRemoteStorage>,
pub deletion_queue_client: DeletionQueueClient,
}
///
@@ -199,9 +197,6 @@ pub struct Tenant {
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -528,20 +523,15 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO dedup with spawn_load
let tenant_conf =
Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
let TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
} = resources;
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
@@ -550,8 +540,7 @@ impl Tenant {
wal_redo_manager,
tenant_id,
generation,
remote_storage.clone(),
deletion_queue_client,
Some(remote_storage.clone()),
));
// Do all the hard work in the background
@@ -582,7 +571,7 @@ impl Tenant {
let pending_deletion = {
match DeleteTenantFlow::should_resume_deletion(
conf,
remote_storage.as_ref(),
Some(&remote_storage),
&tenant_clone,
)
.await
@@ -671,7 +660,6 @@ impl Tenant {
for timeline_id in remote_timeline_ids {
let client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_id,
timeline_id,
@@ -738,7 +726,6 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
deletion_queue_client: self.deletion_queue_client.clone(),
},
ctx,
)
@@ -763,7 +750,6 @@ impl Tenant {
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
None,
)
.await
@@ -865,7 +851,6 @@ impl Tenant {
tenant_id,
Generation::broken(),
None,
DeletionQueueClient::broken(),
))
}
@@ -910,7 +895,6 @@ impl Tenant {
tenant_id,
generation,
remote_storage.clone(),
resources.deletion_queue_client.clone(),
);
let tenant = Arc::new(tenant);
@@ -1318,7 +1302,6 @@ impl Tenant {
timeline_id,
&local_metadata,
Some(remote_client),
self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -1368,7 +1351,6 @@ impl Tenant {
timeline_id,
&local_metadata,
None,
self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -2260,9 +2242,6 @@ impl Tenant {
Ok(timeline)
}
// Allow too_many_arguments because a constructor's argument list naturally grows with the
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
#[allow(clippy::too_many_arguments)]
fn new(
state: TenantState,
conf: &'static PageServerConf,
@@ -2271,7 +2250,6 @@ impl Tenant {
tenant_id: TenantId,
generation: Generation,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
) -> Tenant {
let (state, mut rx) = watch::channel(state);
@@ -2339,7 +2317,6 @@ impl Tenant {
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
deletion_queue_client,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2879,7 +2856,6 @@ impl Tenant {
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_id,
timeline_id,
@@ -2890,10 +2866,7 @@ impl Tenant {
None
};
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
}
TimelineResources { remote_client }
}
/// Creates intermediate timeline structure and its files.
@@ -3349,7 +3322,6 @@ pub mod harness {
use utils::logging;
use utils::lsn::Lsn;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::{
config::PageServerConf,
repository::Key,
@@ -3411,7 +3383,6 @@ pub mod harness {
pub generation: Generation,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: PathBuf,
pub deletion_queue: MockDeletionQueue,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3460,7 +3431,6 @@ pub mod harness {
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
Ok(Self {
conf,
@@ -3469,7 +3439,6 @@ pub mod harness {
generation: Generation::new(0xdeadbeef),
remote_storage,
remote_fs_dir,
deletion_queue,
})
}
@@ -3494,7 +3463,6 @@ pub mod harness {
self.tenant_id,
self.generation,
Some(self.remote_storage.clone()),
self.deletion_queue.new_client(),
));
tenant
.load(None, ctx)
@@ -4225,8 +4193,7 @@ mod tests {
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bulk_insert")?;
let (tenant, ctx) = harness.load().await;
let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4273,8 +4240,7 @@ mod tests {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_random_updates")?;
let (tenant, ctx) = harness.load().await;
let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;

View File

@@ -186,22 +186,27 @@ impl FileBlockReader {
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => return Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
return Ok(write_guard.mark_valid().into());
}
};
loop {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
}
}

View File

@@ -70,34 +70,38 @@ impl EphemeralFile {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.file.path.display(),
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
};
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.file.path.display(),
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -167,7 +171,7 @@ impl EphemeralFile {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
let _ = write_guard.mark_valid();
write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {

View File

@@ -20,10 +20,7 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::control_plane_client::ControlPlaneClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
@@ -119,28 +116,7 @@ pub async fn init_tenant_mgr(
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
let result = match client.re_attach().await {
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
};
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
resources
.deletion_queue_client
.recover(result.clone())
.await?;
}
Some(result)
Some(client.re_attach().await?)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
@@ -309,21 +285,29 @@ pub(crate) fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if resources.remote_storage.is_none() {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
if let Some(remote_storage) = resources.remote_storage {
match Tenant::spawn_attach(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
} else {
match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
generation,
resources.broker_client,
tenants,
remote_storage,
ctx,
) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
}
}
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(
conf,
tenant_id,
"attaching mark file present but no remote storage configured".to_string(),
)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -454,7 +438,8 @@ pub async fn create_tenant(
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
generation: Generation,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -465,9 +450,13 @@ pub async fn create_tenant(
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_resources = TenantSharedResources {
broker_client,
remote_storage,
};
let created_tenant =
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
generation, resources, None, &TENANTS, ctx)?;
generation, tenant_resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -633,7 +622,6 @@ pub async fn load_tenant(
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -647,7 +635,6 @@ pub async fn load_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client
};
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
.with_context(|| {
@@ -715,7 +702,8 @@ pub async fn attach_tenant(
tenant_id: TenantId,
generation: Generation,
tenant_conf: TenantConfOpt,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
@@ -730,7 +718,10 @@ pub async fn attach_tenant(
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let resources = TenantSharedResources {
broker_client,
remote_storage: Some(remote_storage),
};
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233

View File

@@ -116,12 +116,8 @@
//! # Completion
//!
//! Once an operation has completed, we update
//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
//! and submit a request through the DeletionQueue to update
//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
//! validated that our generation is not stale. It is this visible value
//! that is advertized to safekeepers as a signal that that they can
//! delete the WAL up to that LSN.
//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
//! to safekeepers that they can delete the WAL up to that LSN.
//!
//! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
//! for all pending operations to complete. It does not prevent more
@@ -204,6 +200,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod delete;
mod download;
pub mod index;
mod upload;
@@ -229,7 +226,6 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -328,8 +324,6 @@ pub struct RemoteTimelineClient {
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
}
impl RemoteTimelineClient {
@@ -341,7 +335,6 @@ impl RemoteTimelineClient {
///
pub fn new(
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -359,7 +352,6 @@ impl RemoteTimelineClient {
timeline_id,
generation,
storage_impl: remote_storage,
deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
@@ -421,24 +413,13 @@ impl RemoteTimelineClient {
Ok(())
}
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
match &*self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
UploadQueue::Stopped(q) => q
.upload_queue_for_deletion
.get_last_remote_consistent_lsn_projected(),
}
}
pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
UploadQueue::Stopped(q) => Some(
q.upload_queue_for_deletion
.get_last_remote_consistent_lsn_visible(),
),
UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
UploadQueue::Stopped(q) => {
Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
}
}
}
@@ -453,11 +434,11 @@ impl RemoteTimelineClient {
} else {
0
};
self.metrics.remote_physical_size_set(size);
self.metrics.remote_physical_size_gauge().set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_get()
self.metrics.remote_physical_size_gauge().get()
}
//
@@ -662,7 +643,7 @@ impl RemoteTimelineClient {
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: Vec<LayerFileName>,
names: &[LayerFileName],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -682,10 +663,10 @@ impl RemoteTimelineClient {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(&name);
let meta = upload_queue.latest_files.remove(name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -707,16 +688,18 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
// schedule the actual deletions
let op = UploadOp::Delete(Delete {
layers: with_generations,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
for (name, generation) in with_generations {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: false,
generation,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {name}");
}
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
@@ -850,7 +833,9 @@ impl RemoteTimelineClient {
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
let layers: Vec<RemotePath> = {
let (mut receiver, deletions_queued) = {
let mut deletions_queued = 0;
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
@@ -862,30 +847,42 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
.latest_files
.drain()
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_id,
&self.timeline_id,
&file_name,
meta.generation,
)
})
.collect()
.queued_operations
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
// schedule the actual deletions
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: true,
generation: meta.generation,
});
self.calls_unfinished_metric_begin(&op);
stopped
.upload_queue_for_deletion
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {name}");
deletions_queued += 1;
}
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
(
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
deletions_queued,
)
};
let layer_deletion_count = layers.len();
self.deletion_queue_client.push_immediate(layers).await?;
receiver.changed().await.context("upload queue shut down")?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.deletion_queue_client.flush_immediate().await?;
let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -913,9 +910,17 @@ impl RemoteTimelineClient {
})
.collect();
let not_referenced_count = remaining.len();
if !remaining.is_empty() {
self.deletion_queue_client.push_immediate(remaining).await?;
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -926,14 +931,18 @@ impl RemoteTimelineClient {
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("enqueuing index part deletion");
self.deletion_queue_client
.push_immediate([index_file_path].to_vec())
.await?;
debug!("deleting index part");
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.deletion_queue_client.flush_immediate().await?;
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -941,7 +950,7 @@ impl RemoteTimelineClient {
))?
});
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
Ok(())
}
@@ -1131,16 +1140,21 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => self
.deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
)
.await
.map_err(|e| anyhow::anyhow!(e)),
UploadOp::Delete(delete) => {
let path = &self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(delete.layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
delete.file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1196,12 +1210,18 @@ impl RemoteTimelineClient {
}
// The task has completed successfully. Remove it from the in-progress list.
let lsn_update = {
{
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
UploadQueue::Stopped(_stopped) => {
None
UploadQueue::Stopped(stopped) => {
// Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
// then stop() took care of it so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
match &task.op {
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
_ => None
}
},
UploadQueue::Initialized(qi) => { Some(qi) }
};
@@ -1216,51 +1236,23 @@ impl RemoteTimelineClient {
upload_queue.inprogress_tasks.remove(&task.task_id);
let lsn_update = match task.op {
match task.op {
UploadOp::UploadLayer(_, _) => {
upload_queue.num_inprogress_layer_uploads -= 1;
None
}
UploadOp::UploadMetadata(_, lsn) => {
upload_queue.num_inprogress_metadata_uploads -= 1;
// XXX monotonicity check?
upload_queue.projected_remote_consistent_lsn = Some(lsn);
if self.generation.is_none() {
// Legacy mode: skip validating generation
upload_queue.visible_remote_consistent_lsn.store(lsn);
None
} else {
Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
}
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
None
}
UploadOp::Barrier(_) => unreachable!(),
};
// Launch any queued tasks that were unblocked by this one.
self.launch_queued_tasks(upload_queue);
lsn_update
};
if let Some((lsn, slot)) = lsn_update {
// Updates to the remote_consistent_lsn we advertise to pageservers
// are all routed through the DeletionQueue, to enforce important
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
self.deletion_queue_client
.update_remote_consistent_lsn(
self.tenant_id,
self.timeline_id,
self.generation,
lsn,
slot,
)
.await;
}
self.calls_unfinished_metric_end(&task.op);
}
@@ -1286,8 +1278,8 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(_delete) => (
RemoteOpFileKind::Layer,
UploadOp::Delete(delete) => (
delete.file_kind,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
@@ -1349,10 +1341,7 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
.clone(),
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
@@ -1416,13 +1405,13 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
generation: Generation,
layer_meta: &LayerFileMetadata,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
generation.get_suffix()
layer_meta.generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1565,6 +1554,7 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
// Use a current-thread runtime in the test
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
@@ -1590,7 +1580,6 @@ mod tests {
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_id,
@@ -1760,7 +1749,7 @@ mod tests {
)
.unwrap();
client
.schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -1786,7 +1775,6 @@ mod tests {
// Finish them
client.wait_completion().await.unwrap();
harness.deletion_queue.pump().await;
assert_remote_files(
&[

View File

@@ -0,0 +1,34 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::{
config::PageServerConf,
tenant::{remote_timeline_client::remote_path, Generation},
};
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
generation: Generation,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
// We don't want to print an error if the delete failed if the file has
// already been deleted. Thankfully, in this situation S3 already
// does not yield an error. While OS-provided local file system APIs do yield
// errors, we avoid them in the `LocalFs` wrapper.
storage
.delete(&path_to_delete)
.await
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
}

View File

@@ -50,12 +50,7 @@ pub async fn download_layer_file<'a>(
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_id,
&timeline_id,
layer_file_name,
layer_metadata.generation,
);
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:

View File

@@ -864,11 +864,11 @@ impl DeltaLayerInner {
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
}
}

View File

@@ -457,11 +457,11 @@ impl ImageLayerInner {
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
}
}

View File

@@ -38,7 +38,6 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
@@ -144,7 +143,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
pub deletion_queue_client: DeletionQueueClient,
}
pub struct Timeline {
@@ -523,23 +521,9 @@ impl Timeline {
self.disk_consistent_lsn.load()
}
/// remote_consistent_lsn from the perspective of the tenant's current generation,
/// not validated with control plane yet.
/// See [`Self::get_remote_consistent_lsn_visible`].
pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_projected()
} else {
None
}
}
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
/// i.e. a value of remote_consistent_lsn_projected which has undergone
/// generation validation in the deletion queue.
pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_visible()
remote_client.last_uploaded_consistent_lsn()
} else {
None
}
@@ -559,7 +543,7 @@ impl Timeline {
}
pub fn resident_physical_size(&self) -> u64 {
self.metrics.resident_physical_size_get()
self.metrics.resident_physical_size_gauge.get()
}
///
@@ -655,38 +639,38 @@ impl Timeline {
) -> anyhow::Result<()> {
const ROUNDS: usize = 2;
// static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
// once_cell::sync::Lazy::new(|| {
// let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
// let permits = usize::max(
// 1,
// // while a lot of the work is done on spawn_blocking, we still do
// // repartitioning in the async context. this should give leave us some workers
// // unblocked to be blocked on other work, hopefully easing any outside visible
// // effects of restarts.
// //
// // 6/8 is a guess; previously we ran with unlimited 8 and more from
// // spawn_blocking.
// (total_threads * 3).checked_div(4).unwrap_or(0),
// );
// assert_ne!(permits, 0, "we will not be adding in permits later");
// assert!(
// permits < total_threads,
// "need threads avail for shorter work"
// );
// tokio::sync::Semaphore::new(permits)
// });
static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should give leave us some workers
// unblocked to be blocked on other work, hopefully easing any outside visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
// // this wait probably never needs any "long time spent" logging, because we already nag if
// // compaction task goes over it's period (20s) which is quite often in production.
// let _permit = tokio::select! {
// permit = CONCURRENT_COMPACTIONS.acquire() => {
// permit
// },
// _ = cancel.cancelled() => {
// return Ok(());
// }
// };
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let _permit = tokio::select! {
permit = CONCURRENT_COMPACTIONS.acquire() => {
permit
},
_ = cancel.cancelled() => {
return Ok(());
}
};
let last_record_lsn = self.get_last_record_lsn();
@@ -1309,7 +1293,10 @@ impl Timeline {
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics.resident_physical_size_sub(layer_file_size);
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
self.metrics.evictions.inc();
if let Some(delta) = local_layer_residence_duration {
@@ -1833,7 +1820,7 @@ impl Timeline {
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(needs_cleanup)?;
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -1843,7 +1830,9 @@ impl Timeline {
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
);
self.metrics.resident_physical_size_set(total_physical_size);
self.metrics
.resident_physical_size_gauge
.set(total_physical_size);
timer.stop_and_record();
Ok(())
@@ -3886,7 +3875,7 @@ impl Timeline {
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
Ok(())
@@ -4221,7 +4210,7 @@ impl Timeline {
}
if let Some(remote_client) = &self.remote_client {
remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
apply.flush();
@@ -4393,7 +4382,7 @@ impl Timeline {
// XXX the temp file is still around in Err() case
// and consumes space until we clean up upon pageserver restart.
self_clone.metrics.resident_physical_size_add(*size);
self_clone.metrics.resident_physical_size_gauge.add(*size);
// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.

View File

@@ -14,7 +14,6 @@ use utils::{
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -408,7 +407,6 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -418,10 +416,7 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
TimelineResources {
remote_client,
deletion_queue_client,
},
TimelineResources { remote_client },
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -263,7 +263,7 @@ impl LayerManager {
let desc = layer.layer_desc();
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
metrics.resident_physical_size_sub(desc.file_size);
metrics.resident_physical_size_gauge.sub(desc.file_size);
}
// TODO Removing from the bottom of the layer map is expensive.

View File

@@ -370,9 +370,8 @@ pub(super) async fn handle_walreceiver_connection(
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn = timeline
.get_remote_consistent_lsn_visible()
.unwrap_or(Lsn(0));
let timeline_remote_consistent_lsn =
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let last_received_lsn = last_lsn;

View File

@@ -1,3 +1,5 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
@@ -9,7 +11,6 @@ use std::fmt::Debug;
use chrono::NaiveDateTime;
use std::sync::Arc;
use tracing::info;
use utils::lsn::AtomicLsn;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
@@ -57,12 +58,7 @@ pub(crate) struct UploadQueueInitialized {
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
///
/// visible_remote_consistent_lsn is only updated after our generation has been validated with
/// the control plane (unlesss a timeline's generation is None, in which case
/// we skip validation)
pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
pub(crate) last_uploaded_consistent_lsn: Lsn,
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
@@ -85,14 +81,6 @@ impl UploadQueueInitialized {
pub(super) fn no_pending_work(&self) -> bool {
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
}
pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
self.visible_remote_consistent_lsn.load()
}
pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
self.projected_remote_consistent_lsn
}
}
#[derive(Clone, Copy)]
@@ -126,8 +114,9 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -169,10 +158,7 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
visible_remote_consistent_lsn: Arc::new(
index_part.metadata.disk_consistent_lsn().into(),
),
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
@@ -215,11 +201,12 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
/// A deletion of some layers within the lifetime of a timeline. This is not used
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) layers: Vec<(LayerFileName, Generation)>,
pub(crate) file_kind: RemoteOpFileKind,
pub(crate) layer_file_name: LayerFileName,
pub(crate) scheduled_from_timeline_delete: bool,
pub(crate) generation: Generation,
}
#[derive(Debug)]
@@ -230,7 +217,7 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete layer files
/// Delete a layer file
Delete(Delete),
/// Barrier. When the barrier operation is reached,
@@ -252,9 +239,13 @@ impl std::fmt::Display for UploadOp {
UploadOp::UploadMetadata(_, lsn) => {
write!(f, "UploadMetadata(lsn: {})", lsn)
}
UploadOp::Delete(delete) => {
write!(f, "Delete({} layers)", delete.layers.len(),)
}
UploadOp::Delete(delete) => write!(
f,
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
delete.layer_file_name.file_name(),
delete.scheduled_from_timeline_delete,
delete.generation
),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}

View File

@@ -18,8 +18,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use std::sync::{RwLock, RwLockWriteGuard};
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -111,7 +110,7 @@ impl OpenFiles {
///
/// On return, we hold a lock on the slot, and its 'tag' has been updated
/// recently_used has been set. It's all ready for reuse.
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
//
// Run the clock algorithm to find a slot to replace.
//
@@ -143,7 +142,7 @@ impl OpenFiles {
}
retries += 1;
} else {
slot_guard = slot.inner.write().await;
slot_guard = slot.inner.write().unwrap();
index = next;
break;
}
@@ -154,7 +153,7 @@ impl OpenFiles {
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::CloseByReplace)
@@ -209,29 +208,6 @@ impl CrashsafeOverwriteError {
}
}
/// Observe duration for the given storage I/O operation
///
/// Unlike `observe_closure_duration`, this supports async,
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
($op:expr, $($body:tt)*) => {{
let instant = Instant::now();
let result = $($body)*;
let elapsed = instant.elapsed().as_secs_f64();
STORAGE_IO_TIME_METRIC
.get($op)
.observe(elapsed);
result
}}
}
macro_rules! with_file {
($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
let $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
}
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
@@ -268,9 +244,11 @@ impl VirtualFile {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| open_options.open(path))?;
// Strip all options other than read and write.
//
@@ -353,24 +331,22 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File.
pub async fn sync_all(&self) -> Result<(), Error> {
with_file!(self, StorageIoOperation::Fsync, |file| file
.as_ref()
.sync_all())
self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
.await?
}
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
with_file!(self, StorageIoOperation::Metadata, |file| file
.as_ref()
.metadata())
self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
.await?
}
/// Helper function internal to `VirtualFile` that looks up the underlying File,
/// opens it and evicts some other File if necessary. The passed parameter is
/// assumed to be a function available for the physical `File`.
///
/// We are doing it via a macro as Rust doesn't support async closures that
/// take on parameters with lifetimes.
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
/// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File.
async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
where
F: FnMut(&File) -> R,
{
let open_files = get_open_files();
let mut handle_guard = {
@@ -380,23 +356,27 @@ impl VirtualFile {
// We only need to hold the handle lock while we read the current handle. If
// another thread closes the file and recycles the slot for a different file,
// we will notice that the handle we read is no longer valid and retry.
let mut handle = *self.handle.read().await;
let mut handle = *self.handle.read().unwrap();
loop {
// Check if the slot contains our File
{
let slot = &open_files.slots[handle.index];
let slot_guard = slot.inner.read().await;
if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(FileGuard { slot_guard });
let slot_guard = slot.inner.read().unwrap();
if slot_guard.tag == handle.tag {
if let Some(file) = &slot_guard.file {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(file)));
}
}
}
// The slot didn't contain our File. We will have to open it ourselves,
// but before that, grab a write lock on handle in the VirtualFile, so
// that no other thread will try to concurrently open the same file.
let handle_guard = self.handle.write().await;
let handle_guard = self.handle.write().unwrap();
// If another thread changed the handle while we were not holding the lock,
// then the handle might now be valid again. Loop back to retry.
@@ -410,10 +390,17 @@ impl VirtualFile {
// We need to open the file ourselves. The handle in the VirtualFile is
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
let (handle, mut slot_guard) = open_files.find_victim_slot();
// Open the physical file
let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(&file));
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
@@ -421,9 +408,7 @@ impl VirtualFile {
*handle_guard = handle;
return Ok(FileGuard {
slot_guard: slot_guard.downgrade(),
});
Ok(result)
}
pub fn remove(self) {
@@ -438,9 +423,11 @@ impl VirtualFile {
self.pos = offset;
}
SeekFrom::End(offset) => {
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
.as_ref()
.seek(SeekFrom::End(offset)))?
self.pos = self
.with_file(StorageIoOperation::Seek, |mut file| {
file.seek(SeekFrom::End(offset))
})
.await??
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
@@ -528,9 +515,9 @@ impl VirtualFile {
}
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Read, |file| file
.as_ref()
.read_at(buf, offset));
let result = self
.with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
.await?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -540,9 +527,9 @@ impl VirtualFile {
}
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = with_file!(self, StorageIoOperation::Write, |file| file
.as_ref()
.write_at(buf, offset));
let result = self
.with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
.await?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -552,18 +539,6 @@ impl VirtualFile {
}
}
struct FileGuard<'a> {
slot_guard: RwLockReadGuard<'a, SlotInner>,
}
impl<'a> AsRef<File> for FileGuard<'a> {
fn as_ref(&self) -> &File {
// This unwrap is safe because we only create `FileGuard`s
// if we know that the file is Some.
self.slot_guard.file.as_ref().unwrap()
}
}
#[cfg(test)]
impl VirtualFile {
pub(crate) async fn read_blk(
@@ -596,39 +571,20 @@ impl VirtualFile {
impl Drop for VirtualFile {
/// If a VirtualFile is dropped, close the underlying file if it was open.
fn drop(&mut self) {
let handle = self.handle.get_mut();
let handle = self.handle.get_mut().unwrap();
fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
if slot_guard.tag == tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also the `CloseByReplace` operation for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
// We don't have async drop so we cannot directly await the lock here.
// Instead, first do a best-effort attempt at closing the underlying
// file descriptor by using `try_write`, and if that fails, spawn
// a tokio task to do it asynchronously: we just want it to be
// cleaned up eventually.
// Most of the time, the `try_lock` should succeed though,
// as we have `&mut self` access. In other words, if the slot
// is still occupied by our file, there should be no access from
// other I/O operations; the only other possible place to lock
// the slot is the lock algorithm looking for free slots.
// We could check with a read-lock first, to avoid waiting on an
// unrelated I/O.
let slot = &get_open_files().slots[handle.index];
if let Ok(slot_guard) = slot.inner.try_write() {
clean_slot(slot, slot_guard, handle.tag);
} else {
let tag = handle.tag;
tokio::spawn(async move {
let slot_guard = slot.inner.write().await;
clean_slot(slot, slot_guard, tag);
});
};
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
}

View File

@@ -741,13 +741,6 @@ NeonProcessUtility(
break;
case T_DropdbStmt:
HandleDropDb(castNode(DropdbStmt, parseTree));
/*
* We do this here to hack around the fact that Postgres performs the drop
* INSIDE of standard_ProcessUtility, which means that if we try to
* abort the drop normally it'll be too late. DROP DATABASE can't be inside
* of a transaction block anyway, so this should be fine to do.
*/
NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
break;
case T_CreateRoleStmt:
HandleCreateRole(castNode(CreateRoleStmt, parseTree));

View File

@@ -14,6 +14,7 @@
*/
#include <sys/file.h>
#include <sys/statvfs.h>
#include <unistd.h>
#include <fcntl.h>
@@ -37,6 +38,9 @@
#include "storage/fd.h"
#include "storage/pg_shmem.h"
#include "storage/buf_internals.h"
#include "storage/procsignal.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
/*
* Local file cache is used to temporary store relations pages in local file system.
@@ -62,6 +66,9 @@
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
typedef struct FileCacheEntry
{
BufferTag key;
@@ -84,12 +91,14 @@ static int lfc_desc = 0;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_free_space_watermark;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
void FileCacheMonitorMain(Datum main_arg);
@@ -245,6 +254,80 @@ lfc_change_limit_hook(int newval, void *extra)
LWLockRelease(lfc_lock);
}
/*
* Local file system state monitor check available free space.
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
* but throwing away least recently accessed chunks.
* First time low space watermark is reached cache size is divided by two,
* second time by four,... Finally we remove all chunks from local cache.
*
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
* We only throw away cached chunks but do not prevent from filling cache by new chunks.
*
* Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
* disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
* Calling statvfs each second should not add any noticeable overhead.
*/
void
FileCacheMonitorMain(Datum main_arg)
{
/*
* Choose file system state monitor interval so that space can not be exosted
* during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
*/
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
BackgroundWorkerUnblockSignals();
/* Periodically dump buffers until terminated. */
while (!ShutdownRequestPending)
{
if (lfc_size_limit != 0)
{
struct statvfs sfs;
if (statvfs(lfc_path, &sfs) < 0)
{
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
}
else
{
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
{
if (lfc_shrinking_factor < 31) {
lfc_shrinking_factor += 1;
}
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
}
else
lfc_shrinking_factor = 0; /* reset to initial value */
}
}
pg_usleep(monitor_interval);
}
}
static void
lfc_register_free_space_monitor(void)
{
BackgroundWorker bgw;
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
bgw.bgw_restart_time = 5;
bgw.bgw_notify_pid = 0;
bgw.bgw_main_arg = (Datum) 0;
RegisterBackgroundWorker(&bgw);
}
void
lfc_init(void)
{
@@ -281,6 +364,19 @@ lfc_init(void)
lfc_change_limit_hook,
NULL);
DefineCustomIntVariable("neon.free_space_watermark",
"Minimal free space in local file system after reaching which local file cache will be truncated",
NULL,
&lfc_free_space_watermark,
1024, /* 1GB */
0,
INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MB,
NULL,
NULL,
NULL);
DefineCustomStringVariable("neon.file_cache_path",
"Path to local file cache (can be raw device)",
NULL,
@@ -295,6 +391,9 @@ lfc_init(void)
if (lfc_max_size == 0)
return;
if (lfc_free_space_watermark != 0)
lfc_register_free_space_monitor();
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = lfc_shmem_startup;
#if PG_VERSION_NUM>=150000

View File

@@ -1790,14 +1790,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if (!XLogInsertAllowed())
return;
/* ensure we have enough xlog buffers to log max-sized records */
XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
/*
* Iterate over all the pages. They are collected into batches of
* XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each
* batch.
*/
while (remblocks > 0)
{
int count = Min(remblocks, XLR_MAX_BLOCK_ID);

View File

@@ -42,7 +42,6 @@ reqwest-middleware.workspace = true
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
routerify.workspace = true
rustc-hash.workspace = true
rustls-pemfile.workspace = true
rustls.workspace = true
scopeguard.workspace = true

View File

@@ -160,19 +160,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Test(_) => Some("test".to_owned()),
}
}
/// Get username from the credentials.
pub fn get_user(&self) -> &str {
use BackendType::*;
match self {
Console(_, creds) => creds.user,
Postgres(_, creds) => creds.user,
Link(_) => "link",
Test(_) => "test",
}
}
/// Authenticate the client via the requested backend, possibly using credentials.
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(

View File

@@ -17,12 +17,11 @@ use std::{
use tokio::time;
use tokio_postgres::AsyncMessage;
use crate::{
auth, console,
metrics::{Ids, MetricCounter, USAGE_METRICS},
};
use crate::{auth, console};
use crate::{compute, config};
use super::sql_over_http::MAX_RESPONSE_SIZE;
use crate::proxy::ConnectMechanism;
use tracing::{error, warn};
@@ -401,6 +400,7 @@ async fn connect_to_compute_once(
.user(&conn_info.username)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect_timeout(timeout)
.connect(tokio_postgres::NoTls)
.await?;
@@ -412,10 +412,6 @@ async fn connect_to_compute_once(
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
});
let ids = Ids {
endpoint_id: node_info.aux.endpoint_id.to_string(),
branch_id: node_info.aux.branch_id.to_string(),
};
tokio::spawn(
poll_fn(move |cx| {
@@ -454,18 +450,10 @@ async fn connect_to_compute_once(
Ok(Client {
inner: client,
session: tx,
ids,
})
}
pub struct Client {
pub inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender<uuid::Uuid>,
ids: Ids,
}
impl Client {
pub fn metrics(&self) -> Arc<MetricCounter> {
USAGE_METRICS.register(self.ids.clone())
}
}

View File

@@ -3,12 +3,10 @@ use std::sync::Arc;
use anyhow::bail;
use futures::pin_mut;
use futures::StreamExt;
use hashbrown::HashMap;
use hyper::body::HttpBody;
use hyper::header;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::Response;
use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Map;
@@ -18,11 +16,7 @@ use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::Row;
use tracing::error;
use tracing::instrument;
use url::Url;
use utils::http::error::ApiError;
use utils::http::json::json_response;
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;
@@ -45,6 +39,7 @@ enum Payload {
Batch(BatchQueryData),
}
pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -187,45 +182,7 @@ pub async fn handle(
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
session_id: uuid::Uuid,
) -> Result<Response<Body>, ApiError> {
let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
let mut response = match result {
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
let code = match e.downcast_ref::<tokio_postgres::Error>() {
Some(e) => match e.code() {
Some(e) => serde_json::to_value(e.code()).unwrap(),
None => Value::Null,
},
None => Value::Null,
};
error!(
?code,
"sql-over-http per-client task finished with an error: {e:#}"
);
// TODO: this shouldn't always be bad request.
json_response(
StatusCode::BAD_REQUEST,
json!({ "message": message, "code": code }),
)?
}
};
response.headers_mut().insert(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
Ok(response)
}
#[instrument(name = "sql-over-http", skip_all)]
async fn handle_inner(
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
session_id: uuid::Uuid,
) -> anyhow::Result<Response<Body>> {
) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
//
// Determine the destination and connection params
//
@@ -276,18 +233,13 @@ async fn handle_inner(
let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
let mut response = Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/json");
//
// Now execute the query and return the result
//
let mut size = 0;
let result = match payload {
Payload::Single(query) => {
query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
}
Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
.await
.map(|x| (x, HashMap::default())),
Payload::Batch(batch_query) => {
let mut results = Vec::new();
let mut builder = client.inner.build_transaction();
@@ -302,8 +254,7 @@ async fn handle_inner(
}
let transaction = builder.start().await?;
for query in batch_query.queries {
let result =
query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
let result = query_to_json(&transaction, query, raw_output, array_mode).await;
match result {
Ok(r) => results.push(r),
Err(e) => {
@@ -313,27 +264,26 @@ async fn handle_inner(
}
}
transaction.commit().await?;
let mut headers = HashMap::default();
if txn_read_only {
response = response.header(
headers.insert(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
}
if txn_deferrable {
response = response.header(
headers.insert(
TXN_DEFERRABLE.clone(),
HeaderValue::try_from(txn_deferrable.to_string())?,
);
}
if let Some(txn_isolation_level) = txn_isolation_level_raw {
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
}
Ok(json!({ "results": results }))
Ok((json!({ "results": results }), headers))
}
};
let metrics = client.metrics();
if allow_pool {
let current_span = tracing::Span::current();
// return connection to the pool
@@ -343,30 +293,12 @@ async fn handle_inner(
});
}
match result {
Ok(value) => {
// how could this possibly fail
let body = serde_json::to_string(&value).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
// count the egress bytes - we miss the TLS and header overhead but oh well...
// moving this later in the stack is going to be a lot of effort and ehhhh
metrics.record_egress(len as u64);
Ok(response)
}
Err(e) => Err(e),
}
result
}
async fn query_to_json<T: GenericClient>(
client: &T,
data: QueryData,
current_size: &mut usize,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<Value> {
@@ -380,10 +312,16 @@ async fn query_to_json<T: GenericClient>(
// big.
pin_mut!(row_stream);
let mut rows: Vec<tokio_postgres::Row> = Vec::new();
let mut current_size = 0;
while let Some(row) = row_stream.next().await {
let row = row?;
*current_size += row.body_len();
current_size += row.body_len();
rows.push(row);
if current_size > MAX_RESPONSE_SIZE {
return Err(anyhow::anyhow!(
"response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
));
}
}
// grab the command tag and number of rows affected

View File

@@ -7,6 +7,7 @@ use crate::{
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hashbrown::HashMap;
use hyper::{
server::{
accept,
@@ -17,6 +18,7 @@ use hyper::{
};
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use serde_json::{json, Value};
use std::{
convert::Infallible,
@@ -202,7 +204,44 @@ async fn ws_handler(
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
.instrument(info_span!("sql-over-http"))
.await;
let status_code = match result {
Ok(_) => StatusCode::OK,
Err(_) => StatusCode::BAD_REQUEST,
};
let (json, headers) = match result {
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
let code = match e.downcast_ref::<tokio_postgres::Error>() {
Some(e) => match e.code() {
Some(e) => serde_json::to_value(e.code()).unwrap(),
None => Value::Null,
},
None => Value::Null,
};
error!(
?code,
"sql-over-http per-client task finished with an error: {e:#}"
);
(
json!({ "message": message, "code": code }),
HashMap::default(),
)
}
};
json_response(status_code, json).map(|mut r| {
r.headers_mut().insert(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
for (k, v) in headers {
r.headers_mut().insert(k, v);
}
r
})
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST")
@@ -214,7 +253,7 @@ async fn ws_handler(
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.map_err(|e| ApiError::InternalServerError(e.into()))
.map_err(|e| ApiError::BadRequest(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")
}

View File

@@ -3,18 +3,9 @@
use crate::{config::MetricCollectionConfig, http};
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use dashmap::{mapref::entry::Entry, DashMap};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::{
convert::Infallible,
sync::{
atomic::{AtomicU64, AtomicUsize, Ordering},
Arc,
},
time::Duration,
};
use tracing::{error, info, instrument, trace};
use serde::Serialize;
use std::{collections::HashMap, convert::Infallible, time::Duration};
use tracing::{error, info, instrument, trace, warn};
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
@@ -27,95 +18,12 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// Both the proxy and the ingestion endpoint will live in the same region (or cell)
/// so while the project-id is unique across regions the whole pipeline will work correctly
/// because we enrich the event with project_id in the control-plane endpoint.
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
pub struct Ids {
pub endpoint_id: String,
pub branch_id: String,
}
#[derive(Debug)]
pub struct MetricCounter {
transmitted: AtomicU64,
opened_connections: AtomicUsize,
}
impl MetricCounter {
/// Record that some bytes were sent from the proxy to the client
pub fn record_egress(&self, bytes: u64) {
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
}
/// extract the value that should be reported
fn should_report(self: &Arc<Self>) -> Option<u64> {
// heuristic to see if the branch is still open
// if a clone happens while we are observing, the heuristic will be incorrect.
//
// Worst case is that we won't report an event for this endpoint.
// However, for the strong count to be 1 it must have occured that at one instant
// all the endpoints were closed, so missing a report because the endpoints are closed is valid.
let is_open = Arc::strong_count(self) > 1;
let opened = self.opened_connections.swap(0, Ordering::AcqRel);
// update cached metrics eagerly, even if they can't get sent
// (to avoid sending the same metrics twice)
// see the relevant discussion on why to do so even if the status is not success:
// https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
let value = self.transmitted.swap(0, Ordering::AcqRel);
// Our only requirement is that we report in every interval if there was an open connection
// if there were no opened connections since, then we don't need to report
if value == 0 && !is_open && opened == 0 {
None
} else {
Some(value)
}
}
/// Determine whether the counter should be cleared from the global map.
fn should_clear(self: &mut Arc<Self>) -> bool {
// we can't clear this entry if it's acquired elsewhere
let Some(counter) = Arc::get_mut(self) else {
return false;
};
let opened = *counter.opened_connections.get_mut();
let value = *counter.transmitted.get_mut();
// clear if there's no data to report
value == 0 && opened == 0
}
}
// endpoint and branch IDs are not user generated so we don't run the risk of hash-dos
type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
#[derive(Default)]
pub struct Metrics {
endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
}
impl Metrics {
/// Register a new byte metrics counter for this endpoint
pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
let entry = if let Some(entry) = self.endpoints.get(&ids) {
entry.clone()
} else {
self.endpoints
.entry(ids)
.or_insert_with(|| {
Arc::new(MetricCounter {
transmitted: AtomicU64::new(0),
opened_connections: AtomicUsize::new(0),
})
})
.clone()
};
entry.opened_connections.fetch_add(1, Ordering::AcqRel);
entry
}
}
pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
info!("metrics collector config: {config:?}");
scopeguard::defer! {
@@ -123,83 +31,145 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
}
let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
let mut prev = Utc::now();
let mut ticker = tokio::time::interval(config.interval);
loop {
ticker.tick().await;
let now = Utc::now();
collect_metrics_iteration(
&USAGE_METRICS,
let res = collect_metrics_iteration(
&http_client,
&mut cached_metrics,
&config.endpoint,
&hostname,
prev,
now,
)
.await;
prev = now;
match res {
Err(e) => error!("failed to send consumption metrics: {e} "),
Ok(_) => trace!("periodic metrics collection completed successfully"),
}
}
}
fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
let metrics = prometheus::default_registry().gather();
for m in metrics {
if m.get_name() == "proxy_io_bytes_per_client" {
for ms in m.get_metric() {
let direction = ms
.get_label()
.iter()
.find(|l| l.get_name() == "direction")
.unwrap()
.get_value();
// Only collect metric for outbound traffic
if direction == "tx" {
let endpoint_id = ms
.get_label()
.iter()
.find(|l| l.get_name() == "endpoint_id")
.unwrap()
.get_value();
let branch_id = ms
.get_label()
.iter()
.find(|l| l.get_name() == "branch_id")
.unwrap()
.get_value();
let value = ms.get_counter().get_value() as u64;
// Report if the metric value is suspiciously large
if value > (1u64 << 40) {
warn!(
"potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
branch_id, endpoint_id, value
);
}
current_metrics.push((
Ids {
endpoint_id: endpoint_id.to_string(),
branch_id: branch_id.to_string(),
},
(value, Utc::now()),
));
}
}
}
}
current_metrics
}
#[instrument(skip_all)]
async fn collect_metrics_iteration(
metrics: &Metrics,
client: &http::ClientWithMiddleware,
cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
metric_collection_endpoint: &reqwest::Url,
hostname: &str,
prev: DateTime<Utc>,
now: DateTime<Utc>,
) {
) -> anyhow::Result<()> {
info!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
);
let mut metrics_to_clear = Vec::new();
let current_metrics = gather_proxy_io_bytes_per_client();
let metrics_to_send: Vec<(Ids, u64)> = metrics
.endpoints
let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
.iter()
.filter_map(|counter| {
let key = counter.key().clone();
let Some(value) = counter.should_report() else {
metrics_to_clear.push(key);
return None;
.filter_map(|(curr_key, (curr_val, curr_time))| {
let mut start_time = *curr_time;
let mut value = *curr_val;
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
// Only send metrics updates if the metric has increased
if curr_val > prev_val {
value = curr_val - prev_val;
start_time = *prev_time;
} else {
if curr_val < prev_val {
error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
prev_val, curr_val, curr_key);
}
return None;
}
};
Some((key, value))
Some(Event {
kind: EventType::Incremental {
start_time,
stop_time: *curr_time,
},
metric: PROXY_IO_BYTES_PER_CLIENT,
idempotency_key: idempotency_key(hostname),
value,
extra: Ids {
endpoint_id: curr_key.endpoint_id.clone(),
branch_id: curr_key.branch_id.clone(),
},
})
})
.collect();
if metrics_to_send.is_empty() {
trace!("no new metrics to send");
return Ok(());
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
let events = chunk
.iter()
.map(|(ids, value)| Event {
kind: EventType::Incremental {
start_time: prev,
stop_time: now,
},
metric: PROXY_IO_BYTES_PER_CLIENT,
idempotency_key: idempotency_key(hostname),
value: *value,
extra: Ids {
endpoint_id: ids.endpoint_id.clone(),
branch_id: ids.branch_id.clone(),
},
})
.collect();
let res = client
.post(metric_collection_endpoint.clone())
.json(&EventChunk { events })
.json(&EventChunk {
events: chunk.into(),
})
.send()
.await;
@@ -213,113 +183,34 @@ async fn collect_metrics_iteration(
if !res.status().is_success() {
error!("metrics endpoint refused the sent metrics: {:?}", res);
for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
// Report if the metric value is suspiciously large
error!("potentially abnormal metric value: {:?}", metric);
}
}
}
// update cached metrics after they were sent
// (to avoid sending the same metrics twice)
// see the relevant discussion on why to do so even if the status is not success:
// https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
for send_metric in chunk {
let stop_time = match send_metric.kind {
EventType::Incremental { stop_time, .. } => stop_time,
_ => unreachable!(),
};
for metric in metrics_to_clear {
match metrics.endpoints.entry(metric) {
Entry::Occupied(mut counter) => {
if counter.get_mut().should_clear() {
counter.remove_entry();
}
}
Entry::Vacant(_) => {}
cached_metrics
.entry(Ids {
endpoint_id: send_metric.extra.endpoint_id.clone(),
branch_id: send_metric.extra.branch_id.clone(),
})
// update cached value (add delta) and time
.and_modify(|e| {
e.0 = e.0.saturating_add(send_metric.value);
e.1 = stop_time
})
// cache new metric
.or_insert((send_metric.value, stop_time));
}
}
}
#[cfg(test)]
mod tests {
use std::{
net::TcpListener,
sync::{Arc, Mutex},
};
use anyhow::Error;
use chrono::Utc;
use consumption_metrics::{Event, EventChunk};
use hyper::{
service::{make_service_fn, service_fn},
Body, Response,
};
use url::Url;
use super::{collect_metrics_iteration, Ids, Metrics};
use crate::http;
#[tokio::test]
async fn metrics() {
let listener = TcpListener::bind("0.0.0.0:0").unwrap();
let reports = Arc::new(Mutex::new(vec![]));
let reports2 = reports.clone();
let server = hyper::server::Server::from_tcp(listener)
.unwrap()
.serve(make_service_fn(move |_| {
let reports = reports.clone();
async move {
Ok::<_, Error>(service_fn(move |req| {
let reports = reports.clone();
async move {
let bytes = hyper::body::to_bytes(req.into_body()).await?;
let events: EventChunk<'static, Event<Ids, String>> =
serde_json::from_slice(&bytes)?;
reports.lock().unwrap().push(events);
Ok::<_, Error>(Response::new(Body::from(vec![])))
}
}))
}
}));
let addr = server.local_addr();
tokio::spawn(server);
let metrics = Metrics::default();
let client = http::new_client();
let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
let now = Utc::now();
// no counters have been registered
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
let r = std::mem::take(&mut *reports2.lock().unwrap());
assert!(r.is_empty());
// register a new counter
let counter = metrics.register(Ids {
endpoint_id: "e1".to_string(),
branch_id: "b1".to_string(),
});
// the counter should be observed despite 0 egress
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
let r = std::mem::take(&mut *reports2.lock().unwrap());
assert_eq!(r.len(), 1);
assert_eq!(r[0].events.len(), 1);
assert_eq!(r[0].events[0].value, 0);
// record egress
counter.record_egress(1);
// egress should be observered
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
let r = std::mem::take(&mut *reports2.lock().unwrap());
assert_eq!(r.len(), 1);
assert_eq!(r[0].events.len(), 1);
assert_eq!(r[0].events[0].value, 1);
// release counter
drop(counter);
// we do not observe the counter
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
let r = std::mem::take(&mut *reports2.lock().unwrap());
assert!(r.is_empty());
// counter is unregistered
assert!(metrics.endpoints.is_empty());
}
Ok(())
}

View File

@@ -7,7 +7,6 @@ use crate::{
compute::{self, PostgresConnection},
config::{ProxyConfig, TlsConfig},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
metrics::{Ids, USAGE_METRICS},
protocol2::WithClientIp,
stream::{PqStream, Stream},
};
@@ -603,11 +602,6 @@ pub async fn proxy_pass(
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: &MetricsAuxInfo,
) -> anyhow::Result<()> {
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.to_string(),
branch_id: aux.branch_id.to_string(),
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
let mut client = MeasuredStream::new(
client,
@@ -615,7 +609,6 @@ pub async fn proxy_pass(
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
usage.record_egress(cnt as u64);
},
);
@@ -697,14 +690,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
.await
{
Ok(auth_result) => auth_result,
Err(e) => {
let user = creds.get_user();
let db = params.get("database");
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await;
}
Err(e) => return stream.throw_error(e).await,
};
let AuthSuccess {

View File

@@ -105,8 +105,6 @@ class NeonCompare(PgCompare):
self._pg_bin = pg_bin
self.pageserver_http_client = self.env.pageserver.http_client()
# note that neon_simple_env now uses LOCAL_FS remote storage
# Create tenant
tenant_conf: Dict[str, str] = {}
if False: # TODO add pytest setting for this

View File

@@ -460,11 +460,9 @@ class NeonEnvBuilder:
), "Unexpectedly instantiated from outside a test function"
self.test_name = test_name
def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv:
def init_configs(self) -> NeonEnv:
# Cannot create more than one environment from one builder
assert self.env is None, "environment already initialized"
if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
self.env = NeonEnv(self)
return self.env
@@ -472,19 +470,8 @@ class NeonEnvBuilder:
assert self.env is not None, "environment is not already initialized, call init() first"
self.env.start()
def init_start(
self,
initial_tenant_conf: Optional[Dict[str, str]] = None,
default_remote_storage_if_missing: bool = True,
) -> NeonEnv:
"""
Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
To avoid creating initial_tenant, call init_configs to setup the environment.
Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
"""
env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
env = self.init_configs()
self.start()
# Prepare the default branch to start the postgres on later.
@@ -559,7 +546,7 @@ class NeonEnvBuilder:
user: RemoteStorageUser,
bucket_name: Optional[str] = None,
bucket_region: Optional[str] = None,
) -> RemoteStorage:
) -> Optional[RemoteStorage]:
ret = kind.configure(
self.repo_dir,
self.mock_s3_server,
@@ -902,8 +889,6 @@ def _shared_simple_env(
"""
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
is set, this is shared by all tests using `neon_simple_env`.
This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
"""
if os.environ.get("TEST_SHARED_FIXTURES") is None:
@@ -1496,16 +1481,6 @@ class NeonAttachmentService:
self.running = False
return self
def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
response = requests.post(
f"{self.env.control_plane_api}/attach_hook",
json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
)
response.raise_for_status()
gen = response.json()["gen"]
assert isinstance(gen, int)
return gen
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -1714,7 +1689,12 @@ class NeonPageserver(PgProtocol):
to call into the pageserver HTTP client.
"""
if self.env.attachment_service is not None:
generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
response = requests.post(
f"{self.env.control_plane_api}/attach_hook",
json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
)
response.raise_for_status()
generation = response.json()["gen"]
else:
generation = None

View File

@@ -620,8 +620,3 @@ class PageserverHttpClient(requests.Session):
},
)
self.verbose_error(res)
def deletion_queue_flush(self, execute: bool = False):
self.put(
f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
).raise_for_status()

View File

@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional
def list_prefix(
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
) -> ListObjectsV2OutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
@@ -287,7 +287,7 @@ def list_prefix(
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
response = remote.client.list_objects_v2(
Delimiter=delimiter,
Delimiter="/",
Bucket=remote.bucket_name,
Prefix=prefix,
)

View File

@@ -202,6 +202,9 @@ class RemoteStorageKind(str, enum.Enum):
LOCAL_FS = "local_fs"
MOCK_S3 = "mock_s3"
REAL_S3 = "real_s3"
# Pass to tests that are generic to remote storage
# to ensure the test pass with or without the remote storage
NOOP = "noop"
def configure(
self,
@@ -212,7 +215,10 @@ class RemoteStorageKind(str, enum.Enum):
user: RemoteStorageUser,
bucket_name: Optional[str] = None,
bucket_region: Optional[str] = None,
) -> RemoteStorage:
) -> Optional[RemoteStorage]:
if self == RemoteStorageKind.NOOP:
return None
if self == RemoteStorageKind.LOCAL_FS:
return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user))

View File

@@ -1,52 +0,0 @@
import queue
import threading
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.types import TenantId
"""
553 sudo mkfs.ext4 /dev/nvme1n1
555 mkdir test_output
556 sudo mount /dev/nvme1n1 test_output
557 htop
559 ./scripts/pysync
560 NEON_BIN=/home/admin/neon/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
561 sudo chown -R admin:admin test_output
cargo build_testing --release
562 NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
cd test_output/test_pageserver_startup_many_tenants/repo
sudo env NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000 ../../../target/release/neon_local start
# watch initial load complete, then background jobs start. That's the interesting part.
sudo env NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000 ../../../target/release/neon_local stop
# usually pageserver won't be responsive, kill with
sudo pkill -9 pageserver
"""
def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
# below doesn't work because summaries contain tenant and timeline ids and we check for them
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
pshttp = env.pageserver.http_client()
ep = env.endpoints.create_start("main")
ep.safe_psql("create table foo(b text)")
for i in range(0, 8):
ep.safe_psql("insert into foo(b) values ('some text')")
# pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
pshttp.timeline_checkpoint(tenant_id, timeline_id)
ep.stop_and_destroy()
env.pageserver.stop()
for sk in env.safekeepers:
sk.stop()
tenant_dir = env.repo_dir / "pageserver_1" / "tenants" / str(env.initial_tenant)
for i in range(0, 20_000):
import shutil
shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))

View File

@@ -4,12 +4,7 @@ from typing import List, Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder
from fixtures.types import TenantId, TimelineId
@@ -31,18 +26,17 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []
for _ in range(3):
for _ in range(4):
tenant_id, timeline_id = env.neon_cli.create_tenant()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()
tenant_timelines.append((tenant_id, timeline_id, endpoint))
# Stop the pageserver -- this has to be not immediate or we need to wait for uploads
# Stop the pageserver
env.pageserver.stop()
# Leave the first timeline alone, but corrupt the others in different ways
@@ -51,21 +45,30 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
(tenant1, timeline1, pg1) = tenant_timelines[1]
metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
with open(metadata_path, "w") as f:
f.write("overwritten with garbage!")
f = open(metadata_path, "w")
f.write("overwritten with garbage!")
f.close()
log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
(tenant2, timeline2, pg2) = tenant_timelines[2]
timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
for filename in os.listdir(timeline_path):
if filename.startswith("00000"):
# Looks like a layer file. Remove it
os.remove(f"{timeline_path}/{filename}")
log.info(
f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
)
(tenant3, timeline3, pg3) = tenant_timelines[3]
timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/"
for filename in os.listdir(timeline_path):
if filename.startswith("00000"):
# Looks like a layer file. Corrupt it
p = f"{timeline_path}/{filename}"
size = os.path.getsize(p)
with open(p, "wb") as f:
f.truncate(0)
f.truncate(size)
log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
f = open(f"{timeline_path}/{filename}", "w")
f.write("overwritten with garbage!")
f.close()
log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")
env.pageserver.start()
@@ -84,13 +87,22 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
)
# Second timeline will fail during basebackup, because the local layer file is corrupt.
# Second timeline has no ancestors, only the metadata file and no layer files locally,
# and we don't have the remote storage enabled. It is loaded into memory, but getting
# the basebackup from it will fail.
with pytest.raises(
Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
) as err:
pg2.start()
log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
# Third timeline will also fail during basebackup, because the layer file is corrupt.
# It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
# (We don't check layer file contents on startup, when loading the timeline)
with pytest.raises(Exception, match="Failed to load delta layer") as err:
pg2.start()
pg3.start()
log.info(
f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
)

View File

@@ -211,12 +211,4 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
ddl.wait()
ddl.failures(False)
cur.execute("CREATE DATABASE failure WITH OWNER=cork")
ddl.wait()
with pytest.raises(psycopg2.InternalError):
ddl.failures(True)
cur.execute("DROP DATABASE failure")
ddl.wait()
ddl.pg.connect(dbname="failure") # Ensure we can connect after a failed drop
conn.close()

View File

@@ -74,13 +74,11 @@ class EvictionEnv:
pgbench_init_lsns: Dict[TenantId, Lsn]
def timelines_du(self) -> Tuple[int, int, int]:
return poor_mans_du(
self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
)
return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
return {
(tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
(tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
for tid, tlid in self.timelines
}
@@ -91,21 +89,7 @@ class EvictionEnv:
"""
lsn = self.pgbench_init_lsns[tenant_id]
with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
# instead of using pgbench --select-only which does point selects,
# run full table scans for all tables
with endpoint.connect() as conn:
cur = conn.cursor()
tables_cols = {
"pgbench_accounts": "abalance",
"pgbench_tellers": "tbalance",
"pgbench_branches": "bbalance",
"pgbench_history": "delta",
}
for table, column in tables_cols.items():
cur.execute(f"select avg({column}) from {table}")
_avg = cur.fetchone()
self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])
def pageserver_start_with_disk_usage_eviction(
self, period, max_usage_pct, min_avail_bytes, mock_behavior
@@ -143,19 +127,6 @@ class EvictionEnv:
self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
def human_bytes(amt: float) -> str:
suffixes = ["", "Ki", "Mi", "Gi"]
last = suffixes[-1]
for name in suffixes:
if amt < 1024 or name == last:
return f"{int(round(amt))} {name}B"
amt = amt / 1024
raise RuntimeError("unreachable")
@pytest.fixture
def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
"""
@@ -244,12 +215,8 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
healthy_tenant_id, healthy_timeline_id = env.timelines[1]
broken_size_pre, _, _ = poor_mans_du(
env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
)
healthy_size_pre, _, _ = poor_mans_du(
env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
)
broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
# try to evict everything, then validate that broken tenant wasn't touched
target = broken_size_pre + healthy_size_pre
@@ -257,12 +224,8 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
broken_size_post, _, _ = poor_mans_du(
env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
)
healthy_size_post, _, _ = poor_mans_du(
env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
)
broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
assert healthy_size_post < healthy_size_pre
@@ -403,16 +366,18 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
du_by_timeline = env.du_by_timeline()
# pick any tenant
[warm, cold] = list(du_by_timeline.keys())
(tenant_id, timeline_id) = warm
[our_tenant, other_tenant] = list(du_by_timeline.keys())
(tenant_id, timeline_id) = our_tenant
# make picked tenant more recently used than the other one
# make our tenant more recently used than the other one
env.warm_up_tenant(tenant_id)
# Build up enough pressure to require evictions from both tenants,
# but not enough to fall into global LRU.
# So, set target to all occupied space, except 2*env.layer_size per tenant
target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
# So, set target to all occipied space, except 2*env.layer_size per tenant
target = (
du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
)
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
@@ -427,33 +392,22 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
later_tenant_usage < du_by_timeline[tenant]
), "all tenants should have lost some layers"
warm_size = later_du_by_timeline[warm]
# bounds for warmed_size
warm_lower = 0.5 * du_by_timeline[warm]
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
# So, check for up to 3 here.
warm_upper = warm_lower + 3 * env.layer_size
cold_size = later_du_by_timeline[cold]
cold_upper = 2 * env.layer_size
log.info(
f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
)
log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
assert (
cold_size < cold_upper
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
), "our warmed up tenant should be at about half capacity, part 1"
assert (
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
# So, check for up to 3 here.
later_du_by_timeline[our_tenant]
< 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
), "our warmed up tenant should be at about half capacity, part 2"
assert (
later_du_by_timeline[other_tenant] < 2 * env.layer_size
), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
def poor_mans_du(
env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
) -> Tuple[int, int, int]:
"""
Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -476,11 +430,9 @@ def poor_mans_du(
smallest_layer = min(smallest_layer, size)
else:
smallest_layer = size
if verbose:
log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})")
log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
if verbose:
log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})")
log.info(f"{tenant_id}/{timeline_id}: sum {total}")
total_on_disk += total
assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0

View File

@@ -1,352 +0,0 @@
"""
Tests in this module exercise the pageserver's behavior around generation numbers,
as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require
of the pageserver are:
- Do not start a tenant without a generation number if control_plane_api is set
- Remote objects must be suffixed with generation
- Deletions may only be executed after validating generation
- Updates to remote_consistent_lsn may only be made visible after validating generation
"""
import re
import time
from typing import Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import list_prefix
from fixtures.remote_storage import (
RemoteStorageKind,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import print_gc_result, wait_until
# A tenant configuration that is convenient for generating uploads and deletions
# without a large amount of postgres traffic.
TENANT_CONF = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{128 * 1024}",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
}
def generate_uploads_and_deletions(
env: NeonEnv,
*,
init: bool = True,
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
data: Optional[str] = None,
):
"""
Using the environment's default tenant + timeline, generate a load pattern
that results in some uploads and some deletions to remote storage.
"""
if tenant_id is None:
tenant_id = env.initial_tenant
assert tenant_id is not None
if timeline_id is None:
timeline_id = env.initial_timeline
assert timeline_id is not None
ps_http = env.pageserver.http_client()
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
if init:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
def churn(data):
endpoint.safe_psql_many(
[
f"""
INSERT INTO foo (id, val)
SELECT g, '{data}'
FROM generate_series(1, 20000) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
# to ensure that GC can actually remove some layers
"VACUUM foo",
]
)
assert tenant_id is not None
assert timeline_id is not None
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Compaction should generate some GC-elegible layers
for i in range(0, 2):
churn(f"{i if data is None else data}")
gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
def get_metric_or_0(ps_http, metric: str) -> int:
v = ps_http.get_metric_value(metric)
return 0 if v is None else int(v)
def get_deletion_queue_executed(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
def get_deletion_queue_submitted(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
def get_deletion_queue_dropped(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
def get_deletion_queue_unexpected_errors(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
def get_deletion_queue_depth(ps_http) -> int:
"""
Queue depth if at least one deletion has been submitted, else None
"""
submitted = get_deletion_queue_submitted(ps_http)
executed = get_deletion_queue_executed(ps_http)
dropped = get_deletion_queue_dropped(ps_http)
depth = submitted - executed - dropped
log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
assert depth >= 0
return int(depth)
def assert_deletion_queue(ps_http, size_fn) -> None:
v = get_deletion_queue_depth(ps_http)
assert v is not None
assert size_fn(v) is True
def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
"""
Validate behavior when a pageserver is run without generation support enabled,
then started again after activating it:
- Before upgrade, no objects should have generation suffixes
- After upgrade, the bucket should contain a mixture.
- In both cases, postgres I/O should work.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_configs()
env.broker.try_start()
for sk in env.safekeepers:
sk.start()
assert env.attachment_service is not None
env.attachment_service.start()
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
)
generate_uploads_and_deletions(env)
def parse_generation_suffix(key):
m = re.match(".+-([0-9a-zA-Z]{8})$", key)
if m is None:
return None
else:
log.info(f"match: {m}")
log.info(f"group: {m.group(1)}")
return int(m.group(1), 16)
pre_upgrade_keys = list(
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in pre_upgrade_keys:
assert parse_generation_suffix(key) is None
env.pageserver.stop()
# Starting without the override that disabled control_plane_api
env.pageserver.start()
generate_uploads_and_deletions(env, init=False)
legacy_objects: list[str] = []
suffixed_objects = []
post_upgrade_keys = list(
[o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
)
for key in post_upgrade_keys:
log.info(f"post-upgrade key: {key}")
if parse_generation_suffix(key) is not None:
suffixed_objects.append(key)
else:
legacy_objects.append(key)
# Bucket now contains a mixture of suffixed and non-suffixed objects
assert len(suffixed_objects) > 0
assert len(legacy_objects) > 0
assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
some_other_pageserver = 1234
ps_http = env.pageserver.http_client()
generate_uploads_and_deletions(env)
# Flush: pending deletions should all complete
assert_deletion_queue(ps_http, lambda n: n > 0)
ps_http.deletion_queue_flush(execute=True)
assert_deletion_queue(ps_http, lambda n: n == 0)
assert get_deletion_queue_dropped(ps_http) == 0
# Our visible remote_consistent_lsn should match projected
timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
generate_uploads_and_deletions(env, init=False)
assert_deletion_queue(ps_http, lambda n: n > 0)
queue_depth_before = get_deletion_queue_depth(ps_http)
executed_before = get_deletion_queue_executed(ps_http)
ps_http.deletion_queue_flush(execute=True)
# Queue drains to zero because we dropped deletions
assert_deletion_queue(ps_http, lambda n: n == 0)
# The executed counter has not incremented
assert get_deletion_queue_executed(ps_http) == executed_before
# The dropped counter has incremented to consume all of the deletions that were previously enqueued
assert get_deletion_queue_dropped(ps_http) == queue_depth_before
# Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
# because generation validation fails.
timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
# TODO: list bucket and confirm all objects have a generation suffix.
assert get_deletion_queue_unexpected_errors(ps_http) == 0
@pytest.mark.parametrize("keep_attachment", [True, False])
def test_deletion_queue_recovery(
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
):
"""
:param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
node took the attachment while we were restarting.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
ps_http = env.pageserver.http_client()
# Prevent deletion lists from being executed, to build up some backlog of deletions
ps_http.configure_failpoints(
[
("deletion-queue-before-execute", "return"),
]
)
generate_uploads_and_deletions(env)
# There should be entries in the deletion queue
assert_deletion_queue(ps_http, lambda n: n > 0)
ps_http.deletion_queue_flush()
before_restart_depth = get_deletion_queue_depth(ps_http)
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
env.pageserver.stop(immediate=True)
if not keep_attachment:
some_other_pageserver = 101010
assert env.attachment_service is not None
env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
env.pageserver.start()
def assert_deletions_submitted(n: int):
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
# After restart, issue a flush to kick the deletion frontend to do recovery.
# It should recover all the operations we submitted before the restart.
ps_http.deletion_queue_flush(execute=False)
wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
# The queue should drain through completely if we flush it
ps_http.deletion_queue_flush(execute=True)
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
if keep_attachment:
# If we kept the attachment, then our pre-restart deletions should have executed
# successfully
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
# If we lost the attachment, we should have dropped our pre-restart deletions.
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
# Restart again
env.pageserver.stop(immediate=True)
env.pageserver.start()
# No deletion lists should be recovered: this demonstrates that deletion lists
# were cleaned up after being executed or dropped in the previous process lifetime.
time.sleep(1)
assert_deletion_queue(ps_http, lambda n: n == 0)
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

View File

@@ -5,6 +5,7 @@ from pathlib import Path
from queue import SimpleQueue
from typing import Any, Dict, Set
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -16,13 +17,15 @@ from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
@pytest.mark.parametrize(
"remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
)
def test_metric_collection(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
remote_storage_kind: RemoteStorageKind,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -52,7 +55,7 @@ def test_metric_collection(
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
@@ -65,14 +68,6 @@ def test_metric_collection(
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -103,14 +98,17 @@ def test_metric_collection(
total += sample[2]
return int(total)
# upload some data to remote storage
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http = env.pageserver.http_client()
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
remote_uploaded = 0
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# upload some data to remote storage
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http = env.pageserver.http_client()
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# we expect uploads at 1Hz, on busy runners this could be too optimistic,
# so give 5s we only want to get the following upload after "ready" value.
@@ -213,14 +211,6 @@ def test_metric_collection_cleans_up_tempfile(
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)

View File

@@ -30,7 +30,9 @@ from fixtures.types import TenantId
from fixtures.utils import run_pg_bench_small
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@pytest.mark.parametrize(
"remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
)
def test_tenant_delete_smoke(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
@@ -41,12 +43,6 @@ def test_tenant_delete_smoke(
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[
# The deletion queue will complain when it encounters simulated S3 errors
".*deletion executor: DeleteObjects request failed.*",
]
)
# lucky race with stopping from flushing a layer we fail to schedule any uploads
env.pageserver.allowed_errors.append(
@@ -142,12 +138,18 @@ FAILPOINTS_BEFORE_BACKGROUND = [
def combinations():
result = []
remotes = [RemoteStorageKind.MOCK_S3]
remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in FAILPOINTS:
if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
"timeline-delete-before-index-delete",
):
# the above failpoint are not relevant for config without remote storage
continue
# Simulate failures for only one type of remote storage
# to avoid log pollution and make tests run faster
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -193,32 +195,27 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
]
)
if simulate_failures:
env.pageserver.allowed_errors.extend(
[
# The deletion queue will complain when it encounters simulated S3 errors
".*deletion executor: DeleteObjects request failed.*",
]
)
ps_http = env.pageserver.http_client()
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
# generate enough layers
run_pg_bench_small(pg_bin, endpoint.connstr())
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
if remote_storage_kind is RemoteStorageKind.NOOP:
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
else:
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
ps_http.configure_failpoints((failpoint, "return"))
@@ -249,7 +246,12 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
env.pageserver.stop()
env.pageserver.start()
if failpoint in (
if (
remote_storage_kind is RemoteStorageKind.NOOP
and failpoint == "tenant-delete-before-create-local-mark"
):
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
elif failpoint in (
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
):
@@ -381,7 +383,6 @@ def test_tenant_delete_is_resumed_on_attach(
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder,
prefix="/".join(

View File

@@ -519,8 +519,11 @@ def test_detach_while_attaching(
# * restart the pageserver and verify that ignored tenant is still not loaded
# * `load` the same tenant
# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
def test_ignored_tenant_reattach(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()

View File

@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.pg_version import PgVersion, xfail_on_postgres
from fixtures.types import Lsn, TenantId, TimelineId
@@ -532,24 +532,7 @@ def test_single_branch_get_tenant_size_grows(
assert size_after == prev, "size after restarting pageserver should not have changed"
def assert_size_approx_equal(size_a, size_b):
"""
Tests that evaluate sizes are checking the pageserver space consumption
that sits many layers below the user input. The exact space needed
varies slightly depending on postgres behavior.
Rather than expecting postgres to be determinstic and occasionally
failing the test, we permit sizes for the same data to vary by a few pages.
"""
# Determined empirically from examples of equality failures: they differ
# by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid
# failing on outliers from that observed range.
threshold = 4 * 8272
assert size_a == pytest.approx(size_b, abs=threshold)
@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
def test_get_tenant_size_with_multiple_branches(
neon_env_builder: NeonEnvBuilder, test_output_dir: Path
):
@@ -590,7 +573,7 @@ def test_get_tenant_size_with_multiple_branches(
)
size_after_first_branch = http_client.tenant_size(tenant_id)
assert_size_approx_equal(size_after_first_branch, size_at_branch)
assert size_after_first_branch == size_at_branch
first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
@@ -616,7 +599,7 @@ def test_get_tenant_size_with_multiple_branches(
"second-branch", main_branch_name, tenant_id
)
size_after_second_branch = http_client.tenant_size(tenant_id)
assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
assert size_after_second_branch == size_after_continuing_on_main
second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
@@ -652,7 +635,7 @@ def test_get_tenant_size_with_multiple_branches(
# tenant_size but so far this has been reliable, even though at least gc
# and tenant_size race for the same locks
size_after = http_client.tenant_size(tenant_id)
assert_size_approx_equal(size_after, size_after_thinning_branch)
assert size_after == size_after_thinning_branch
size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
size_debug = http_client.tenant_size_debug(tenant_id)

View File

@@ -12,6 +12,7 @@ from fixtures.log_helper import log
from fixtures.metrics import (
PAGESERVER_GLOBAL_METRICS,
PAGESERVER_PER_TENANT_METRICS,
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
parse_metrics,
)
from fixtures.neon_fixtures import (
@@ -231,10 +232,17 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
assert value
def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"remote_storage_kind",
# exercise both the code paths where remote_storage=None and remote_storage=Some(...)
[RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
)
def test_pageserver_metrics_removed_after_detach(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
neon_env_builder.num_safekeepers = 3
@@ -274,6 +282,9 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
for tenant in [tenant_1, tenant_2]:
pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
expected = set(PAGESERVER_PER_TENANT_METRICS)
if remote_storage_kind == RemoteStorageKind.NOOP:
# if there's no remote storage configured, we don't expose the remote timeline client metrics
expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
assert pre_detach_samples == expected
env.pageserver.http_client().tenant_detach(tenant)
@@ -283,7 +294,9 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
# Check that empty tenants work with or without the remote storage
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@pytest.mark.parametrize(
"remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP]
)
def test_pageserver_with_empty_tenants(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):

View File

@@ -12,6 +12,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
@@ -144,12 +145,19 @@ DELETE_FAILPOINTS = [
def combinations():
result = []
remotes = [RemoteStorageKind.MOCK_S3]
remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in DELETE_FAILPOINTS:
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
):
# the above failpoints are not relevant for config without remote storage
continue
result.append((remote_storage_kind, delete_failpoint))
return result
@@ -197,21 +205,23 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
with env.endpoints.create_start("delete") as endpoint:
# generate enough layers
run_pg_bench_small(pg_bin, endpoint.connstr())
if remote_storage_kind is RemoteStorageKind.NOOP:
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
else:
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(env.initial_tenant),
"timelines",
str(timeline_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(env.initial_tenant),
"timelines",
str(timeline_id),
)
),
)
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
# It appears when we stopped flush loop during deletion and then pageserver is stopped
@@ -797,8 +807,6 @@ def test_delete_orphaned_objects(
reason = timeline_info["state"]["Broken"]["reason"]
assert reason.endswith(f"failpoint: {failpoint}"), reason
ps_http.deletion_queue_flush(execute=True)
for orphan in orphans:
assert not orphan.exists()
assert env.pageserver.log_contains(

View File

@@ -301,8 +301,12 @@ def test_timeline_initial_logical_size_calculation_cancellation(
# message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_init(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
@@ -333,12 +337,17 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
)
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id),
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
)
def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_post_checkpoint(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
@@ -360,14 +369,19 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder
def check():
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id),
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
)
wait_until(10, 1, check)
def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_post_compaction(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
@@ -406,15 +420,21 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id),
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
)
def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_post_gc(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
@@ -451,10 +471,12 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id),
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
)
@@ -538,10 +560,14 @@ def test_timeline_size_metrics(
assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_tenant_physical_size(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
random.seed(100)
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
if remote_storage_kind is not None:
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
@@ -549,10 +575,12 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
client = env.pageserver.http_client()
tenant, timeline = env.neon_cli.create_tenant()
if remote_storage_kind is not None:
wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
def get_timeline_resident_physical_size(timeline: TimelineId):
sizes = get_physical_size_values(env, tenant, timeline)
assert_physical_size_invariants(sizes)
sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
assert_physical_size_invariants(sizes, remote_storage_kind)
return sizes.prometheus_resident_physical
timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -572,7 +600,8 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
wait_for_last_flush_lsn(env, endpoint, tenant, timeline)
pageserver_http.timeline_checkpoint(tenant, timeline)
wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
@@ -601,6 +630,7 @@ def get_physical_size_values(
env: NeonEnv,
tenant_id: TenantId,
timeline_id: TimelineId,
remote_storage_kind: Optional[RemoteStorageKind],
) -> TimelinePhysicalSizeValues:
res = TimelinePhysicalSizeValues()
@@ -616,9 +646,12 @@ def get_physical_size_values(
res.prometheus_resident_physical = metrics.query_one(
"pageserver_resident_physical_size", metrics_filter
).value
res.prometheus_remote_physical = metrics.query_one(
"pageserver_remote_physical_size", metrics_filter
).value
if remote_storage_kind is not None:
res.prometheus_remote_physical = metrics.query_one(
"pageserver_remote_physical_size", metrics_filter
).value
else:
res.prometheus_remote_physical = None
detail = client.timeline_detail(
tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
@@ -631,15 +664,20 @@ def get_physical_size_values(
return res
def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
def assert_physical_size_invariants(
sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
):
# resident phyiscal size is defined as
assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum
# we don't do layer eviction, so, all layers are resident
assert sizes.api_current_physical == sizes.prometheus_resident_physical
assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
if remote_storage_kind is not None:
assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
else:
assert sizes.prometheus_remote_physical is None
# Timeline logical size initialization is an asynchronous background task that runs once,